# TODO

## Major
- [x] create term-by-document matrix (calculate words frequncies for each term-document pair)
 - [ ] check that it's actually correct - especially if we don't map terms to wrong documents
- [x] convert term-by-document frequencies to tf-idf (calcualte tf-idf for each term-document pair)
 - [ ] check
- [ ] we may need actual (numpy?) matrix?
- [ ] LSI magic

### Minor
- [x] remove numbers from terms - done but not sure if it's good thing to do, maybe it's also important for relevancy of docs,
like for example when there is year written?

In [45]:
import pandas as pd
import numpy as np
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
np.random.seed(42)

In [47]:
bp_data = pd.read_csv("articles.csv", header=0)

In [48]:
bp_data.head(1)

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."


In [49]:
def get_lemmatization_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dictionary = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dictionary.get(tag, wordnet.NOUN)

In [50]:
def preprocess_docs(docs, use_lemmatizer = True):
    '''Tokenize and preprocess documents
    
    Parameters
    ----------
    use_lemmatizer : bool
                     Uses lemmazizer if True, othrerwise uses stemmer.
    '''
    preproccessed_docs = []
    
    # English stop words list
    en_stop = set(stopwords.words('english'))
    
    # Word tokenizer that removes punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    
    # lemmatizer / Stemmer
    if use_lemmatizer:
        lemmatizer = WordNetLemmatizer()
    else:
        stemmer = SnowballStemmer("english")
    
    for row in docs.itertuples(index=True, name='Doc'):
        text = row.text
        
        # remove numbers
        text = re.sub(r'\d+', '', text)
        
        text_words = tokenizer.tokenize(text)
        
        if use_lemmatizer:
            text_words = [lemmatizer.lemmatize(word.lower(), get_lemmatization_pos(word.lower())) for word in text_words
                          if word not in string.punctuation and word.lower() not in en_stop]
        else:
            text_words = [stemmer.stem(word.lower()) for word in text_words
                         if word not in string.punctuation and word.lower() not in en_stop]
        
        preproccessed_docs.append({'words': text_words})
    
    pdocs = pd.DataFrame(preproccessed_docs)
    return pdocs

In [51]:
preproccessed_docs = preprocess_docs(bp_data)
preproccessed_docs

Unnamed: 0,words
0,"[oh, headline, blare, chatbots, next, big, thi..."
1,"[ever, found, look, question, concept, syntax,..."
2,"[machine, learn, increasingly, move, hand, des..."
3,"[understand, machine, learn, big, question, ma..."
4,"[want, learn, apply, artificial, intelligence,..."
...,...
332,"[click, share, article, linkedin, skip, part, ..."
333,"[opinion, deep, neural, network, machine, lear..."
334,"[everyone, remotely, tune, recent, progress, m..."
335,"[one, big, misconception, around, idea, deep, ..."


In [52]:
def get_term_by_document_frequency(preprocessed_docs):
    document_by_term = {}
    
    for index, row in preprocessed_docs.iterrows():
        doc_id = index
        doc_words = row['words']
        
        # computed later, @TODO: move computation here and fix below or remove
#         document_by_term[doc_id] = {
#             'total_words': len(doc_words)
#         }
        document_by_term[doc_id] = {}
        
        for word in set(row['words']):
            document_by_term[doc_id][word] = doc_words.count(word)

    df = pd.DataFrame(document_by_term)
    
    return df

In [53]:
df_frequency = get_term_by_document_frequency(preproccessed_docs)

In [54]:
df_frequency

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
plus,1.0,,,,1.0,,,,,1.0,...,,,,,,,,,,
transformational,1.0,,,,,,,,,,...,,,,,,,,,,
traditional,1.0,,,,1.0,,,1.0,,1.0,...,,,,,1.0,,,1.0,,1.0
extra,1.0,,,,1.0,1.0,,,,,...,,,,,,1.0,,1.0,,
vcr,2.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
regularise,,,,,,,,,,,...,,,,,,,,,,1.0
mistaken,,,,,,,,,,,...,,,,,,,,,,1.0
loopy,,,,,,,,,,,...,,,,,,,,,,1.0
zeolearn,,,,,,,,,,,...,,,,,,,,,,1.0


In [55]:
def reduce_terms(df_frequency, max_df=1, min_df=0, max_terms=None):
    '''Remove unimportant terms from term-by-document matrix.
    
    Parameters
    ----------
    df : pd.DataFrame
    max_df : float , between [0, 1]
             Terms that appear in more % of documents will be ignored
    min_df : float , between [0, 1]
             Terms that appear in less % of documents will be ignored
    max_terms : int , None
                If not None, only top `max_terms` terms will be returned.
    '''
    df = df_frequency.copy()
    corpus_size = df.shape[1]

    if 'doc_frequency' not in df:
        df['doc_frequency'] = df_frequency.fillna(0).astype(bool).sum(axis=1) / corpus_size
            
    df = df[df.doc_frequency <= max_df]
    df = df[df.doc_frequency >= min_df]
    
    if max_terms is not None and max_terms < df.shape[0]:
        df['term_count'] = df_frequency.fillna(0).sum(axis=1)
        df = df.sort_values('term_count', ascending=False)
        df = df.head(max_terms)
        df.drop('term_count',axis=1, inplace=True)
    
    return df

In [56]:
reduce_terms(df_frequency).sort_values('doc_frequency', ascending=False).shape

(15584, 338)

In [73]:
reduce_terms(df_frequency, 0.8, 0.1,1000).sort_values('doc_frequency', ascending=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,328,329,330,331,332,333,334,335,336,doc_frequency
way,11.0,3.0,1.0,5.0,4.0,5.0,2.0,2.0,2.0,9.0,...,,14.0,,5.0,1.0,2.0,,1.0,7.0,0.792285
take,7.0,3.0,4.0,5.0,9.0,2.0,,4.0,,5.0,...,6.0,2.0,,6.0,2.0,1.0,4.0,,3.0,0.777448
machine,6.0,2.0,10.0,11.0,2.0,,20.0,3.0,2.0,5.0,...,2.0,8.0,2.0,1.0,6.0,11.0,4.0,4.0,,0.768546
give,1.0,3.0,2.0,2.0,5.0,2.0,1.0,,5.0,8.0,...,1.0,2.0,,7.0,2.0,2.0,3.0,,4.0,0.762611
go,2.0,2.0,2.0,3.0,4.0,2.0,1.0,2.0,,10.0,...,1.0,2.0,3.0,,8.0,5.0,,2.0,2.0,0.756677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
body,,,,,,,,,,1.0,...,,,,,,1.0,,,,0.100890
max,,,,,,,,,,,...,1.0,,,,2.0,,,,,0.100890
filter,,3.0,,,,19.0,,,,,...,,,,,,,1.0,,,0.100890
competition,1.0,,2.0,,,1.0,,,,,...,,,1.0,,,,,,,0.100890


In [74]:
df_reduced = reduce_terms(df_frequency, 0.8, 0.1)

In [75]:
def get_tf_idf(df_frequency):
    df = df_frequency.copy()
    # tf := word frequency / total frequency
    df.loc['total_words'] = df.sum()
        
    df = df.drop('total_words')[:] / df.loc['total_words']
    
    # idf := log ( len(all_documents) / len(documents_containing_word) )
    
    corpus_size = df.shape[1]

    # number of non-zero cols
    if 'doc_frequency' not in df_frequency:
        df['doc_frequency'] = df.fillna(0).astype(bool).sum(axis=1)
        
    df['idf'] = np.log( corpus_size / df['doc_frequency'] )
    
    # tf-idf := tf * idf
    _cols = df.columns.difference(['idf', 'doc_frequency'])
    df[_cols] = df[_cols].multiply(df["idf"], axis="index")
    
    df.drop(columns=['doc_frequency', 'idf'], inplace=True)
    
    return df

In [76]:
df_tf_idf = get_tf_idf(df_reduced)
display(df_tf_idf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
plus,0.018128,,,,0.015164,,,,,0.012408,...,,,,,,,,,,
traditional,0.017530,,,,0.014664,,,0.038944,,0.011999,...,,,,,0.022740,,,0.049646,,0.018635
extra,0.018097,,,,0.015138,0.011341,,,,,...,,,,,,0.023191,,0.051251,,
v,0.017571,0.027739,,,0.014698,,,,,,...,0.013324,,0.018868,,,,,,0.033515,
facebook,0.034387,,,,,,0.031735,,,0.011769,...,0.013038,,,,,,,,,0.018277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
improvement,,,,,,,,,,,...,,,0.037919,,0.022904,,,,0.067357,
movie,,,,,,,,,,,...,,,,,,,,,,
behavior,,,,,,,,,,,...,,,,,,,,0.050999,0.206094,
chosen,,,,,,,,,,,...,,,,,,,,,,0.019410


In [77]:
values = df_tf_idf.fillna(0).to_numpy()
values

array([[0.0181276 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0175299 , 0.        , 0.        , ..., 0.04964576, 0.        ,
        0.01863494],
       [0.01809662, 0.        , 0.        , ..., 0.05125073, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.05099898, 0.20609357,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.0194102 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [78]:
u, s, vh = np.linalg.svd(values, full_matrices=True)

In [79]:
vh

array([[ 4.13422687e-02,  4.38980971e-02,  5.42467578e-02, ...,
         6.28584745e-02,  5.52754112e-02,  6.37051731e-02],
       [-4.59603930e-03, -1.60052362e-02, -4.41035270e-03, ...,
        -2.89272097e-02,  7.00899028e-05, -3.63891881e-02],
       [ 4.84119183e-03, -1.79407607e-02,  1.39207083e-02, ...,
        -2.22773097e-02, -9.73433515e-03, -5.58716957e-02],
       ...,
       [ 0.00000000e+00,  3.27527643e-18,  1.09808824e-17, ...,
        -1.69592453e-18, -3.57372013e-18, -5.18229992e-18],
       [ 0.00000000e+00, -1.38677648e-16, -3.09392388e-16, ...,
        -1.21430643e-17, -5.11743425e-17, -9.62771529e-17],
       [ 8.16456815e-01, -6.66133815e-16,  5.55111512e-16, ...,
        -6.43745040e-17,  5.79235011e-17, -8.25348904e-18]])