# TODO

## Major
- [x] create term-by-document matrix (calculate words frequncies for each term-document pair)
 - [ ] check that it's actually correct - especially if we don't map terms to wrong documents
- [x] convert term-by-document frequencies to tf-idf (calcualte tf-idf for each term-document pair)
 - [ ] check
- [ ] we may need actual (numpy?) matrix?
- [ ] LSI magic

### Minor
- [x] remove numbers from terms - done but not sure if it's good thing to do, maybe it's also important for relevancy of docs,
like for example when there is year written?

In [2]:
import pandas as pd
import numpy as np
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
np.random.seed(42)

In [4]:
bp_data = pd.read_csv("articles.csv", header=0)

In [5]:
bp_data.head(1)

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."


In [6]:
def preprocess_docs(docs, use_lemmatizer = True):
    '''Tokenize and preprocess documents
    
    Parameters
    ----------
    use_lemmatizer : bool
                     Uses lemmazizer if True, othrerwise uses stemmer.
    '''
    preproccessed_docs = []
    
    # English stop words list
    en_stop = set(stopwords.words('english'))
    
    # Word tokenizer that removes punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    
    # lemmatizer / Stemmer
    if use_lemmatizer:
        lemmatizer = WordNetLemmatizer()
    else:
        stemmer = SnowballStemmer("english")
    
    for row in docs.itertuples(index=True, name='Doc'):
        text = row.text
        
        # remove numbers
        text = re.sub(r'\d+', '', text)
        
        text_words = tokenizer.tokenize(text)
        
        if use_lemmatizer:
            text_words = [lemmatizer.lemmatize(word, pos="v").lower() for word in text_words
                          if word not in string.punctuation and word.lower() not in en_stop]
        else:
            text_words = [stemmer.stem(word).lower() for word in text_words
                         if word not in string.punctuation and word.lower() not in en_stop]
        
        preproccessed_docs.append({'words': text_words})
    
    pdocs = pd.DataFrame(preproccessed_docs)
    return pdocs

In [7]:
preproccessed_docs = preprocess_docs(bp_data)
display(preproccessed_docs)

Unnamed: 0,words
0,"[oh, headline, blare, chatbots, next, big, thi..."
1,"[ever, find, look, question, concept, syntax, ..."
2,"[machine, learn, increasingly, move, hand, des..."
3,"[understand, machine, learning, big, question,..."
4,"[want, learn, apply, artificial, intelligence,..."
...,...
332,"[click, share, article, linkedin, skip, part, ..."
333,"[opinions, deep, neural, network, machine, lea..."
334,"[everyone, remotely, tune, recent, progress, m..."
335,"[one, biggest, misconceptions, around, idea, d..."


In [8]:
def get_term_by_document_frequency(preprocessed_docs):
    document_by_term = {}
    
    for index, row in preprocessed_docs.iterrows():
        doc_id = index
        doc_words = row['words']
        
        document_by_term[doc_id] = {
            'total_words': len(doc_words)
        }
        
        
        for word in set(row['words']):
            document_by_term[doc_id][word] = doc_words.count(word)

    df = pd.DataFrame(document_by_term)
    
    return df

In [9]:
df_frequency = get_term_by_document_frequency(preproccessed_docs)

In [10]:
df_frequency

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
total_words,1121.0,699.0,1241.0,713.0,1267.0,1612.0,585.0,503.0,524.0,1626.0,...,3394.0,673.0,1042.0,433.0,795.0,887.0,1011.0,424.0,625.0,1032.0
way,9.0,1.0,1.0,2.0,4.0,4.0,2.0,2.0,2.0,9.0,...,20.0,,11.0,,5.0,1.0,1.0,,1.0,7.0
help,2.0,1.0,2.0,2.0,2.0,,3.0,1.0,,9.0,...,1.0,1.0,4.0,1.0,,,3.0,,,5.0
live,1.0,1.0,,,,,,2.0,,2.0,...,,1.0,1.0,,,,4.0,,,
matt,1.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
regularisation,,,,,,,,,,,...,,,,,,,,,,3.0
oncologist,,,,,,,,,,,...,,,,,,,,,,1.0
intelligible,,,,,,,,,,,...,,,,,,,,,,1.0
loopy,,,,,,,,,,,...,,,,,,,,,,1.0


In [11]:
def get_tf_idf(df_frequency):
    df = df_frequency.copy()
    # tf := word frequency / total frequency
    df = df.drop('total_words', inplace=False)[:] / df.loc['total_words']
    
    # idf := log ( len(all_documents) / len(documents_containing_word) )
    
    corpus_size = df.shape[1]

    # number of non-zero cols
    df['doc_frequency'] = df.fillna(0).astype(bool).sum(axis=1)
        
    df['idf'] = np.log( corpus_size / df['doc_frequency'] )
    
    # tf-idf := tf * idf
    _cols = df.columns.difference(['idf', 'doc_frequency'])
    df[_cols] = df[_cols].multiply(df["idf"], axis="index")
    
    df.drop(columns=['doc_frequency', 'idf'], inplace=True)
    
    return df

In [12]:
df_tf_idf = get_tf_idf(df_frequency)
display(df_tf_idf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
way,0.002145,0.000382,0.000215,0.000749,0.000843,0.000663,0.000913,0.001062,0.001020,0.001479,...,1.574092e-03,,0.002820,,0.001680,0.000301,0.000264,,0.000427,0.001812
help,0.000769,0.000617,0.000695,0.001209,0.000680,,0.002210,0.000857,,0.002386,...,1.269921e-04,0.000640,0.001655,0.000995,,,0.001279,,,0.002088
live,0.001057,0.001696,,,,,,0.004713,,0.001458,...,,0.001761,0.001138,,,,0.004690,,,
matt,0.003955,,,,,,,,,,...,,,,,,,,,,
enjoy,0.000003,0.000009,0.000002,0.000004,0.000002,0.000004,0.000005,0.000024,0.000006,0.000002,...,8.755953e-07,0.000004,0.000003,0.000007,0.000004,0.000003,0.000003,0.000014,0.000005,0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
regularisation,,,,,,,,,,,...,,,,,,,,,,0.016919
oncologist,,,,,,,,,,,...,,,,,,,,,,0.005640
intelligible,,,,,,,,,,,...,,,,,,,,,,0.005640
loopy,,,,,,,,,,,...,,,,,,,,,,0.005640


In [13]:
values = df_tf_idf.fillna(0).to_numpy()
values

array([[0.00214461, 0.00038215, 0.00021525, ..., 0.        , 0.0004274 ,
        0.00181188],
       [0.00076898, 0.00061661, 0.00069462, ..., 0.        , 0.        ,
        0.00208823],
       [0.00105741, 0.00169579, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00563962],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00563962],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00563962]])