# Metadata

```
Author: Linnaea Kavulich
Contact: qpk4kp@virginia.edu
Course: DS 5001 (Spring 2023)
```

<h1><center>The State of Natural Language Processing</center></h1>
<h3><center>An Exploration of the Infamous Locke-Hobbes-Rousseau Debate Using NLP Techniques</center></h3>

*** 

<center>This notebook loads the F3 CORPUS and VOCAB tables, computes TF-IDF and DF-IDF, and adds them to those tables (F4).</center>

| Book_id | Title | Author |
| :- | :- | :- |
| 1 | The Social Contract | Jean-Jacques Rousseau 
| 2 | Leviathan | Thomas Hobbes
| 3 | Second Treatise of Government | John Locke
| 4 | Discourse on the Origin and Basis of Inequality Among Men | Jean-Jacques Rousseau
| 5 | An Essay Concerning Humane Understanding, Vol. 1 | John Locke
| 6 | An Essay Concerning Humane Understanding, Vol. 2 | John Locke

## Import Packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import nltk
import re

import os
# NOTE(review): machine-specific absolute path. The relative reads below
# ('CORPUS.csv', 'VOCAB.csv') resolve against this directory.
os.chdir('C:/Users/linna/Box/MSDS/DS5001/Final Project/Corpus/')

***

## Load CORPUS, VOCAB

In [2]:
# Token table, read relative to the working directory set by os.chdir above.
CORPUS = pd.read_csv('CORPUS.csv')

In [3]:
# Preview the first rows to confirm the expected columns were loaded.
CORPUS.head()

Unnamed: 0,book_id,chap_num,para_num,sent_num,token_num,token_str,term_str,pos
0,1,1,0,0,0,SUBJECT,subject,NN
1,1,1,0,0,1,OF,of,IN
2,1,1,0,0,2,THE,the,DT
3,1,1,0,0,3,FIRST,first,JJ
4,1,1,0,0,4,BOOK,book,NN


In [4]:
# Vocabulary table, read relative to the working directory set by os.chdir above.
VOCAB = pd.read_csv('VOCAB.csv')

In [5]:
# Preview the first rows to confirm the expected columns were loaded.
VOCAB.head()

Unnamed: 0,term_str,n,p,i,n_chars,h,max_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster
0,the,41075,0.059972,4.059561,3,0.243461,DT,{'DT'},1,the,the,the
1,of,31825,0.046467,4.427661,2,0.205739,IN,{'IN'},1,of,of,of
2,and,22465,0.0328,4.930142,3,0.161711,CC,{'CC'},1,and,and,and
3,to,22072,0.032227,4.955604,2,0.159702,TO,{'TO'},1,to,to,to
4,that,13160,0.019214,5.701662,4,0.109554,IN,"{'RB', 'VBN', 'VB', 'DT', 'IN', 'WDT'}",1,that,that,that


***

## TFIDF

In [6]:
# Index levels of the token table, ordered from coarsest (book) to finest (token).
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

# Each "bag" is a prefix of OHCO: the key columns that identify one unit at
# that level. Every slice is a new list, independent of OHCO itself.
BOOKS = OHCO[:1]
CHAPS = OHCO[:2]
PARAS = OHCO[:3]
SENTS = OHCO[:4]
TOKENS = OHCO[:5]

In [7]:
def create_bow(CORPUS, bag, item_type='term_str'):
    """Count how often each item occurs within each bag.

    Parameters
    ----------
    CORPUS : DataFrame holding the `bag` columns and the `item_type` column.
    bag : sequence of column names that together identify one document
        (e.g. ['book_id'] or ['book_id', 'chap_num']). Any iterable of
        names is accepted, not only a list.
    item_type : name of the column whose values are counted.

    Returns
    -------
    DataFrame indexed by (*bag, item_type) with a single count column `n`.
    """
    # list(bag) lets callers pass a tuple or pandas Index; the original
    # `bag + [item_type]` raised TypeError for anything but a list.
    group_keys = list(bag) + [item_type]
    BOW = CORPUS.groupby(group_keys)[item_type].count().to_frame('n')
    return BOW

In [8]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    """Compute a document-by-term TF-IDF matrix from a bag-of-words table.

    Parameters
    ----------
    BOW : DataFrame with a count column `n` and a MultiIndex; the last index
        level becomes the columns, the remaining level(s) identify the document.
    tf_method : one of 'sum', 'max', 'log', 'raw', 'bool'.
    df_method : one of 'standard', 'textbook', 'sklearn', 'sklearn_smooth'.
    item_type : kept in the signature for callers; not used in the body.

    Returns
    -------
    DataFrame with one row per document and one column per term.

    Raises
    ------
    ValueError
        If `tf_method` or `df_method` is not one of the names listed above.
    """
    # Wide doc-term count matrix; (doc, term) pairs absent from BOW become 0.
    counts = BOW.n.unstack(fill_value=0)

    # Term frequency: each rule maps the count matrix to a same-shaped frame.
    tf_rules = {
        'sum': lambda m: m.div(m.sum(axis=1), axis=0),   # share of the doc's tokens
        'max': lambda m: m.div(m.max(axis=1), axis=0),   # relative to the doc's top term
        'log': lambda m: np.log2(1 + m),
        'raw': lambda m: m,
        'bool': lambda m: m.astype('bool').astype('int'),
    }
    if tf_method not in tf_rules:
        raise ValueError(f"TF method {tf_method} not found.")
    term_freq = tf_rules[tf_method](counts)

    # Document frequency: number of documents in which each term appears.
    doc_freq = counts.astype('bool').sum()
    n_docs = len(counts)

    # Inverse document frequency; evaluated lazily so only the chosen rule runs.
    idf_rules = {
        'standard': lambda: np.log2(n_docs / doc_freq),
        'textbook': lambda: np.log2(n_docs / (doc_freq + 1)),
        'sklearn': lambda: np.log2(n_docs / doc_freq) + 1,
        'sklearn_smooth': lambda: np.log2((n_docs + 1) / (doc_freq + 1)) + 1,
    }
    if df_method not in idf_rules:
        raise ValueError(f"DF method {df_method} not found.")
    inv_doc_freq = idf_rules[df_method]()

    # Frame * Series aligns the IDF values on the term columns.
    return term_freq * inv_doc_freq

In [9]:
# Book-level bag of words: count of each term_str within each book_id
# (BOOKS is ['book_id']).
book_BOW = create_bow(CORPUS, BOOKS)

In [10]:
# Spot-check the first three (book_id, term_str) counts.
book_BOW.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
book_id,term_str,Unnamed: 2_level_1
1,a,1774
1,abandon,2
1,abandoned,3


In [11]:
# Uses get_tfidf's defaults: tf_method='max', df_method='standard'.
# With books as documents, a term present in every book gets
# IDF = log2(N/N) = 0 and therefore a TF-IDF of 0.
TFIDF_books = get_tfidf(book_BOW)

In [12]:
# Preview the wide book-by-term matrix (one row per book_id).
TFIDF_books.head(3)

term_str,a,aaron,aarons,ab,abandon,abandoned,abandons,abased,abate,abated,...,zones,æthereum,ævum,œconomy,œrarium,εὔνοιαι,κοινὰι,νόμος,οἰκός,τύπτω
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.000145,0.000218,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00032,0.00032,0.0,0.0,0.00032,0.00032,0.00032
2,0.0,0.005246,0.000175,0.0,0.000317,0.000198,0.0,0.000175,7.9e-05,0.0,...,0.0,0.000175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.002285,0.000345,0.000172,0.000762,0.0,0.000172,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# stack() turns the wide matrix back into a Series keyed by (book_id, term_str).
# The column assignment aligns on book_BOW's index, so only the pairs already
# present in book_BOW receive a value.
book_BOW['tfidf'] = TFIDF_books.stack()

In [14]:
# Preview counts alongside the newly attached tfidf column.
book_BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tfidf
book_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1
1,a,1774,0.0
1,abandon,2,0.000145
1,abandoned,3,0.000218
1,abbé,2,0.000641
1,abdicated,1,0.00032


***

## Add TF-IDF to CORPUS

In [15]:
# Re-key the tokens by (book_id, term_str) so the book-level TF-IDF values can
# be aligned onto them.
# NOTE(review): this rebinds CORPUS; re-running the cell on its own fails
# because book_id is then no longer a column.
CORPUS = CORPUS.set_index(['book_id', 'term_str'])

In [16]:
# Index alignment gives every token row the TF-IDF of its (book_id, term_str) pair.
CORPUS['tfidf'] = TFIDF_books.stack()

In [17]:
# Restore the full token-level index; OHCO already lists exactly these five
# levels in this order, so reuse it instead of repeating the literal.
CORPUS = CORPUS.reset_index().set_index(OHCO)

In [18]:
# Display the re-indexed token table with its tfidf column.
# NOTE(review): this renders the whole frame (pandas truncates the middle);
# CORPUS.head() would be a lighter preview.
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,term_str,token_str,pos,tfidf
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0,0,subject,SUBJECT,NN,0.000000
1,1,0,0,1,of,OF,IN,0.000000
1,1,0,0,2,the,THE,DT,0.000000
1,1,0,0,3,first,FIRST,JJ,0.000000
1,1,0,0,4,book,BOOK,NN,0.001233
...,...,...,...,...,...,...,...,...
6,32,10,3,50,one,one,CD,0.000000
6,32,10,3,51,from,from,IN,0.000000
6,32,10,3,52,another,another,DT,0.000000
6,32,11,0,0,the,The,DT,0.000000


***

## Add DF-IDF to VOCAB

In [19]:
# Fresh copy of the token table with its default integer index; CORPUS itself
# was re-indexed and given a tfidf column in the previous section.
TOKEN = pd.read_csv('CORPUS.csv')

In [20]:
# Display only: the result is not assigned, so TOKEN keeps its default index
# and the OHCO fields remain ordinary columns.
TOKEN.set_index(OHCO)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0,0,0,SUBJECT,subject,NN
1,1,0,0,1,OF,of,IN
1,1,0,0,2,THE,the,DT
1,1,0,0,3,FIRST,first,JJ
1,1,0,0,4,BOOK,book,NN
...,...,...,...,...,...,...,...
6,32,10,3,50,one,one,CD
6,32,10,3,51,from,from,IN
6,32,10,3,52,another,another,DT
6,32,11,0,0,The,the,DT


In [21]:
# Chapter-level bag of words: count of each term_str within each
# (book_id, chap_num). Reuses create_bow, which performs this same
# groupby/count/to_frame('n') chain, instead of duplicating it inline.
BOW = create_bow(TOKEN, CHAPS)

In [22]:
# Display the chapter-level counts, keyed by (book_id, chap_num, term_str).
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_num,term_str,Unnamed: 3_level_1
1,1,a,3
1,1,about,1
1,1,account,1
1,1,all,1
1,1,and,6
...,...,...,...
6,32,within,1
6,32,word,6
6,32,words,2
6,32,world,1


In [23]:
# Chapter-by-term count matrix. fill_value=0 writes zeros for absent
# (chapter, term) pairs during the unstack itself, so the matrix stays integer
# throughout; the previous unstack().fillna(0).astype('int') first built a
# NaN-filled float matrix of the same size. Same construction as in get_tfidf.
DTCM = BOW.n.unstack(fill_value=0)

In [24]:
# Display the count matrix: one row per (book_id, chap_num), one column per term.
DTCM

Unnamed: 0_level_0,term_str,a,aaron,aarons,ab,abandon,abandoned,abandons,abased,abate,abated,...,zones,æthereum,ævum,œconomy,œrarium,εὔνοιαι,κοινὰι,νόμος,οἰκός,τύπτω
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,41,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,27,59,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,28,64,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,29,52,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,31,77,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Document frequency: for each term, the number of chapters with a nonzero count.
DF = DTCM.ne(0).sum()

In [26]:
# Number of documents: rows of the chapter-level count matrix.
N = DTCM.shape[0]
idf_method = 'standard'

print('IDF method:', idf_method)
if idf_method == 'standard':
    IDF = np.log2(N / DF)
elif idf_method == 'max':
    IDF = np.log2(DF.max() / DF)
elif idf_method == 'smooth':
    IDF = np.log2((1 + N) / (1 + DF)) + 1
else:
    # Without this branch an unknown name would leave IDF undefined (or stale
    # from an earlier run) and the error would only surface in a later cell.
    raise ValueError(f"IDF method {idf_method} not found.")

IDF method: standard


In [27]:
# Rewrap the document-frequency values as a one-column frame keyed by term_str.
DF = (
    pd.DataFrame(DF)
    .reset_index()
    .set_index('term_str')
)

In [28]:
# Rewrap the IDF values as a one-column frame keyed by term_str.
IDF = (
    pd.DataFrame(IDF)
    .reset_index()
    .set_index('term_str')
)

In [29]:
# Key VOCAB by term so the per-term DF and IDF frames can align on it.
# NOTE(review): re-running this cell on its own fails once term_str is
# already the index.
VOCAB = VOCAB.set_index('term_str')

In [30]:
# DF and IDF are one-column frames indexed by term_str; each assignment aligns
# on VOCAB's term_str index.
VOCAB['df'] = DF
VOCAB['idf'] = IDF

In [31]:
# DF-IDF per term: chapter document frequency times its IDF.
VOCAB['dfidf'] = VOCAB['df'] * VOCAB['idf']

In [32]:
# Keep only the combined dfidf score; the two intermediate columns are dropped.
VOCAB = VOCAB.drop(['df', 'idf'], axis='columns')

In [33]:
# Display VOCAB, now keyed by term_str and carrying the new dfidf column.
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,h,max_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
the,41075,0.059972,4.059561,3,0.243461,DT,{'DT'},1,the,the,the,0.000000
of,31825,0.046467,4.427661,2,0.205739,IN,{'IN'},1,of,of,of,0.000000
and,22465,0.032800,4.930142,3,0.161711,CC,{'CC'},1,and,and,and,0.000000
to,22072,0.032227,4.955604,2,0.159702,TO,{'TO'},1,to,to,to,0.000000
that,13160,0.019214,5.701662,4,0.109554,IN,"{'RB', 'VBN', 'VB', 'DT', 'IN', 'WDT'}",1,that,that,that,1.438746
...,...,...,...,...,...,...,...,...,...,...,...,...
ammunition,1,0.000001,19.385534,10,0.000028,NN,{'NN'},0,ammunit,ammunit,ammunit,7.515700
spoiles,1,0.000001,19.385534,7,0.000028,NNS,{'NNS'},0,spoil,spoil,spoil,7.515700
serene,1,0.000001,19.385534,6,0.000028,JJ,{'JJ'},0,seren,seren,ser,7.515700
achor,1,0.000001,19.385534,5,0.000028,NN,{'NN'},0,achor,achor,ach,7.515700


***

## Save Updated VOCAB and CORPUS

In [34]:
# Save the token table with its tfidf column. The path has no placeholders,
# so it is a plain string rather than an f-string.
CORPUS.to_csv("C:/Users/linna/Box/MSDS/DS5001/Final Project/Corpus/CORPUS_tfidf.csv")

In [35]:
# Save the vocabulary table with its dfidf column. The path has no
# placeholders, so it is a plain string rather than an f-string.
VOCAB.to_csv("C:/Users/linna/Box/MSDS/DS5001/Final Project/Corpus/VOCAB_dfidf.csv")