# Most distinctive words

Following on [the previous notebook](6A_document_term_matrices.ipynb)...

How can we directly find the words that distinguish Republican from Democratic State of the Union addresses? Or pre-war from post-war America?


## Preliminaries

In [None]:
# import some things
import os
import pandas as pd
from textblob import TextBlob
pd.set_option("display.max_rows", 20)

In [None]:
# Set text folder and metadata path
# (If you don't have this corpus, you can download it here: https://www.dropbox.com/sh/xd854hgyvbysqlm/AAAhbS6r7MFe4SVg1BFuuMTCa?dl=1)

text_folder = '../corpora/peregrine'
path_to_metadata='../corpora/peregrine/peregrine.csv'

### Functions from last time

In [None]:
# Loop over each of the filenames

def make_dtm(text_folder,n_top_words=1000,normalize=False):
    """Build a document-term matrix from the .txt files in a folder.

    Each file ending in '.txt' is read, lower-cased and tokenized with
    TextBlob. NLTK English stop words, and tokens whose first character is
    not a letter, are dropped.

    Parameters
    ----------
    text_folder : str
        Folder containing the text files.
    n_top_words : int, default 1000
        Keep only the this-many most frequent words (by raw count summed
        over all files) as columns.
    normalize : bool, default False
        If True, cells hold term frequencies (count divided by the number
        of words TextBlob finds in that file); otherwise raw counts.

    Returns
    -------
    pandas.DataFrame
        One row per text file (index named 'fn'), one column per kept word.
        Words absent from a file are 0.
    """
    from collections import Counter
    from nltk.corpus import stopwords

    # Use a distinct name so the imported corpus reader is not shadowed
    stop_words = set(stopwords.words('english'))

    # Filenames are kept in their own list rather than under an 'fn' key of
    # the per-document dict, so a document containing the token 'fn' cannot
    # overwrite its own filename.
    doc_names = []
    doc_rows = []

    # Raw counts summed over the whole corpus (used to pick the top words)
    all_counts = Counter()

    filenames = sorted(os.listdir(text_folder))
    for i,fn in enumerate(filenames):
        if not i%10: print('>> looping through #',i,'of',len(filenames),'files:',fn)

        # Only text files belong in the matrix
        if not fn.endswith('.txt'): continue

        with open(os.path.join(text_folder, fn)) as file:
            txt = file.read()

        blob = TextBlob(txt.lower())

        # Denominator for term frequency: all tokens, stop words included
        num_words = len(blob.words)

        row = {}
        for word,count in blob.word_counts.items():
            # Skip stop words
            if word in stop_words: continue

            # Skip punctuation, numbers and (defensively) empty tokens
            if not word or not word[0].isalpha(): continue

            row[word] = count / num_words if normalize else count

            # The corpus-wide ranking always uses raw counts
            all_counts[word] += count

        doc_names.append(fn)
        doc_rows.append(row)

    # Column order: most frequent word first
    word_columns = [word for word, _ in all_counts.most_common(n_top_words)]

    df = pd.DataFrame(
        doc_rows,
        index=pd.Index(doc_names, name='fn'),
        columns=word_columns,
    )
    return df.fillna(0)

In [None]:
# Make the document term matrix
dtm = make_dtm(text_folder,normalize=True)

In [None]:
# Get the metadata for this corpus
df_meta = pd.read_csv(path_to_metadata).set_index('fn')
df_meta

In [None]:
# Add the metadata
dtm_meta=df_meta.merge(dtm,on='fn')
dtm_meta

## Finding the most distinctive words

### (1) Difference of means

In [None]:
# Mean value of every numeric column within each 'season' group.
# numeric_only=True keeps any text metadata columns out of the average
# (recent pandas versions raise a TypeError on them otherwise).
dtm_meta.groupby('season').mean(numeric_only=True)

In [None]:
# Same group means, transposed (one row per column of dtm_meta), multiplied
# by 1000 and rounded to 2 decimals for readability.
# numeric_only=True keeps any text metadata columns out of the average.
round(dtm_meta.groupby('season').mean(numeric_only=True).T * 1000,2)

In [None]:
# Mean of every numeric column per party, transposed so each word is a row,
# and multiplied by 1000.
# numeric_only=True keeps any text metadata columns out of the average
# (recent pandas versions raise a TypeError on them otherwise).
dtm_meta_T = dtm_meta.groupby('Party').mean(numeric_only=True).T * 1000
dtm_meta_T

In [None]:
dtm_meta_T['D-R']=dtm_meta_T['Democrat'] - dtm_meta_T['Republican']
round(dtm_meta_T,2)

In [None]:
round(dtm_meta_T.sort_values('D-R'),2)

In [None]:
dtm_meta.boxplot('government',by='Party',figsize=(8,5))

In [None]:
dtm_meta.boxplot('war',by='Party',figsize=(8,5))

In [None]:
dtm_meta.sort_values("n't",ascending=False)

In [None]:
# Sort by government
dtm_meta.sort_values('government',ascending=False)

In [None]:
# Why is Nixon using government so much?
# Build the path to the 1971 Nixon file and print it as a sanity check
nixon_path = os.path.join(text_folder, '1971.Nixon.txt')
print(nixon_path)

# Open the file and read its whole contents into one string
with open(nixon_path) as file:
    nixon_txt=file.read()
    
# make nltk version of the text (useful for concordance):
# tokenize the raw string, then wrap the tokens in an nltk Text object
import nltk
nixon_words = nltk.word_tokenize(nixon_txt)
nixon_nltk = nltk.text.Text(nixon_words)

# get concordance: show 'government' in its surrounding context
# (width=100 and lines=1000 are passed straight to nltk)
nixon_nltk.concordance('government',width=100,lines=1000)

In [None]:
def concordance(text_folder,filename,word,width=100,lines=1000):
    """Print a keyword-in-context listing of `word` for one corpus file.

    The path text_folder/filename is printed, the file is read and
    tokenized with nltk, and occurrences of `word` are shown through
    nltk's concordance view. `width` and `lines` are forwarded to nltk
    unchanged. Nothing is returned.
    """
    source_path = os.path.join(text_folder, filename)
    print(source_path)

    # Read the complete text in one go
    with open(source_path) as handle:
        raw_text = handle.read()

    # nltk's concordance view lives on a Text object built from tokens
    import nltk
    corpus_text = nltk.text.Text(nltk.word_tokenize(raw_text))
    corpus_text.concordance(word, width=width, lines=lines)
    

In [None]:
concordance(text_folder,'1900.McKinley.txt',"islands")

In [None]:
dtm_meta.boxplot('war',by='Party',figsize=(8,5))

### (2) TF-IDF

#### TF: Term Frequency

<center><img src="https://latex.codecogs.com/png.latex?TF = \frac{n_w}{n_d}"></center>

Where:
* *n_w* is the number of times a given word *w* appears in a document *d*.
* *n_d* is the total number of words in that document.

In [None]:
# set a given word?
given_word='jobs'

In [None]:
# We already have that calculated here:
tf_series = dtm[given_word]
tf_series.sort_values(ascending=False)

#### IDF: Inverse Document Frequency

<center><img src="https://latex.codecogs.com/png.latex?IDF = \log \left( \frac{c_d}{i_d} \right)"></center>

Where:
* <img src="https://latex.codecogs.com/png.latex?{c_d}"> is the count of documents in the corpus.
* <img src="https://latex.codecogs.com/png.latex?{i_d}"> is the number of documents in which that word appears.

In [None]:
# Get the number of documents
num_docs = len(dtm)
num_docs

In [None]:
# Show the documents in which the given word appears (term frequency > 0)
dtm[dtm[given_word]>0][given_word]

In [None]:
num_docs_with_word=len(dtm[dtm[given_word]>0])

In [None]:
import numpy as np
idf = np.log(num_docs/num_docs_with_word)
idf

In [None]:
tfidf_series = tf_series * idf
tfidf_series.sort_values(ascending=False)

In [None]:
pd.DataFrame({'tfidf':tfidf_series, 'tf':tf_series}).plot(x='tf',y='tfidf',kind='scatter')

In [None]:
# make
def to_tfidf(dtm):
    """Convert a document-term matrix into a tf-idf matrix.

    Every column (word) is multiplied by its inverse document frequency,
    idf = log(number of documents / number of documents where the word's
    value is > 0).

    Parameters
    ----------
    dtm : pandas.DataFrame
        One row per document, one column per word; cells hold term
        frequencies (or counts).

    Returns
    -------
    pandas.DataFrame
        Same index and columns as `dtm`, holding tf * idf. A word that
        occurs in no document gets idf 0 (so its column stays 0) instead
        of triggering a division by zero.
    """
    num_docs = len(dtm)

    # Number of documents in which each word occurs at least once
    doc_freq = (dtm > 0).sum(axis=0)

    # Mask zero document frequencies before dividing, then give those
    # words an idf of 0
    idf = np.log(num_docs / doc_freq.where(doc_freq > 0)).fillna(0.0)

    # Scale every column by its word's idf in a single vectorised step
    return dtm.mul(idf, axis='columns')

In [None]:
dtm_tfidf=to_tfidf(dtm)
dtm_tfidf

In [None]:
word='america'
dtm[word].nlargest(10)

In [None]:
dtm_tfidf[word].nlargest(10)

In [None]:
# One point per document: term frequency against tf-idf for `word`.
# kind='scatter' avoids joining the documents with a line in index order.
pd.DataFrame({'tf':dtm[word], 'tfidf':dtm_tfidf[word]}).plot(x='tf',y='tfidf',kind='scatter')

In [None]:
fn='2002.Bush.txt'
dtm_tfidf.loc[fn].nlargest(10)

In [None]:
dtm.loc[fn].nlargest(10)

In [None]:
fn='2017.Trump.txt'
dtm_tfidf.loc[fn].nlargest(10)

In [None]:
n_words = 10
# Walk the documents from the last index entry back to the first
for index in dtm_tfidf.index[::-1]:
    # the n_words largest tf-idf words for this document, best first
    top_words_list = dtm_tfidf.loc[index].nlargest(n_words).index.tolist()

    # heading (upper-cased index label), the words, then a blank line
    print('##',index.upper())
    print(', '.join(top_words_list))
    print()

### (3) Fisher's exact test

In [None]:
# For this we need a document-term matrix *of raw counts*
dtm_counts = make_dtm(text_folder,normalize=False)

In [None]:
dtm_counts_meta = df_meta.merge(dtm_counts,on='fn')
dtm_counts_meta

In [None]:
# Split the raw-count matrix by party.
# Look up each document's party by filename first, so the boolean mask has
# exactly dtm_counts' index even if the metadata rows are in a different
# order or cover a different set of files (documents without metadata end
# up in neither group).
party_by_doc = df_meta['Party'].reindex(dtm_counts.index)
Rs=dtm_counts[party_by_doc == 'Republican']
Ds=dtm_counts[party_by_doc == 'Democrat']
Rs

In [None]:
word='immigration'
sum_word_Rs = Rs[word].sum()
sum_word_Ds = Ds[word].sum()

print(sum_word_Rs,sum_word_Ds)

In [None]:
Rs.sum()

In [None]:
sum_allword_Rs=Rs.sum().sum()
sum_allword_Rs

In [None]:
sum_allword_Ds=Ds.sum().sum()
sum_allword_Ds

In [None]:
sum_notword_Rs = sum_allword_Rs - sum_word_Rs
sum_notword_Rs

In [None]:
sum_notword_Ds = sum_allword_Ds - sum_word_Ds
sum_notword_Ds

In [None]:
contingency_table = [
    [sum_word_Rs, sum_notword_Rs],
    [sum_word_Ds, sum_notword_Ds]
]

In [None]:
contingency_table

In [None]:
from scipy.stats import fisher_exact

oddsratio, pvalue = fisher_exact(contingency_table)
oddsratio, pvalue

#### "Stacking" a DTM

In [None]:
dtm_counts.stack()

In [None]:
dtm_stacked = dtm_counts.stack().reset_index()
dtm_stacked

In [None]:
dtm_stacked.columns = ['fn','word','count']
dtm_stacked

In [None]:
# Pivot table back to unstacked original form
dtm_stacked.pivot(index='fn',columns='word',values='count')

In [None]:
# Merge stacked DTM with meta
dtm_stacked_meta = df_meta.merge(dtm_stacked,on='fn')
dtm_stacked_meta

In [None]:
num_word_Rs=dtm_stacked_meta.query('word == "government" & Party == "Republican"')['count'].sum()
num_word_notRs=dtm_stacked_meta.query('word == "government" & Party != "Republican"')['count'].sum()
num_notword_Rs=dtm_stacked_meta.query('word != "government" & Party == "Republican"')['count'].sum()
num_notword_notRs=dtm_stacked_meta.query('word != "government" & Party != "Republican"')['count'].sum()

In [None]:
contingency_table = [
    [num_word_Rs, num_word_notRs],
    [num_notword_Rs, num_notword_notRs]
]
fisher_exact(contingency_table)

In [None]:
# Try every word, for every party!
# For each (party, word) pair, run Fisher's exact test on the 2x2 table
#     [[word in party,       word outside party],
#      [other words in party, other words outside party]]
# and collect the odds ratio and p-value in df_mdw (one row per pair,
# with the party stored in the 'group' column).
result_list=[]

# Corpus-wide count of each word, in order of first appearance
word_totals = dtm_stacked_meta.groupby('word', sort=False)['count'].sum()

# Republican is handled last so that `party` and the other loop variables
# are left holding the Republican values when the cell finishes.
for party in ['Democrat', 'Republican']:
    # Count of each word inside this party's texts (0 if it never occurs there)
    in_party = dtm_stacked_meta['Party'] == party
    word_in_party = (
        dtm_stacked_meta[in_party]
        .groupby('word', sort=False)['count'].sum()
        .reindex(word_totals.index, fill_value=0)
    )

    # Everything else: texts of any other party, or with no party recorded
    word_not_party = word_totals - word_in_party

    # Grand totals, from which the "other words" cells are derived
    total_in_party = word_in_party.sum()
    total_not_party = word_not_party.sum()

    for word, n_in, n_out in zip(word_totals.index, word_in_party, word_not_party):
        num_word_Rs = int(n_in)
        num_word_notRs = int(n_out)
        num_notword_Rs = int(total_in_party - n_in)
        num_notword_notRs = int(total_not_party - n_out)
        contingency_table = [
            [num_word_Rs, num_word_notRs],
            [num_notword_Rs, num_notword_notRs]
        ]
        oddsratio,pvalue=fisher_exact(contingency_table)

        # Report only strong, significant over-representation
        if oddsratio>2 and pvalue<0.05:
            print('{oddsratio} to 1 = the odds of "{word}" appearing in {party} (vs. non-{party}) texts'.format(
                word=word,party=party,oddsratio=round(oddsratio,2)))

        result_dict={}
        result_dict['word']=word
        result_dict['oddsratio']=oddsratio
        result_dict['pvalue']=pvalue
        result_dict['group']=party
        result_list.append(result_dict)

df_mdw = pd.DataFrame(result_list)

In [None]:
df_mdw[df_mdw.group=='Democrat'].sort_values('oddsratio',ascending=False).head(20)

In [None]:
df_mdw[df_mdw.group=='Republican'].sort_values('oddsratio',ascending=False).head(20)

In [None]:
# Forest?
dtm['forest'].nlargest(10)

### (4) Mann-Whitney U test

In [None]:
# See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
from scipy.stats import mannwhitneyu

In [None]:
# Split the term-frequency matrix by party.
# Look up each document's party by filename first, so the boolean mask has
# exactly dtm's index even if the metadata rows are in a different order or
# cover a different set of files.
party_by_doc = df_meta['Party'].reindex(dtm.index)
Rs=dtm[party_by_doc=='Republican']
Ds=dtm[party_by_doc=='Democrat']

In [None]:
word='forest'
x=Rs[word]
y=Ds[word]
mannwhitneyu(x,y)

In [None]:
#dtm_stacked_meta.query('Party == "Republican" and word=="forest"')

In [None]:
def compute_mannwhitney(group1,group2,words=None):
    """Run a Mann-Whitney U test for each word across two groups of documents.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Document-term matrices (rows = documents, columns = words).
    words : iterable of str, optional
        Words to test. If omitted or empty, every column present in both
        groups is tested, in the column order of `group1`.

    Returns
    -------
    pandas.DataFrame
        Columns 'word', 'mannwhitney_U' and 'mannwhitney_pvalue', one row
        per tested word. The columns are present even when no word is
        tested.
    """
    # Materialise first: truth-testing a pandas Index or numpy array
    # directly raises "truth value is ambiguous"
    if words is not None:
        words = list(words)

    if not words:
        # Keep group1's column order so the row order is reproducible
        shared = set(group2.columns)
        words = [word for word in group1.columns if word in shared]

    result_list = []
    for word in words:
        mwU, pvalue = mannwhitneyu(group1[word], group2[word])
        result_list.append({
            'word': word,
            'mannwhitney_U': mwU,
            'mannwhitney_pvalue': pvalue,
        })

    return pd.DataFrame(
        result_list,
        columns=['word', 'mannwhitney_U', 'mannwhitney_pvalue'],
    )

In [None]:
df_mannwhitney=compute_mannwhitney(Rs,Ds)

In [None]:
# Top 20 Republicans
df_mannwhitney.sort_values('mannwhitney_U',ascending=True).head(20)

In [None]:
# Top 20 Democrats
df_mannwhitney.sort_values('mannwhitney_U',ascending=False).head(20)

In [None]:
df_mannwhitney[df_mannwhitney.word=="iraq"]

In [None]:
df_mdw[df_mdw.word=="iraq"]

In [None]:
dtm_meta_T.loc['iraq']

In [None]:
# Compare the two measures word by word: Fisher odds ratio (log scale)
# against the Mann-Whitney U statistic. Only the Republican odds ratios are
# used, so each word contributes exactly one point to the plot.
df_mdw[df_mdw.group=='Republican'].merge(df_mannwhitney,on='word').plot(x='oddsratio',logx=True,y='mannwhitney_U',kind='scatter',figsize=(8,8))