In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import inaugural
from nltk.corpus import stopwords
import math
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Split the inaugural-address file ids on the integer formed by their first
# four characters: values up to and including 1913 go to the pre-WWI list,
# values greater than 1917 go to the post-WWI list, and anything in between
# (1914-1917) is left out of both.
all_fileids = inaugural.fileids()
preWWI_lst = [fid for fid in all_fileids if int(fid[:4]) <= 1913]
postWWI_lst = [fid for fid in all_fileids if int(fid[:4]) > 1917]

In [7]:
# One token list per pre-WWI address, in the same order as preWWI_lst.
pre_words_lst = [list(inaugural.words(fileid)) for fileid in preWWI_lst]

# Flatten the per-address lists into one lower-cased token stream.
pre_words_lst2 = [
    word.lower()
    for address_words in pre_words_lst
    for word in address_words
]
print(len(pre_words_lst2))

89718


In [8]:
# One token list per post-WWI address, in the same order as postWWI_lst.
post_words_lst = [list(inaugural.words(fileid)) for fileid in postWWI_lst]

# Flatten the per-address lists into one lower-cased token stream.
post_words_lst2 = [
    word.lower()
    for address_words in post_words_lst
    for word in address_words
]
print(len(post_words_lst2))


54361


In [9]:
def make_cfdist(text, size=3):
    """
    Count context words and unordered word pairs over sliding windows.

    Every index from `size` up to (but not including) `len(text) - size` is
    used as a window centre; its context is the `size` tokens on each side.
    Within a single window each distinct context token is counted once, and
    each distinct (centre, context) pair of differing tokens is counted once.
    A pair is stored with the smaller token first, so (a, b) and (b, a) share
    one key.

    Returns a two-element list: [unigram FreqDist, pair FreqDist].
    """
    context_tokens = []
    context_pairs = []

    for center in range(size, len(text) - size):
        focus = text[center]
        # Sets, so that a token or pair repeated inside one window counts once.
        window_tokens = set()
        window_pairs = set()

        # Walk outwards from the centre: offset 1 .. size on both sides.
        for offset in range(1, size + 1):
            for neighbour in (text[center - offset], text[center + offset]):
                window_tokens.add(neighbour)
                # A word is never paired with itself.
                if focus != neighbour:
                    if focus < neighbour:
                        window_pairs.add((focus, neighbour))
                    else:
                        window_pairs.add((neighbour, focus))

        # Hand this window's distinct tokens and pairs to the running lists.
        context_tokens.extend(window_tokens)
        context_pairs.extend(window_pairs)

    # Frequency tables over everything collected, returned as [unigrams, pairs].
    return [nltk.FreqDist(context_tokens), nltk.FreqDist(context_pairs)]

## Call `make_cfdist` to build the unigram and bigram (co-occurrence) counts for the pre-WWI and post-WWI datasets

In [10]:
# Build the windowed unigram and pair counts (3 tokens on each side of the
# centre word) separately for the two periods.
pre_ufd, pre_cfd = make_cfdist(pre_words_lst2, size=3)
post_ufd, post_cfd = make_cfdist(post_words_lst2, size=3)

# `text` ends up bound to the post-WWI token stream.
text = post_words_lst2

## Pre-WWI PMI calculation

In [11]:
# Load the stop-word list once and keep it as a set, so each membership test
# is a hash lookup instead of re-reading and scanning the list per bigram.
# NOTE(review): stopwords.words() is called without a language, which appears
# to load every language's list rather than only English -- confirm intended.
stop_words = set(stopwords.words())

# Keep only the bigrams whose two words are both longer than two characters
# and are not stop words.
pre_cfd_lst = [
    pair for pair in pre_cfd.keys()
    if len(pair[0]) > 2 and len(pair[1]) > 2
    and pair[0] not in stop_words and pair[1] not in stop_words
]

# Frequency of each surviving bigram, aligned with pre_cfd_lst
val_lst = [pre_cfd[pair] for pair in pre_cfd_lst]

# The two words of each surviving bigram, as separate columns
key1_lst = [pair[0] for pair in pre_cfd_lst]
key2_lst = [pair[1] for pair in pre_cfd_lst]

# Combine the bigram terms with their frequency and save the table
df = pd.DataFrame(list(zip(key1_lst, key2_lst, val_lst)), columns=["key1", "key2", "freq"])
df.to_csv("pre_PMI_bigrams_filter.csv", index=False)
# The table is not read back from the CSV: pandas' default NA parsing would
# turn tokens such as "null" or "nan" into NaN and break the lookups below.

# Calculate PMI for pre-WWI
# NOTE(review): N is the total frequency of the *filtered* bigrams and the
# "- 3" offset is not explained anywhere in this notebook; both are kept
# unchanged -- confirm against the intended PMI definition.
N = df.freq.sum()
pre_PMI_lst = []
# Each (key1, key2) pair is unique (it was a FreqDist key), so the bigram count
# is simply the row's own "freq" value -- no need to search the frame for it.
for w1, w2, bigram_cnt in zip(df.key1, df.key2, df.freq):
    bigram_cnt = int(bigram_cnt)
    w1_cnt = pre_ufd[w1]
    w2_cnt = pre_ufd[w2]

    pre_PMI_lst.append(math.log2((bigram_cnt * (N - 3)) / (w1_cnt * w2_cnt)))

# Pick out all the bigrams that include the word "economy" in either position
df["PMI"] = pre_PMI_lst
pre_arr = np.array(df)
has_economy = ((df.key1 == "economy") | (df.key2 == "economy")).to_numpy()
# One array per matching row: [key1, key2, freq, PMI]
pre_final_lst = list(pre_arr[has_economy])

## Post-WWI PMI calculation

In [12]:
# Load the stop-word list once and keep it as a set, so each membership test
# is a hash lookup instead of re-reading and scanning the list per bigram.
# NOTE(review): stopwords.words() is called without a language, which appears
# to load every language's list rather than only English -- confirm intended.
stop_words = set(stopwords.words())

# Keep only the bigrams whose two words are both longer than two characters
# and are not stop words.
post_cfd_lst = [
    pair for pair in post_cfd.keys()
    if len(pair[0]) > 2 and len(pair[1]) > 2
    and pair[0] not in stop_words and pair[1] not in stop_words
]

# Frequency of each surviving bigram, aligned with post_cfd_lst
val_lst = [post_cfd[pair] for pair in post_cfd_lst]

# The two words of each surviving bigram, as separate columns
key1_lst = [pair[0] for pair in post_cfd_lst]
key2_lst = [pair[1] for pair in post_cfd_lst]

# Combine the bigram terms with their frequency and save the table
df = pd.DataFrame(list(zip(key1_lst, key2_lst, val_lst)), columns=["key1", "key2", "freq"])
df.to_csv("post_PMI_bigrams_filter.csv", index=False)
# The table is not read back from the CSV: pandas' default NA parsing would
# turn tokens such as "null" or "nan" into NaN and break the lookups below.

# Calculate PMI for post-WWI
# NOTE(review): N is the total frequency of the *filtered* bigrams and the
# "- 3" offset is not explained anywhere in this notebook; both are kept
# unchanged -- confirm against the intended PMI definition.
N = df.freq.sum()
post_PMI_lst = []
# Each (key1, key2) pair is unique (it was a FreqDist key), so the bigram count
# is simply the row's own "freq" value -- no need to search the frame for it.
for w1, w2, bigram_cnt in zip(df.key1, df.key2, df.freq):
    bigram_cnt = int(bigram_cnt)
    w1_cnt = post_ufd[w1]
    w2_cnt = post_ufd[w2]

    post_PMI_lst.append(math.log2((bigram_cnt * (N - 3)) / (w1_cnt * w2_cnt)))

# Pick out all the bigrams that include the word "economy" in either position
df["PMI"] = post_PMI_lst
post_arr = np.array(df)
has_economy = ((df.key1 == "economy") | (df.key2 == "economy")).to_numpy()
# One array per matching row: [key1, key2, freq, PMI]
post_final_lst = list(post_arr[has_economy])

## Write the bigrams and their PMI to a text file

In [13]:
# Turn each list of "economy" rows into a table holding the 45 highest-PMI
# bigrams.  Sorting on (PMI desc, key1, key2) *before* taking the head makes
# the cut deterministic when several bigrams tie on PMI at the boundary.
pmi_columns = ["key1", "key2", "freq", "PMI"]

# reshape(-1, 4) keeps the array two-dimensional even when no row matched,
# so building the DataFrame does not fail on an empty result.
pre_final_lst = np.array(pre_final_lst, dtype=object).reshape(-1, len(pmi_columns))
df_pre = (
    pd.DataFrame(pre_final_lst, columns=pmi_columns)
    .astype({"freq": "int64", "PMI": "float64"})
    .sort_values(by=["PMI", "key1", "key2"], ascending=[False, True, True])
    .head(45)
)

post_final_lst = np.array(post_final_lst, dtype=object).reshape(-1, len(pmi_columns))
df_post = (
    pd.DataFrame(post_final_lst, columns=pmi_columns)
    .astype({"freq": "int64", "PMI": "float64"})
    .sort_values(by=["PMI", "key1", "key2"], ascending=[False, True, True])
    .head(45)
)

# Write the two rankings side by side.  The tables may differ in length, so
# iterate over the longer one and leave the shorter side blank instead of
# indexing past its end (or silently dropping the extra rows).
n_rows = max(len(df_pre), len(df_post))
with open("pmi.txt", "w") as f:
    f.write("{0:>12}{1:>40}\n".format("pre-WWI","post-WWI" ))
    f.write("%s\n"%("="*80))
    for i in range(n_rows):
        if i < len(df_pre):
            pre_row = df_pre.iloc[i]
            left = "{0:2}  {1:<26} {2:0.7f}".format(
                i+1, "("+pre_row["key1"]+", "+pre_row["key2"]+")", pre_row["PMI"])
        else:
            # No pre-WWI entry at this rank: keep the rank number and pad the
            # column so the post-WWI side stays roughly aligned.
            left = "{0:2}  {1:<26} {2:<10}".format(i+1, "", "")

        if i < len(df_post):
            post_row = df_post.iloc[i]
            right = "{0:>3}{1:<27} {2:0.7f}".format(
                " ", "("+post_row["key1"]+", "+post_row["key2"]+")", post_row["PMI"])
        else:
            right = ""

        f.write(left + right + "\n")
# The with-block closes the file; no explicit close() is needed.