# Chi-Square Test
### Determine which words are significantly associated with low vs. high entropy chunks

### Import Python Libraries

In [1]:
##### Imports

import numpy as np

from matplotlib import pyplot

%matplotlib inline

import nltk, re, pprint
import pandas as pd
from pandas import DataFrame
import operator
from pandas import Series, DataFrame
from scipy import stats
from numpy.random import permutation, shuffle
import string
from scipy.stats import chi2_contingency

import sys
import os

from nltk.corpus.reader import *
from nltk.corpus.reader.util import *
from nltk.text import Text

from IPython.display import clear_output

### Load Functions

In [2]:
#Function to clean punctuation
puncs = ['、','。','「','」','…','！','――','？','ゝ','『','』','（','）','／','＼','々','ーーー','］','・','ゞ','［','-','─','<',
         '＃','△','※','＊', '!', '\"','#', '\'', '^', '\(', '\)', '.', ',', '+']

def cleaner(text):
    for punc in puncs:
        text = text.replace(punc, '')
    text = re.sub(r'   ', ' ', text)          #get rid of double spaces
    return text

### Build DataFrame for Lowest and Highest Entropy Chunks for Japanese Corpus

In [24]:
#store word_freq data and class labels here
word_dict = {}
meta_rows = []

#identify files where chunks are stored
low_ent_file = r"Results/LowEntJP.txt"
high_ent_file = r"Results/HighEntJP.txt"

#clean and segment texts
raw = open(low_ent_file, encoding="utf-8")
low_ent_raw = raw.read()
low_ent_raw = cleaner(low_ent_raw)  #clean punctuation
low_ent_chunks = re.split(r'\n\n', low_ent_raw)[0:-1]

raw = open(high_ent_file, encoding="utf-8")
high_ent_raw = raw.read()
high_ent_raw = cleaner(high_ent_raw)  #clean punctuation
high_ent_chunks = re.split(r'\n\n', high_ent_raw)[0:-1]

#now turn chunk lists into a single DTM
j = 0
for chunk in low_ent_chunks:
    #split the text into a list of individual tokens
    tokens = chunk.split(' ')
    while '' in tokens: tokens.remove('')  #remove blank spaces
    while r'\u3000' in tokens: tokens.remove(r'\u3000')  #remove blank spaces

    #produce the frequency list
    fdist = nltk.FreqDist(tokens)
    freq_pairs = fdist.items()
    
    words = {}
    class_label = "low_ent"
    
    for pair in freq_pairs:
        words[pair[0]] = pair[1]
    
    word_dict[j] = words
    meta_data = pd.DataFrame({'class_label': class_label}, index=[0])
    meta_rows.append(meta_data)
    
    j+=1

for chunk in high_ent_chunks:
    #split the text into a list of individual tokens
    tokens = chunk.split(' ')
    while '' in tokens: tokens.remove('')  #remove blank spaces
    while r'\u3000' in tokens: tokens.remove(r'\u3000')  #remove blank spaces

    #produce the frequency list
    fdist = nltk.FreqDist(tokens)
    freq_pairs = fdist.items()
    
    words = {}
    class_label = "high_ent"
    
    for pair in freq_pairs:
        words[pair[0]] = pair[1]
    
    word_dict[j] = words
    meta_data = pd.DataFrame({'class_label': class_label}, index=[0])
    meta_rows.append(meta_data)
    
    j+=1    

#now combine everything from low and high entropy chunks
all_words = pd.DataFrame(word_dict, index=word_dict.keys(), dtype='float32') 
#all_words = pd.DataFrame.from_dict(word_dict)
all_words = all_words.fillna(0)
all_words = all_words.astype(int)

all_meta = pd.concat(meta_rows, join='outer')
all_meta.reset_index(drop=True, inplace=True)

dtm = pd.concat([all_meta, all_words], axis= 1, join="outer")
dtm.shape

(41084, 1307)

In [53]:
all_meta = pd.concat(meta_rows, join='outer')
all_meta.reset_index(drop=True, inplace=True)

In [95]:
all_words = pd.DataFrame.from_dict(word_dict)
all_words = all_words.fillna(0)
all_words = all_words.astype(int)
all_words.insert(0, "class_label", all_meta.class_label.tolist())
all_words.shape

ValueError: Length of values does not match length of index

In [94]:
temp = all_meta.class_label.tolist()
temp[0:5]

['low_ent', 'low_ent', 'low_ent', 'low_ent', 'low_ent']

In [82]:
#dtm = pd.concat([all_words, all_meta], axis=1, join="outer")
dtm.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [90]:
all_words[0:10]

Unnamed: 0,class_label,0,1,2,3,4,5,6,7,8,...,1296,1297,1298,1299,1300,1301,1302,1303,1304,1305
(,,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
),,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
:,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Chi-Square Test

In [None]:
stats = []

labels = list(set(dtm.class_label.values))

class_counts = {}
#store class labels and counts to use below
for i in range(len(dtm.class_label.value_counts().keys())):
    class_counts[dtm.class_label.value_counts().keys()[i]] = dtm.class_label.value_counts()[i]

for feature in dtm.columns[1:]:
    feat_counts = {}
    for label in labels:
        feat_counts[label] = dtm[(dtm['class_label'] == label) & (dtm[feature] > 0)].shape[0]
    
    no_docs = sum(feat_counts.values())
    
    feat_inv = {}
    for key in feat_counts:
        feat_inv[key] = class_counts[key] - feat_counts[key]

    a = []
    b = []
    for key in feat_counts:
        a.append(feat_inv[key])
        b.append(feat_counts[key])
    obs = np.array([a, b])
    
    #only include features that have a count of 4 or more for each position in table
    x = [el for el in obs[0]]
    y = [el for el in obs[1]]
    for el in y:
        x.append(el)
    
    if len([item for item in x if item >= 4]) == 4:  #make sure all elements greater than 4
        chi_sq_stat = chi2_contingency(obs)[0]
        #append the data
        stats.append((feature, chi_sq_stat, no_docs, feat_counts[labels[0]], feat_counts[labels[1]]))

#turn into a dataframe and sort by chi_sq score
df = pd.DataFrame(stats, columns=['feature', 'chi_sq', 'no_docs', 'high_ent_freq', 'low_ent_freq'])
sorted_df = df.sort_values(by='chi_sq', ascending=False)

#view the top 10 rows
sorted_df[0:10]

### Run Same Procedure for Chinese Corpus

In [None]:
#now do this for the Chinese texts
#a function to strip punctuation
puncs = ['】','【','○','■','▉','●','，','。','“','”','：',',','？',':','！','、','?','；','!','’','‘',';','.','）','（','》',
         '《','……','^','I','-','*','——','—','......','l','(',')','①','!......','>','」','「','•','A','B','C','D','E','F',
         'G','H','J','K','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','1','2','3','4','5','6','7','8','9',
         '0','一','二','三','四','五','六','七','八','九','十','百','〇','\"','第','回','a','b','c','d','e','f','g','h','i',
        'j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

def cleaner(text):
    for punc in puncs:
        text = text.replace(punc, '')
    text = re.sub(r'   ', ' ', text)    #turn double spaces to single spaces
    return text

In [None]:
# build a DTM for the low and high entropy chunks

#store word_freq data and class labels here
word_dict = {}
meta_rows = []

low_ent_file = r"LowEntCH.txt"
high_ent_file = r"HighEntCH.txt"

#clean and segment texts
raw = open(low_ent_file)
low_ent_raw = raw.read()
low_ent_raw = cleaner(low_ent_raw)  #clean punctuation
low_ent_chunks = re.split(r'\r\n\r\n', low_ent_raw)[0:-1]

raw = open(high_ent_file)
high_ent_raw = raw.read()
high_ent_raw = cleaner(high_ent_raw)  #clean punctuation
high_ent_chunks = re.split(r'\r\n\r\n', high_ent_raw)[0:-1]

#now turn chunk lists into a single DTM
j = 0
for chunk in low_ent_chunks[0:100]:
    #split the text into a list of individual tokens
    tokens = chunk.split(' ')
    while '' in tokens: tokens.remove('')  #remove blank spaces
    while ur'\u3000' in tokens: tokens.remove(ur'\u3000')  #remove blank spaces

    #produce the frequency list
    fdist = nltk.FreqDist(tokens)
    freq_pairs = fdist.items()
    
    words = {}
    class_label = "low_ent"
    
    for pair in freq_pairs:
        words[pair[0]] = pair[1]
    
    word_dict[j] = words
    meta_data = pd.DataFrame({'class_label': class_label}, index=[0])
    meta_rows.append(meta_data)
    
    j+=1

for chunk in high_ent_chunks[0:100]:
    #split the text into a list of individual tokens
    tokens = chunk.split(' ')
    while '' in tokens: tokens.remove('')  #remove blank spaces
    while ur'\u3000' in tokens: tokens.remove(ur'\u3000')  #remove blank spaces

    #produce the frequency list
    fdist = nltk.FreqDist(tokens)
    freq_pairs = fdist.items()
    
    words = {}
    class_label = "high_ent"
    
    for pair in freq_pairs:
        words[pair[0]] = pair[1]
    
    word_dict[j] = words
    meta_data = pd.DataFrame({'class_label': class_label}, index=[0])
    meta_rows.append(meta_data)
    
    j+=1    

#now combine everything
all_words = pd.DataFrame(word_dict.values(), index=word_dict.keys(), dtype='float32') 
all_words = all_words.fillna(0)
all_words = all_words.astype(int)

all_meta = pd.concat(meta_rows, join='outer')
all_meta.reset_index(drop=True, inplace=True)

dtm = pd.concat([all_meta, all_words], axis= 1, join="outer")
dtm.shape

In [None]:
stats = []

labels = list(set(dtm.class_label.values))

class_counts = {}
#store class labels and counts to use below
for i in range(len(dtm.class_label.value_counts().keys())):
    class_counts[dtm.class_label.value_counts().keys()[i]] = dtm.class_label.value_counts()[i]

for feature in dtm.columns[1:]:
    feat_counts = {}
    for label in labels:
        feat_counts[label] = dtm[(dtm['class_label'] == label) & (dtm[feature] > 0)].shape[0]
    
    no_docs = sum(feat_counts.values())
    
    feat_inv = {}
    for key in feat_counts:
        feat_inv[key] = class_counts[key] - feat_counts[key]

    a = []
    b = []
    for key in feat_counts:
        a.append(feat_inv[key])
        b.append(feat_counts[key])
    obs = np.array([a, b])
    
    #only include features that have a count of 4 or more for each position in table
    x = [el for el in obs[0]]
    y = [el for el in obs[1]]
    for el in y:
        x.append(el)
    
    if len([item for item in x if item >= 4]) == 4:  #make sure all elements greater than 4
        chi_sq_stat = chi2_contingency(obs)[0]
        #append the data
        stats.append((feature, chi_sq_stat, no_docs, feat_counts[labels[0]], feat_counts[labels[1]]))

df2 = pd.DataFrame(stats, columns=['feature', 'chi_sq', 'no_docs', 'high_ent_freq', 'low_ent_freq'])
sorted_df2 = df2.sort_values(by='chi_sq', ascending=False)
sorted_df2[0:10]

### Output Results

In [None]:
#export the dfs to excel files
from pandas import ExcelWriter
writer = ExcelWriter('./DistinctEntTerms.xlsx')
sorted_df.to_excel(writer,'Sheet1')
sorted_df2.to_excel(writer,'Sheet2')
writer.save()