In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from langdetect import detect

### helper functions for speeding up computation

In [None]:
def count_word(corpus):
    result = 0
    for x in corpus:
        result += len(x.split())
    return result
def count_zero(arr):
    zero_arr = np.zeros((arr.shape[0], arr.shape[1]))
    return np.sum(arr == zero_arr)

### get data ready

 - language detection takes time, about 30 seconds on our dataset on my laptop
 - may consider not doing language detection on large dataset unless really necessary
 - may also consider running language detection after filtering by other dimensions
 - in our dataset, there are 55 non-English excerpts, we will NOT remove them in our analysis
 - this leaves our final analysis file with 26,526 excerpts in our corpus

In [None]:
data_dir = "/Users/joshwinnes/Library/Mobile Documents/com~apple~CloudDocs/Wheaton College/fall 2024/topics in data science/data/"
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(data_dir + text_file_name,sep = "\t",  quotechar='"')
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)
text_df["lang"] = text_df["text"].apply(lambda x: detect(x))
text_df = text_df.query("lang == 'en'")
text_df.reset_index(inplace=True,drop=True)
text_df.info()

### study term frequencies using scikit-learn vectorizer

In [None]:
corpus = text_df.text
count_vectorizer = CountVectorizer() # default is unigram, no stop word removal
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

In [None]:
count_word(corpus), np.sum(count_vector), 1 - np.sum(count_vector)/count_word(corpus)

notice the difference
* the the total word count from string split and sum = 2,516,370
* sum of all term counts, as counted from the sklearn tokenizer = 2,498,921
* small difference (less than 1%), sperhaps due to certain processing details such as handling special characters?

Looking at results of vectorizer

* count_vectorizer.vocabulary_ is a mapping of terms to feature indices , the key is the term in the corpus, the value is the index to the feature array 
* count_vectorizer.vocabulary_.keys() provides the list of the terms in the corpus
* count_vectorizer.get_feature_names_out() is the corresponding array of the features

* count_vectorizer.transform(corpus).toarray() provided the document-term frequency array: row for document in corpus, column for the terms in corpus

In [9]:
count_vectorizer.vocabulary_

{'from': 17350,
 'gender': 17868,
 'perspective': 31592,
 'paulgaard': 31174,
 'points': 32244,
 'out': 30369,
 'that': 41492,
 'the': 41500,
 'labour': 24037,
 'markets': 26017,
 'of': 29692,
 'fishing': 16619,
 'villages': 44452,
 'have': 19297,
 'been': 5460,
 'highly': 19701,
 'segregated': 37508,
 'in': 21032,
 'terms': 41361,
 'existence': 15645,
 'male': 25732,
 'jobs': 22951,
 'and': 3519,
 'female': 16294,
 'however': 20120,
 'new': 28703,
 'business': 6976,
 'opportunities': 30053,
 'led': 24482,
 'to': 41908,
 'population': 32438,
 'peripheral': 31506,
 'areas': 4094,
 'now': 29263,
 'working': 45522,
 'service': 37751,
 'industry': 21369,
 'former': 17047,
 'boys': 6467,
 'girls': 18162,
 'are': 4090,
 'doing': 13152,
 'same': 36854,
 'indicates': 21265,
 'change': 8042,
 'because': 5429,
 'traditional': 42209,
 'boundaries': 6428,
 'between': 5721,
 'women': 45470,
 'men': 26594,
 'work': 45507,
 'being': 5509,
 'crossed': 10731,
 'but': 6990,
 'fact': 15943,
 'young': 458

In [None]:
count_vectorizer.vocabulary_.items()

In [None]:
count_vectorizer.vocabulary_.keys()

In [None]:
count_vectorizer.get_feature_names_out()

### Stop word removal

In [None]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(1,1),stop_words='english') 
count_vectorizer.fit(corpus) 
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

Notice the difference in vocabular size between the results of stop-word removal vs. non removal. 

We can see what was used (and removed) as stop words in the documents.

In [None]:
count_vectorizer.get_stop_words()

 - take the document-term matrix (count vectorized array), make into a pandas dataframe with feature names (terms) as column names

In [None]:
corpus = text_df.text
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())

we can obtain term frequency (across entire corpus) from the document-term dataframe, by summing across rows for each term (feature, column)

In [None]:
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
term_freq.sort_values(by="freq", ascending=False)

 - take a look at a portion of the term-document matrix

In [None]:
count_vector_df.loc[100:125,term_freq.sort_values(by="freq", ascending =False)[:20].term] # take a portion

In [None]:
total_word_count = count_word(corpus) 
total_word_count, term_freq.freq.sum(), (1-term_freq.freq.sum()/total_word_count)

 - stop word removal reduced document word frequency count by 40%

### bi-grams

In [None]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english') 
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
# array count_vector is much bigger, counting zeros tends to crash the kernel, so we should skip it.
# print("proportion of non-zeros: ", 1-count_zero(count_vector)/(count_vector.shape[0]*count_vector.shape[1]))

### tri-grams

In [None]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english') 
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
# array count_vector is much bigger, counting zeros tends to crash the kernel, so we should skip it.
# print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

Notice that the vocabulary sizes increases by big proportion when we go from unigram, to bigram, to trigram

we can specify the minimum size of document frequency, min_df, so that only the terms appearing in minimum number of min_df documents in the corpus would be considered for vocabulary


### Term frequency with minimum document frequency

In [None]:
# the lass line in this cell that calls for count_vector_df.sum will not finish 
# unless min_df is set to 2 or above in the CountVectorizer call at top of this cell, make sure to specify
count_vectorizer = CountVectorizer(ngram_range = (2,2),stop_words='english', min_df=5)
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
bigram_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})

Notice the vocabulary size dropped from 881392 to 28447

How do we verify that term frequency computed in such a sequence of multiple operations (stop word removal, min_frequency, multiple aggregations) is correct? 

In [None]:
bigram_freq.query("term == 'rural areas'")

In [None]:
print("vocabulary term and frequency: ", bigram_freq.query("term == 'rural areas'")["freq"])
print("vocabulary index is: ", count_vectorizer.vocabulary_.get("rural areas"))
print("vocabulary index is: ", count_vectorizer.vocabulary_[u'rural areas'])# u'string' for unicode string 'string'
print("vocabulary index is: ", count_vectorizer.vocabulary_.get(u'rural areas') )

In [None]:
count_vectorizer.get_feature_names_out()[23033]

In [None]:
bigram_freq.iloc[23033]

In [None]:
corpus.apply(lambda x : "rural areas" in x).sum()

In [None]:
corpus.apply(lambda x : x.count("rural areas")).sum()

In [None]:
corpus.apply(lambda x : x.lower().count("rural areas")).sum()

### computing word frequency

get count vector and then followed by sum is an expensive way to get word frequency
 - may be better rely on vocabulary, which is a dictionary

In [None]:
bigram_freq.sort_values(by="freq", ascending =False)[:30]

In [None]:
count_vectorizer.vocabulary_

In [None]:
bigram_freq.query("freq==5").nunique()


### putting together as a function
 - enable stop words removal
 - enable unigram, bigram, tri-gram

In [None]:
def get_term_freq(corpus, ngram_range = (1, 1), stop_words = None, min_df = 2): # min_df has to be 2 or larger
    count_vectorizer = CountVectorizer(ngram_range = ngram_range, stop_words = stop_words, min_df =min_df)
    count_vector = count_vectorizer.fit_transform(corpus).toarray()
    count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
    term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
    return term_freq

In [None]:
corpus = text_df.text
term_freq_1_1_remove_stop = get_term_freq(corpus, stop_words='english').sort_values(by="freq", ascending=False)
term_freq_1_1_keep_stop = get_term_freq(corpus).sort_values(by="freq", ascending=False)
term_freq_2_2_remove_stop = get_term_freq(corpus, ngram_range = (2,2), stop_words = 'english').sort_values(by="freq", ascending=False)
term_freq_2_2_keep_stop = get_term_freq(corpus, ngram_range = (2,2)).sort_values(by="freq", ascending=False)

In [None]:
term_freq_2_2_keep_stop.head(50)

In [None]:
term_freq_2_2_remove_stop.head(50)

In [None]:
term_freq_2_2_keep_stop.shape, term_freq_2_2_remove_stop.shape