In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from langdetect import detect

### helper functions for speeding up computation

In [2]:
def count_word(corpus):
    result = 0
    for x in corpus:
        result += len(x.split())
    return result
def count_zero(arr):
    zero_arr = np.zeros((arr.shape[0], arr.shape[1]))
    return np.sum(arr == zero_arr)

### get data ready

 - language detection takes time, about 30 seconds on our dataset on my laptop
 - may consider not doing language detection on large dataset unless really necessary
 - may also consider running language detection after filtering by other dimensions
 - in our dataset, there are 55 non-English excerpts, we will NOT remove them in our analysis
 - this leaves our final analysis file with 26,526 excerpts in our corpus

In [3]:
data_dir = "/Users/joshwinnes/Library/Mobile Documents/com~apple~CloudDocs/Wheaton College/fall 2024/topics in data science/data/"
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(data_dir + text_file_name,sep = "\t",  quotechar='"')
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)
text_df["lang"] = text_df["text"].apply(lambda x: detect(x))
text_df = text_df.query("lang == 'en'")
text_df.reset_index(inplace=True,drop=True)
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26471 entries, 0 to 26470
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text_id          26471 non-null  object 
 1   text             26471 non-null  object 
 2   sdg              26471 non-null  int64  
 3   labels_negative  26471 non-null  int64  
 4   labels_positive  26471 non-null  int64  
 5   agreement        26471 non-null  float64
 6   lang             26471 non-null  object 
dtypes: float64(1), int64(3), object(3)
memory usage: 1.4+ MB


### study term frequencies using scikit-learn vectorizer

In [4]:
corpus = text_df.text
count_vectorizer = CountVectorizer() # default is unigram, no stop word removal
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  46122
vector shape:  (26471, 46122)
proportion of non-zeros:  0.0014430694968051716


In [5]:
count_word(corpus), np.sum(count_vector), 1 - np.sum(count_vector)/count_word(corpus)

(2510738, 2493380, 0.00691350511283928)

notice the difference
* the the total word count from string split and sum = 2,516,370
* sum of all term counts, as counted from the sklearn tokenizer = 2,498,921
* small difference (less than 1%), sperhaps due to certain processing details such as handling special characters?

Looking at results of vectorizer

* count_vectorizer.vocabulary_ is a mapping of terms to feature indices , the key is the term in the corpus, the value is the index to the feature array 
* count_vectorizer.vocabulary_.keys() provides the list of the terms in the corpus
* count_vectorizer.get_feature_names_out() is the corresponding array of the features

* count_vectorizer.transform(corpus).toarray() provided the document-term frequency array: row for document in corpus, column for the terms in corpus

In [6]:
count_vectorizer.vocabulary_

{'from': 17379,
 'gender': 17898,
 'perspective': 31637,
 'paulgaard': 31218,
 'points': 32290,
 'out': 30410,
 'that': 41548,
 'the': 41556,
 'labour': 24073,
 'markets': 26056,
 'of': 29732,
 'fishing': 16648,
 'villages': 44507,
 'have': 19327,
 'been': 5464,
 'highly': 19731,
 'segregated': 37565,
 'in': 21062,
 'terms': 41417,
 'existence': 15673,
 'male': 25769,
 'jobs': 22988,
 'and': 3522,
 'female': 16323,
 'however': 20150,
 'new': 28743,
 'business': 6981,
 'opportunities': 30093,
 'led': 24518,
 'to': 41964,
 'population': 32484,
 'peripheral': 31550,
 'areas': 4097,
 'now': 29303,
 'working': 45577,
 'service': 37809,
 'industry': 21403,
 'former': 17076,
 'boys': 6471,
 'girls': 18192,
 'are': 4093,
 'doing': 13175,
 'same': 36911,
 'indicates': 21299,
 'change': 8049,
 'because': 5433,
 'traditional': 42264,
 'boundaries': 6432,
 'between': 5725,
 'women': 45525,
 'men': 26634,
 'work': 45562,
 'being': 5513,
 'crossed': 10748,
 'but': 6995,
 'fact': 15971,
 'young': 458

In [7]:
count_vectorizer.vocabulary_.items()



In [8]:
count_vectorizer.vocabulary_.keys()



In [9]:
count_vectorizer.get_feature_names_out()

array(['00', '000', '0000002', ..., 'œopen', 'ʿadawiyya', '四个全面'],
      dtype=object)

### Stop word removal

In [10]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(1,1),stop_words='english') 
count_vectorizer.fit(corpus) 
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  45823
vector shape:  (26471, 45823)
proportion of non-zeros:  0.0009974833621272383


Notice the difference in vocabular size between the results of stop-word removal vs. non removal. 

We can see what was used (and removed) as stop words in the documents.

In [11]:
count_vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

 - take the document-term matrix (count vectorized array), make into a pandas dataframe with feature names (terms) as column names

In [12]:
corpus = text_df.text
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())

we can obtain term frequency (across entire corpus) from the document-term dataframe, by summing across rows for each term (feature, column)

In [13]:
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
term_freq.sort_values(by="freq", ascending=False)

Unnamed: 0,term,freq
countries,countries,8493
women,women,6017
development,development,5889
public,public,4819
social,social,4754
...,...,...
escarpment,escarpment,1
escherichia,escherichia,1
painelbio,painelbio,1
paim,paim,1


 - take a look at a portion of the term-document matrix

In [14]:
count_vector_df.loc[100:125,term_freq.sort_values(by="freq", ascending =False)[:20].term] # take a portion

Unnamed: 0,countries,women,development,public,social,health,water,education,policy,international,national,energy,law,rights,economic,oecd,use,income,new,level
100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
102,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1
104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0
105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
106,1,0,0,2,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1
107,2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
108,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
109,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [15]:
total_word_count = count_word(corpus) 
total_word_count, term_freq.freq.sum(), (1-term_freq.freq.sum()/total_word_count)

(2510738, 1440843, 0.42612769631877156)

 - stop word removal reduced document word frequency count by 40%

### bi-grams

In [16]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english') 
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
# array count_vector is much bigger, counting zeros tends to crash the kernel, so we should skip it.
# print("proportion of non-zeros: ", 1-count_zero(count_vector)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  877716
vector shape:  (26471, 877716)


### tri-grams

Notice that the vocabulary sizes increases by big proportion when we go from unigram, to bigram, to trigram

we can specify the minimum size of document frequency, min_df, so that only the terms appearing in minimum number of min_df documents in the corpus would be considered for vocabulary


### Term frequency with minimum document frequency

In [17]:
# the lass line in this cell that calls for count_vector_df.sum will not finish 
# unless min_df is set to 2 or above in the CountVectorizer call at top of this cell, make sure to specify
count_vectorizer = CountVectorizer(ngram_range = (2,2),stop_words='english', min_df=5)
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
bigram_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})

vocabulary size:  28393
vector shape:  (26471, 28393)


Notice the vocabulary size dropped from 881392 to 28447

How do we verify that term frequency computed in such a sequence of multiple operations (stop word removal, min_frequency, multiple aggregations) is correct? 

In [18]:
bigram_freq.query("term == 'rural areas'")

Unnamed: 0,term,freq
rural areas,rural areas,441


In [19]:
print("vocabulary term and frequency: ", bigram_freq.query("term == 'rural areas'")["freq"])
print("vocabulary index is: ", count_vectorizer.vocabulary_.get("rural areas"))
print("vocabulary index is: ", count_vectorizer.vocabulary_[u'rural areas'])# u'string' for unicode string 'string'
print("vocabulary index is: ", count_vectorizer.vocabulary_.get(u'rural areas') )

vocabulary term and frequency:  rural areas    441
Name: freq, dtype: int64
vocabulary index is:  22984
vocabulary index is:  22984
vocabulary index is:  22984


In [20]:
count_vectorizer.get_feature_names_out()[23033]

'safety issues'

In [21]:
bigram_freq.iloc[23033]

term    safety issues
freq                5
Name: safety issues, dtype: object

In [22]:
corpus.apply(lambda x : "rural areas" in x).sum()

375

In [23]:
corpus.apply(lambda x : x.count("rural areas")).sum()

438

In [24]:
corpus.apply(lambda x : x.lower().count("rural areas")).sum()

442

### computing word frequency

get count vector and then followed by sum is an expensive way to get word frequency
 - may be better rely on vocabulary, which is a dictionary

In [25]:
bigram_freq.sort_values(by="freq", ascending =False)[:30]

Unnamed: 0,term,freq
human rights,human rights,2002
climate change,climate change,1360
et al,et al,1253
oecd countries,oecd countries,949
developing countries,developing countries,895
health care,health care,888
united states,united states,833
long term,long term,798
international law,international law,779
labour market,labour market,760


In [26]:
count_vectorizer.vocabulary_

{'gender perspective': 10495,
 'labour markets': 14457,
 'gender segregated': 10504,
 'new business': 17396,
 'business opportunities': 3262,
 'peripheral areas': 18829,
 'boys girls': 3088,
 'women men': 27956,
 'men work': 16433,
 'young people': 28367,
 'people working': 18731,
 'young adults': 28360,
 'described earlier': 6436,
 'spend time': 24682,
 'average figure': 2503,
 'large differences': 14608,
 'parts population': 18542,
 'likely limited': 15266,
 'limited access': 15297,
 'access primary': 912,
 'primary care': 20029,
 'care addition': 3437,
 'addition poor': 1220,
 'findings consistent': 9787,
 'previous work': 19972,
 'differences wage': 6823,
 'wage inequality': 27475,
 'inequality countries': 13354,
 'fournier koske': 10181,
 'koske 2012': 14433,
 'returns education': 22589,
 'education important': 7714,
 'important role': 12474,
 'earnings inequality': 7407,
 '2014 countries': 471,
 'supply demand': 25376,
 'demand skills': 6324,
 'role played': 22933,
 'labour marke

In [27]:
bigram_freq.query("freq==5").nunique()


term    6595
freq       1
dtype: int64

### putting together as a function
 - enable stop words removal
 - enable unigram, bigram, tri-gram

In [28]:
def get_term_freq(corpus, ngram_range = (1, 1), stop_words = None, min_df = 5): # min_df has to be 2 or larger
    count_vectorizer = CountVectorizer(ngram_range = ngram_range, stop_words = stop_words, min_df =min_df)
    count_vector = count_vectorizer.fit_transform(corpus).toarray()
    count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
    term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
    return term_freq

In [29]:
corpus = text_df.text
term_freq_1_1_remove_stop = get_term_freq(corpus, stop_words='english').sort_values(by="freq", ascending=False)
term_freq_1_1_keep_stop = get_term_freq(corpus).sort_values(by="freq", ascending=False)
term_freq_2_2_remove_stop = get_term_freq(corpus, ngram_range = (2,2), stop_words = 'english').sort_values(by="freq", ascending=False)
term_freq_2_2_keep_stop = get_term_freq(corpus, ngram_range = (2,2)).sort_values(by="freq", ascending=False)

In [30]:
term_freq_2_2_keep_stop.head(50)

Unnamed: 0,term,freq
of the,of the,19574
in the,in the,16048
to the,to the,7295
and the,and the,7048
on the,on the,5475
for the,for the,4176
to be,to be,3762
such as,such as,3409
by the,by the,3404
with the,with the,3040


In [31]:
term_freq_2_2_remove_stop.head(50)

Unnamed: 0,term,freq
human rights,human rights,2002
climate change,climate change,1360
et al,et al,1253
oecd countries,oecd countries,949
developing countries,developing countries,895
health care,health care,888
united states,united states,833
long term,long term,798
international law,international law,779
labour market,labour market,760


In [32]:
term_freq_2_2_keep_stop.shape, term_freq_2_2_remove_stop.shape

((66411, 2), (28393, 2))