In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from langdetect import detect

In [2]:
data_dir = "/Users/yingli/Development/TopicsInDataScience/"
sdg_names = pd.read_excel(data_dir + "Digital Science SDG training set searches.xlsx")
sdg_names = sdg_names.drop([0,1,2], axis=0)
sdg_names = sdg_names.set_axis(["sdg", "sdg_name", "sdg_definition"],axis=1, copy=False)
sdg_names.sdg_name.tolist()

['No Poverty',
 'Zero Hunger',
 'Good Health and Well Being',
 'Quality Education',
 'Gender Equality',
 'Clean Water and Sanitation',
 'Affordable and Clean Energy',
 'Decent Work and Economic Growth',
 'Industry, Innovation and Infrastructure',
 'Reduced Inequalities',
 'Sustainable Cities and Communities',
 'Responsible Consumption and Production',
 'Climate Action',
 'Life Below Water',
 'Life on Land',
 'Peace, Justice and Strong Institutions',
 'Partnerships for the Goals']

### helper functions for speeding up computation

In [3]:
def count_word(corpus):
    result = 0
    for x in corpus:
        result += len(x.split())
    return result
def count_zero(arr):
    zero_arr = np.zeros((arr.shape[0], arr.shape[1]))
    return np.sum(arr == zero_arr)

### get data ready

 - language detection takes time, about 30 seconds on our dataset on my laptop
 - may consider not doing language detection on large dataset unless really necessary
 - may also consider running language detection after filtering by other dimensions
 - in our dataset, there are 55 non-English excerpts, we will NOT remove them in our analysis
 - this leaves our final analysis file with 26,526 excerpts in our corpus

In [4]:
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(data_dir + text_file_name,sep = "\t",  quotechar='"')
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)
#text_df["lang"] = text_df["text"].apply(lambda x: detect(x))
#text_df = text_df.query("lang == 'en'")
text_df.reset_index(inplace=True,drop=True)
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26526 entries, 0 to 26525
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text_id          26526 non-null  object 
 1   text             26526 non-null  object 
 2   sdg              26526 non-null  int64  
 3   labels_negative  26526 non-null  int64  
 4   labels_positive  26526 non-null  int64  
 5   agreement        26526 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 1.2+ MB


### study term frequencies using scikit-learn vectorizer

In [40]:
corpus = text_df.text
count_vectorizer = CountVectorizer() # default is unigram, no stop word removal
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  47208
vector shape:  (26526, 47208)
proportion of non-zeros:  0.0014101041611684906


In [39]:
count_word(corpus), np.sum(count_vector), 1 - np.sum(count_vector)/count_word(corpus)

(2516370, 2498921, 0.006934194891848211)

notice the difference
* the the total word count from string split and sum = 2,516,370
* sum of all term counts, as counted from the sklearn tokenizer = 2,498,921
* small difference (less than 1%), sperhaps due to certain processing details such as handling special characters?

Looking at results of vectorizer

* count_vectorizer.vocabulary_ is a mapping of terms to feature indices , the key is the term in the corpus, the value is the index to the feature array 
* count_vectorizer.vocabulary_.keys() provides the list of the terms in the corpus
* count_vectorizer.get_feature_names_out() is the corresponding array of the features

* count_vectorizer.transform(corpus).toarray() provided the document-term frequency array: row for document in corpus, column for the terms in corpus

In [6]:
count_vectorizer.vocabulary_

{'from': 17852,
 'gender': 18376,
 'perspective': 32400,
 'paulgaard': 31958,
 'points': 33060,
 'out': 31134,
 'that': 42563,
 'the': 42571,
 'labour': 24659,
 'markets': 26678,
 'of': 30445,
 'fishing': 17110,
 'villages': 45573,
 'have': 19823,
 'been': 5581,
 'highly': 20229,
 'segregated': 38506,
 'in': 21583,
 'terms': 42431,
 'existence': 16101,
 'male': 26385,
 'jobs': 23556,
 'and': 3577,
 'female': 16772,
 'however': 20654,
 'new': 29436,
 'business': 7115,
 'opportunities': 30809,
 'led': 25110,
 'to': 42984,
 'population': 33258,
 'peripheral': 32305,
 'areas': 4169,
 'now': 30005,
 'working': 46655,
 'service': 38760,
 'industry': 21936,
 'former': 17545,
 'boys': 6605,
 'girls': 18677,
 'are': 4165,
 'doing': 13486,
 'same': 37839,
 'indicates': 21828,
 'change': 8208,
 'because': 5550,
 'traditional': 43291,
 'boundaries': 6566,
 'between': 5850,
 'women': 46603,
 'men': 27270,
 'work': 46640,
 'being': 5631,
 'crossed': 10994,
 'but': 7129,
 'fact': 16407,
 'young': 469

In [7]:
count_vectorizer.vocabulary_.items()



In [8]:
count_vectorizer.vocabulary_.keys()



In [9]:
count_vectorizer.get_feature_names_out()

array(['00', '000', '0000002', ..., 'œopen', 'ʿadawiyya', '四个全面'],
      dtype=object)

### Stop word removal

In [10]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(1,1),stop_words='english') 
count_vectorizer.fit(corpus) 
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  46909
vector shape:  (26526, 46909)
proportion of non-zeros:  0.0009752945969250248


Notice the difference in vocabular size between the results of stop-word removal vs. non removal. 

We can see what was used (and removed) as stop words in the documents.

In [11]:
count_vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

 - take the document-term matrix (count vectorized array), make into a pandas dataframe with feature names (terms) as column names

In [12]:
corpus = text_df.text
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())

we can obtain term frequency (across entire corpus) from the document-term dataframe, by summing across rows for each term (feature, column)

In [13]:
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
term_freq.sort_values(by="freq", ascending=False)

Unnamed: 0,term,freq
countries,countries,8498
women,women,6019
development,development,5891
public,public,4820
social,social,4759
...,...,...
madrepora,madrepora,1
madisonian,madisonian,1
madin,madin,1
madhya,madhya,1


 - take a look at a portion of the term-document matrix

In [14]:
count_vector_df.loc[100:125,term_freq.sort_values(by="freq", ascending =False)[:20].term] # take a portion

Unnamed: 0,countries,women,development,public,social,health,water,education,policy,international,national,energy,law,rights,economic,oecd,use,income,new,level
100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
102,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1
104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0
105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
106,1,0,0,2,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1
107,2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
108,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
109,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [34]:
total_word_count = count_word(corpus) 
total_word_count, term_freq.freq.sum(), (1-term_freq.freq.sum()/total_word_count)

(2516370, 1445705, 0.42547995723999255)

 - stop word removal reduced document word frequency count by 40%

### bi-grams

In [16]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english') 
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
# array count_vector is much bigger, counting zeros tends to crash the kernel, so we should skip it.
# print("proportion of non-zeros: ", 1-count_zero(count_vector)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  881392
vector shape:  (26526, 881392)


### tri-grams

In [17]:
corpus = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english') 
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
# array count_vector is much bigger, counting zeros tends to crash the kernel, so we should skip it.
# print("proportion of non-zeros: ", 1-np.sum(count_vector==0)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  1296218
vector shape:  (26526, 1296218)


Notice that the vocabulary sizes increases by big proportion when we go from unigram, to bigram, to trigram

we can specify the minimum size of document frequency, min_df, so that only the terms appearing in minimum number of min_df documents in the corpus would be considered for vocabulary


### Term frequency with minimum document frequency

In [18]:
# the lass line in this cell that calls for count_vector_df.sum will not finish 
# unless min_df is set to 2 or above in the CountVectorizer call at top of this cell, make sure to specify
count_vectorizer = CountVectorizer(ngram_range = (2,2),stop_words='english', min_df=5)
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
bigram_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})

vocabulary size:  28447
vector shape:  (26526, 28447)


Notice the vocabulary size dropped from 881392 to 28447

How do we verify that term frequency computed in such a sequence of multiple operations (stop word removal, min_frequency, multiple aggregations) is correct? 

In [19]:
bigram_freq.query("term == 'rural areas'")

Unnamed: 0,term,freq
rural areas,rural areas,441


In [20]:
print("vocabulary term and frequency: ", bigram_freq.query("term == 'rural areas'")["freq"])
print("vocabulary index is: ", count_vectorizer.vocabulary_.get("rural areas"))
print("vocabulary index is: ", count_vectorizer.vocabulary_[u'rural areas'])# u'string' for unicode string 'string'
print("vocabulary index is: ", count_vectorizer.vocabulary_.get(u'rural areas') )

vocabulary term and frequency:  rural areas    441
Name: freq, dtype: int64
vocabulary index is:  23033
vocabulary index is:  23033
vocabulary index is:  23033


In [21]:
count_vectorizer.get_feature_names_out()[23033]

'rural areas'

In [22]:
bigram_freq.iloc[23033]

term    rural areas
freq            441
Name: rural areas, dtype: object

In [23]:
corpus.apply(lambda x : "rural areas" in x).sum()

375

In [24]:
corpus.apply(lambda x : x.count("rural areas")).sum()

438

In [25]:
corpus.apply(lambda x : x.lower().count("rural areas")).sum()

442

### computing word frequency

get count vector and then followed by sum is an expensive way to get word frequency
 - may be better rely on vocabulary, which is a dictionary

In [26]:
bigram_freq.sort_values(by="freq", ascending =False)[:30]

Unnamed: 0,term,freq
human rights,human rights,2003
climate change,climate change,1361
et al,et al,1253
oecd countries,oecd countries,951
developing countries,developing countries,895
health care,health care,888
united states,united states,833
long term,long term,798
international law,international law,779
labour market,labour market,761


In [27]:
count_vectorizer.vocabulary_

{'gender perspective': 10518,
 'labour markets': 14485,
 'gender segregated': 10527,
 'new business': 17433,
 'business opportunities': 3265,
 'peripheral areas': 18871,
 'boys girls': 3091,
 'women men': 28010,
 'men work': 16469,
 'young people': 28421,
 'people working': 18773,
 'young adults': 28414,
 'described earlier': 6449,
 'spend time': 24731,
 'average figure': 2505,
 'large differences': 14636,
 'parts population': 18583,
 'likely limited': 15301,
 'limited access': 15332,
 'access primary': 914,
 'primary care': 20074,
 'care addition': 3440,
 'addition poor': 1222,
 'findings consistent': 9810,
 'previous work': 20017,
 'differences wage': 6836,
 'wage inequality': 27529,
 'inequality countries': 13381,
 'fournier koske': 10204,
 'koske 2012': 14460,
 'returns education': 22638,
 'education important': 7728,
 'important role': 12500,
 'earnings inequality': 7421,
 '2014 countries': 473,
 'supply demand': 25426,
 'demand skills': 6335,
 'role played': 22982,
 'labour marke

In [28]:
bigram_freq.query("freq==5").nunique()


term    6606
freq       1
dtype: int64

### putting together as a function
 - enable stop words removal
 - enable unigram, bigram, tri-gram

In [29]:
def get_term_freq(corpus, ngram_range = (1, 1), stop_words = None, min_df = 2): # min_df has to be 2 or larger
    count_vectorizer = CountVectorizer(ngram_range = ngram_range, stop_words = stop_words, min_df =min_df)
    count_vector = count_vectorizer.fit_transform(corpus).toarray()
    count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
    term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
    return term_freq

In [30]:
corpus = text_df.text
term_freq_1_1_remove_stop = get_term_freq(corpus, stop_words='english').sort_values(by="freq", ascending=False)
term_freq_1_1_keep_stop = get_term_freq(corpus).sort_values(by="freq", ascending=False)
term_freq_2_2_remove_stop = get_term_freq(corpus, ngram_range = (2,2), stop_words = 'english').sort_values(by="freq", ascending=False)
term_freq_2_2_keep_stop = get_term_freq(corpus, ngram_range = (2,2)).sort_values(by="freq", ascending=False)

In [31]:
term_freq_2_2_keep_stop.head(50)

Unnamed: 0,term,freq
of the,of the,19579
in the,in the,16052
to the,to the,7296
and the,and the,7051
on the,on the,5477
for the,for the,4178
to be,to be,3762
such as,such as,3410
by the,by the,3404
with the,with the,3041


In [32]:
term_freq_2_2_remove_stop.head(50)

Unnamed: 0,term,freq
human rights,human rights,2003
climate change,climate change,1361
et al,et al,1253
oecd countries,oecd countries,951
developing countries,developing countries,895
health care,health care,888
united states,united states,833
long term,long term,798
international law,international law,779
labour market,labour market,761


In [33]:
term_freq_2_2_keep_stop.shape, term_freq_2_2_remove_stop.shape

((205848, 2), (160975, 2))