To run this notebook, SentenceTransformer needs to be installed.

In [1]:
#!pip install SentenceTransformer

## Import Libraries

In [2]:
'''basics'''
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('../..', 'src')))
sys.setrecursionlimit(20500)
import vectorize_embed as em
import pandas as pd
#import pickle5 as pickle
import pickle
import numpy as np

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB


'''Metrics/Evaluation'''
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled
from sklearn.pipeline import Pipeline


import warnings
warnings.filterwarnings('ignore')

import joblib



## Import data

In [3]:
# Read the processed CSV for the "social inclusion & engagement" label group
# and show which columns it provides.
df = pd.read_csv(
    '../../data/processed/encoded_labels/_social_inclusion___engagement.csv'
)
df.columns

Index(['PIMS_ID', 'all_text_clean', 'all_text_clean_spacy',
       '_social_inclusion___engagement', 'artisanal_miners', 'disabled',
       'elderly', 'indigenous_peoples', 'local_community_csos', 'no tag',
       'private_sector', 'smallholder_farmers', 'waste_picker', 'women',
       'youth_children'],
      dtype='object')

# Compare the performance of different embeddings

In [4]:
# Names of the label columns, one per target group.
categories = [
    'artisanal_miners',
    'disabled',
    'elderly',
    'indigenous_peoples',
    'local_community_csos',
    'no tag',
    'private_sector',
    'smallholder_farmers',
    'waste_picker',
    'women',
    'youth_children',
]

# Model input: the cleaned text of every row, forced to str.
X = df['all_text_clean'].astype('str').tolist()

# Targets: a frame restricted to the label columns listed above.
y = pd.DataFrame(df, columns=categories)


## Train multiple Embeddings with SGD / OneVsRest multi-label strategy

In [None]:
# Embeddings to compare: display name -> model identifier string.
embedding_dict = {
    'Glove': 'average_word_embeddings_glove.6B.300d',
    'Distilbert': 'distilbert-base-nli-mean-tokens',
    'Roberta': 'roberta-base-nli-stsb-mean-tokens',
    'Bert': 'bert-base-nli-stsb-mean-tokens',
}

# Linear classifier trained with SGD (log loss, L1 penalty), wrapped in a
# one-vs-rest meta-estimator.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
# confirm against the installed version before upgrading.
sgd_classifier = SGDClassifier(
    loss='log',
    penalty='l1',
    alpha=1e-06,
    max_iter=1000,
    tol=0.001,
    random_state=3,
)
model = OneVsRestClassifier(sgd_classifier)

# Hold out 30% of the rows for evaluation. The split is shuffled with a fixed
# seed; no `stratify` argument is passed, so it is NOT stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, shuffle=True, random_state=3
)

# Remove every training label column that contains a missing value.
y_train = y_train.dropna(axis=1)

#Function to get the scores for each model in a df
def model_score_df(embedding_dict, X_train, X_test, y_train, y_test, category):
    """Fit the shared classifier once per embedding and tabulate the scores.

    For every entry of ``embedding_dict`` the module-level ``model`` is fitted
    on ``em.get_embeddings(<identifier>, X_train)``, dumped to disk with
    joblib, and evaluated on the embedded ``X_test``.

    Parameters
    ----------
    embedding_dict : dict
        Maps a display name to the identifier handed to ``em.get_embeddings``.
    X_train, X_test
        Inputs passed as-is to ``em.get_embeddings``.
    y_train, y_test
        Targets used for fitting and for scoring respectively.
    category : str
        Used only to build the file name of the saved model.

    Returns
    -------
    pandas.DataFrame
        One row per embedding with the columns ``embedding_name``,
        ``accuracy_score``, ``precision_score``, ``recall_score`` and
        ``f1_score`` (precision/recall/f1 are macro-averaged), sorted by
        ``f1_score`` in descending order. An empty ``embedding_dict`` yields
        an empty frame with the same columns.

    Notes
    -----
    Relies on the globals ``model``, ``em`` and ``joblib``. Each fitted model
    is written to ``'../<category>_<name>model.sav'``.
    """
    embedding_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k, v in embedding_dict.items():
        print ('processing :' + str(k))
        embedding_name.append(k)
        model.fit(em.get_embeddings(v, X_train), y_train)

        # save the model to disk
        filename = '../'+category+'_'+k+'model.sav'
        joblib.dump(model, filename)

        y_pred = model.predict(em.get_embeddings(v, X_test))
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))

    # Build the table once, after the loop: this avoids re-creating and
    # re-sorting it on every iteration and keeps the result defined even when
    # no embedding was processed.
    model_comparison_df = pd.DataFrame({
        'embedding_name': embedding_name,
        'accuracy_score': ac_score_list,
        'precision_score': p_score_list,
        'recall_score': r_score_list,
        'f1_score': f1_score_list,
    })
    # A stable sort keeps embeddings with equal f1 in their insertion order.
    return model_comparison_df.sort_values(by='f1_score', ascending=False, kind='mergesort')

      
# One summary row per category. Every cell stacks the per-embedding values
# (in the order returned by model_score_df) separated by newlines, so that
# the table built from `lis` shows them on top of each other.
lis = []
for category in categories:
    dff = model_score_df(
        embedding_dict,
        X_train,
        X_test,
        y_train[category],
        y_test[category],
        category,
    )
    dic = {'Category': category}
    # (output key, source column, separator) -- separators are kept exactly
    # as they were, including the shorter one used for precision_score.
    for key, column, sep in (
        ('Classifiers', 'embedding_name', '    \n '),
        ('accuracy_score', 'accuracy_score', '    \n '),
        ('precision_score', 'precision_score', '   \n '),
        ('recall_score', 'recall_score', '    \n '),
        ('f1_score', 'f1_score', '    \n '),
    ):
        dic[key] = sep.join(dff[column].apply(str).tolist())
    lis.append(dic)
    
#print (tabulate(rows, header, tablefmt='html'))


processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert


In [8]:
from tabulate import tabulate

# Column titles come from the first summary row; every row contributes its
# values in the same key order.
header = lis[0].keys()
rows = [summary.values() for summary in lis]

# Emit the table as raw HTML text so it can be copied into a markdown cell.
html_table = tabulate(rows, header, tablefmt='html')
print(html_table)

<table>
<thead>
<tr><th>Category            </th><th>Classifiers  </th><th>accuracy_score  </th><th>precision_score  </th><th>recall_score  </th><th>f1_score  </th></tr>
</thead>
<tbody>
<tr><td>artisanal_miners    </td><td>Roberta    
 Distilbert    
 Glove    
 Bert              </td><td>0.9779005524861878    
 0.9613259668508287    
 0.9558011049723757    
 0.9558011049723757                 </td><td>0.9888268156424581   
 0.6526217228464419   
 0.6108757062146892   
 0.6108757062146892                  </td><td>0.6666666666666666    
 0.5776190476190476    
 0.5747619047619048    
 0.5747619047619048               </td><td>0.7443502824858756    
 0.6011960969468051    
 0.5886363636363636    
 0.5886363636363636           </td></tr>
<tr><td>disabled            </td><td>Glove    
 Distilbert    
 Bert    
 Roberta              </td><td>0.9834254143646409    
 0.9779005524861878    
 0.9723756906077348    
 0.9502762430939227                 </td><td>0.744413407821229   
 0.661048689

### Copy-paste the output of the last cell here


In [None]:
<table>
<thead>
<tr><th>Category            </th><th>Classifiers  </th><th>accuracy_score  </th><th>precision_score  </th><th>recall_score  </th><th>f1_score  </th></tr>
</thead>
<tbody>
<tr><td>artisanal_miners    </td><td>Roberta    
 Distilbert    
 Glove    
 Bert              </td><td>0.9779005524861878    
 0.9613259668508287    
 0.9558011049723757    
 0.9558011049723757                 </td><td>0.9888268156424581   
 0.6526217228464419   
 0.6108757062146892   
 0.6108757062146892                  </td><td>0.6666666666666666    
 0.5776190476190476    
 0.5747619047619048    
 0.5747619047619048               </td><td>0.7443502824858756    
 0.6011960969468051    
 0.5886363636363636    
 0.5886363636363636           </td></tr>
<tr><td>disabled            </td><td>Glove    
 Distilbert    
 Bert    
 Roberta              </td><td>0.9834254143646409    
 0.9779005524861878    
 0.9723756906077348    
 0.9502762430939227                 </td><td>0.744413407821229   
 0.6610486891385767   
 0.6193502824858756   
 0.5567196531791907                  </td><td>0.6638576779026217    
 0.6610486891385767    
 0.6582397003745318    
 0.647003745318352               </td><td>0.6957983193277311    
 0.6610486891385767    
 0.6358148893360162    
 0.5780885780885782           </td></tr>
<tr><td>elderly             </td><td>Distilbert    
 Glove    
 Roberta    
 Bert              </td><td>0.994475138121547    
 0.9834254143646409    
 0.9834254143646409    
 0.9834254143646409                 </td><td>0.4972375690607735   
 0.4972067039106145   
 0.4972067039106145   
 0.4972067039106145                  </td><td>0.5    
 0.49444444444444446    
 0.49444444444444446    
 0.49444444444444446               </td><td>0.4986149584487534    
 0.4958217270194986    
 0.4958217270194986    
 0.4958217270194986           </td></tr>
<tr><td>indigenous_peoples  </td><td>Distilbert    
 Glove    
 Roberta    
 Bert              </td><td>0.861878453038674    
 0.9005524861878453    
 0.861878453038674    
 0.8950276243093923                 </td><td>0.6439393939393939   
 0.6757889546351085   
 0.6116071428571428   
 0.646524064171123                  </td><td>0.7267045454545454    
 0.6350378787878788    
 0.6420454545454546    
 0.6037878787878788               </td><td>0.6701173726033389    
 0.6516253207869975    
 0.6237006237006236    
 0.6197899391929242           </td></tr>
<tr><td>local_community_csos</td><td>Distilbert    
 Glove    
 Roberta    
 Bert              </td><td>0.6519337016574586    
 0.6243093922651933    
 0.5911602209944752    
 0.56353591160221                 </td><td>0.6530321782178218   
 0.634180790960452   
 0.6080740117746005   
 0.5636046801095345                  </td><td>0.6510136785539815    
 0.6218246213971665    
 0.594162188568637    
 0.5624084025403029               </td><td>0.6503970322224607    
 0.6144110275689223    
 0.5788050314465409    
 0.5609089513281129           </td></tr>
<tr><td>no tag              </td><td>Glove    
 Bert    
 Distilbert    
 Roberta              </td><td>0.8232044198895028    
 0.7071823204419889    
 0.6740331491712708    
 0.7071823204419889                 </td><td>0.7618583495776479   
 0.582863304578633   
 0.5646988670244484   
 0.5634891424365108                  </td><td>0.6455399061032864    
 0.5901950162513543    
 0.5783676417479234    
 0.562296858071506               </td><td>0.6715063520871143    
 0.5857408127132184    
 0.5669680872632902    
 0.562861699703805           </td></tr>
<tr><td>private_sector      </td><td>Bert    
 Glove    
 Roberta    
 Distilbert              </td><td>0.6519337016574586    
 0.6298342541436464    
 0.6298342541436464    
 0.6243093922651933                 </td><td>0.6463892288861689   
 0.6828087167070218   
 0.6197339246119734   
 0.6052765093860984                  </td><td>0.6547619047619048    
 0.6758540372670807    
 0.6257763975155279    
 0.6074016563146998               </td><td>0.6446002805049088    
 0.6294270435446906    
 0.6193779228523901    
 0.6060179257362357           </td></tr>
<tr><td>smallholder_farmers </td><td>Bert    
 Distilbert    
 Glove    
 Roberta              </td><td>0.8950276243093923    
 0.8674033149171271    
 0.9171270718232044    
 0.9060773480662984                 </td><td>0.6595695970695971   
 0.618944099378882   
 0.7630681818181818   
 0.6602272727272727                  </td><td>0.6320075757575758    
 0.6450757575757575    
 0.5876893939393939    
 0.553409090909091               </td><td>0.6438852645749198    
 0.6298568507157465    
 0.6208630079597821    
 0.5703114090210863           </td></tr>
<tr><td>waste_picker        </td><td>Roberta    
 Bert    
 Glove    
 Distilbert              </td><td>0.994475138121547    
 0.994475138121547    
 0.988950276243094    
 0.988950276243094                 </td><td>0.4972375690607735   
 0.4972375690607735   
 0.49722222222222223   
 0.49722222222222223                  </td><td>0.5    
 0.5    
 0.49722222222222223    
 0.49722222222222223               </td><td>0.4986149584487534    
 0.4986149584487534    
 0.49722222222222223    
 0.49722222222222223           </td></tr>
<tr><td>women               </td><td>Glove    
 Distilbert    
 Bert    
 Roberta              </td><td>0.6574585635359116    
 0.6022099447513812    
 0.6077348066298343    
 0.585635359116022                 </td><td>0.6263115415657788   
 0.5883950617283951   
 0.5727163461538461   
 0.553943258719417                  </td><td>0.6245358090185676    
 0.5949602122015916    
 0.5722148541114058    
 0.5549734748010611               </td><td>0.6253338675213675    
 0.5867579908675798    
 0.5724456865289285    
 0.5542896542666711           </td></tr>
<tr><td>youth_children      </td><td>Glove    
 Roberta    
 Bert    
 Distilbert              </td><td>0.9171270718232044    
 0.9005524861878453    
 0.9005524861878453    
 0.9005524861878453                 </td><td>0.7458010335917313   
 0.6757889546351085   
 0.6499277456647399   
 0.6266666666666667                  </td><td>0.6441287878787878    
 0.6350378787878788    
 0.5785984848484849    
 0.5503787878787878               </td><td>0.6777448071216616    
 0.6516253207869975    
 0.5983727810650887    
 0.5644385026737968           </td></tr>
</tbody>
</table>