To run this notebook, SentenceTransformer needs to be installed.

In [1]:
#!pip install SentenceTransformer

## Import Libraries

In [2]:
'''basics'''
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('../..', 'src')))
sys.setrecursionlimit(20500)
import vectorize_embed as em
import pandas as pd
#import pickle5 as pickle
import pickle
import numpy as np

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB


'''Metrics/Evaluation'''
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled
from sklearn.pipeline import Pipeline


import warnings
warnings.filterwarnings('ignore')

import joblib
from tabulate import tabulate



## Import data

In [3]:
# Read the JSON file into a DataFrame and show which columns it provides.
df = pd.read_json(
    '../../data/processed/encoded_labels/main_strategies_all.json'
)
df.columns

Index(['PIMS_ID', 'all_text_clean', 'all_text_clean_spacy',
       'capacity_building', 'enabling', 'finance_economy',
       'food_and_agricultural_commodities', 'governance', 'law_regulation',
       'management_operation', 'mitigation_adaptation', 'monitor_inventory',
       'technology_innovation', 'category_1', 'category_2', 'category_3',
       'labels'],
      dtype='object')

# Compare the performance of different embeddings

In [4]:
# Label columns of df that are used as prediction targets.
categories = [
    'capacity_building',
    'enabling',
    'finance_economy',
    'food_and_agricultural_commodities',
    'governance',
    'law_regulation',
    'management_operation',
    'mitigation_adaptation',
    'monitor_inventory',
    'technology_innovation',
]

# Target frame: the category columns of df, in the order listed above.
y = pd.DataFrame(df, columns=categories)

# Input texts: the cleaned text column, cast to strings and turned into a list.
X = df['all_text_clean'].astype(str).tolist()


## Train multiple Embeddings with SGD / OneVsRest multi-label strategy

In [5]:
# Embeddings to compare: display name -> identifier that is later passed to
# em.get_embeddings.
embedding_dict = dict(
    Glove='average_word_embeddings_glove.6B.300d',
    Distilbert='distilbert-base-nli-mean-tokens',
    Roberta='roberta-base-nli-stsb-mean-tokens',
    Bert='bert-base-nli-stsb-mean-tokens',
)

# Base learner: linear model trained with SGD on the logistic loss, L1 penalty.
# NOTE(review): scikit-learn renamed loss='log' to loss='log_loss' (deprecated
# in 1.1, removed in 1.3) -- confirm the installed version before re-running.
sgd_classifier = SGDClassifier(
    loss='log',
    penalty='l1',
    alpha=1e-06,
    max_iter=1000,
    tol=0.001,
    random_state=3,
)
# One-vs-rest wrapper around the SGD learner.
model = OneVsRestClassifier(sgd_classifier)

# Shuffled 70/30 hold-out split with a fixed seed. No `stratify=` argument is
# passed, so the split is NOT stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=3,
)
# Remove every target column of the training labels that contains a NaN.
# NOTE(review): y_test is not filtered the same way, and a dropped column
# would make the later y_train[category] lookup fail -- verify none is dropped.
y_train = y_train.dropna(axis=1)

#Function to get the scores for each model in a df
def model_score_df(embedding_dict, X_train, X_test, y_train, y_test, category):
    """Fit, save and score the shared classifier once per embedding.

    For every entry of ``embedding_dict`` the module-level ``model`` is refit on
    ``em.get_embeddings(identifier, X_train)``, dumped with joblib to
    ``'../<category>_<name>model.sav'`` and then used to predict on
    ``em.get_embeddings(identifier, X_test)``.

    Parameters
    ----------
    embedding_dict : dict
        Maps a display name to the identifier passed as first argument to
        ``em.get_embeddings``.
    X_train, X_test
        Inputs forwarded unchanged as second argument to ``em.get_embeddings``
        (presumably lists of texts -- confirm against ``vectorize_embed``).
    y_train, y_test
        Targets used for fitting and for computing the metrics respectively.
    category : str
        Only used to build the file name of the saved model.

    Returns
    -------
    pandas.DataFrame
        One row per embedding with the columns ``embedding_name``,
        ``accuracy_score``, ``precision_score``, ``recall_score`` and
        ``f1_score`` (precision/recall/F1 are macro-averaged), sorted by
        ``f1_score`` in descending order. An empty frame with the same columns
        is returned when ``embedding_dict`` is empty.
    """
    score_columns = ['embedding_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
    embedding_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []

    for k, v in embedding_dict.items():
        embedding_name.append(k)
        # `model` is the shared module-level estimator; each fit replaces the
        # previous one, so it is persisted right after fitting.
        model.fit(em.get_embeddings(v, X_train), y_train)

        # save the model to disk
        filename = '../'+category+'_'+k+'model.sav'
        joblib.dump(model, filename)

        y_pred = model.predict(em.get_embeddings(v, X_test))
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))

    # Nothing was evaluated: return an empty table instead of failing on an
    # unassigned result variable.
    if not embedding_name:
        return pd.DataFrame(columns=score_columns)

    # Build the comparison table once, after all embeddings have been scored
    # (previously it was rebuilt and re-sorted on every loop iteration).
    model_comparison_df = pd.DataFrame([embedding_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
    model_comparison_df.columns = score_columns
    return model_comparison_df.sort_values(by='f1_score', ascending=False)

      
# Score every embedding on each category separately and keep one summary
# record per category. Each cell holds the per-embedding values joined into a
# single multi-line string, in the row order returned by model_score_df.
lis = []
for category in categories:
    dff = model_score_df(embedding_dict, X_train, X_test, y_train[category], y_test[category], category)
    dic = {
        'Category': category,
        'Classifiers': '    \n '.join(map(str, dff['embedding_name'])),
        'accuracy_score': '    \n '.join(map(str, dff['accuracy_score'])),
        'precision_score': '   \n '.join(map(str, dff['precision_score'])),
        'recall_score': '    \n '.join(map(str, dff['recall_score'])),
        'f1_score': '    \n '.join(map(str, dff['f1_score'])),
    }
    lis.append(dic)
    



In [7]:
from tabulate import tabulate

# Print the per-category summaries as an HTML table: the column titles are the
# keys of the first record and every record contributes one row.
header = lis[0].keys()
rows = [record.values() for record in lis]
print(tabulate(rows, header, tablefmt='html'))

<table>
<thead>
<tr><th>Category                         </th><th>Classifiers  </th><th>accuracy_score  </th><th>precision_score  </th><th>recall_score  </th><th>f1_score  </th></tr>
</thead>
<tbody>
<tr><td>capacity_building                </td><td>Glove    
 Distilbert    
 Bert    
 Roberta              </td><td>0.7333333333333333    
 0.6611111111111111    
 0.6166666666666667    
 0.6166666666666667                 </td><td>0.7292307692307692   
 0.6497681413710992   
 0.6009523809523809   
 0.5889480319372461                  </td><td>0.6935064935064934    
 0.6551948051948051    
 0.6032467532467533    
 0.5824675324675325               </td><td>0.7    
 0.6507522503896435    
 0.6016038492381716    
 0.5832074901842343           </td></tr>
<tr><td>enabling                         </td><td>Glove    
 Roberta    
 Distilbert    
 Bert              </td><td>0.85    
 0.8555555555555555    
 0.7888888888888889    
 0.7888888888888889                 </td><td>0.7224612736660929   
 

### Copy-paste the output of the last cell here


In [None]:
<table>
<thead>
<tr><th>Category                         </th><th>Classifiers  </th><th>accuracy_score  </th><th>precision_score  </th><th>recall_score  </th><th>f1_score  </th></tr>
</thead>
<tbody>
<tr><td>capacity_building                </td><td>Glove    
 Distilbert    
 Bert    
 Roberta              </td><td>0.7333333333333333    
 0.6611111111111111    
 0.6166666666666667    
 0.6166666666666667                 </td><td>0.7292307692307692   
 0.6497681413710992   
 0.6009523809523809   
 0.5889480319372461                  </td><td>0.6935064935064934    
 0.6551948051948051    
 0.6032467532467533    
 0.5824675324675325               </td><td>0.7    
 0.6507522503896435    
 0.6016038492381716    
 0.5832074901842343           </td></tr>
<tr><td>enabling                         </td><td>Glove    
 Roberta    
 Distilbert    
 Bert              </td><td>0.85    
 0.8555555555555555    
 0.7888888888888889    
 0.7888888888888889                 </td><td>0.7224612736660929   
 0.7660818713450293   
 0.5821102187759624   
 0.5821102187759624                  </td><td>0.6180634848138844    
 0.5935145010276319    
 0.5677095227220826    
 0.5677095227220826               </td><td>0.6434597608392635    
 0.6175220660346519    
 0.5729270729270729    
 0.5729270729270729           </td></tr>
<tr><td>finance_economy                  </td><td>Roberta    
 Distilbert    
 Bert    
 Glove              </td><td>0.7333333333333333    
 0.7611111111111111    
 0.7222222222222222    
 0.5277777777777778                 </td><td>0.6438923395445135   
 0.678796046720575   
 0.6066666666666667   
 0.5965526350548144                  </td><td>0.6353017521090201    
 0.5968526930564568    
 0.5778715120051914    
 0.6185918234912394               </td><td>0.6390374331550802    
 0.6057256380214966    
 0.5830244625648628    
 0.5212616164460715           </td></tr>
<tr><td>food_and_agricultural_commodities</td><td>Roberta    
 Bert    
 Distilbert    
 Glove              </td><td>0.8166666666666667    
 0.8333333333333334    
 0.8166666666666667    
 0.5555555555555556                 </td><td>0.65   
 0.6576819407008087   
 0.6298076923076923   
 0.5914702581369248                  </td><td>0.6633986928104575    
 0.6274509803921569    
 0.6176470588235294    
 0.6775599128540305               </td><td>0.6560708702449193    
 0.6394230769230769    
 0.623072529982867    
 0.5115995115995116           </td></tr>
<tr><td>governance                       </td><td>Bert    
 Roberta    
 Distilbert    
 Glove              </td><td>0.6777777777777778    
 0.6833333333333333    
 0.6388888888888888    
 0.4777777777777778                 </td><td>0.6741712023750619   
 0.6726133076181293   
 0.6327785368277232   
 0.592948717948718                  </td><td>0.6795002549719531    
 0.6597399286078531    
 0.6362825089240183    
 0.544365119836818               </td><td>0.6737500000000001    
 0.6622222222222223    
 0.6328951084057606    
 0.4341137123745819           </td></tr>
<tr><td>law_regulation                   </td><td>Bert    
 Distilbert    
 Roberta    
 Glove              </td><td>0.6944444444444444    
 0.6111111111111112    
 0.5944444444444444    
 0.5055555555555555                 </td><td>0.6842105263157895   
 0.595425765602791   
 0.5615942028985508   
 0.604955680902498                  </td><td>0.6864037895275893    
 0.5945461528613494    
 0.5457047753168608    
 0.5667008065548585               </td><td>0.6851044880562359    
 0.5949074074074074    
 0.5336291038154392    
 0.4812008160886039           </td></tr>
<tr><td>management_operation             </td><td>Glove    
 Roberta    
 Bert    
 Distilbert              </td><td>0.8222222222222222    
 0.7833333333333333    
 0.7611111111111111    
 0.6944444444444444                 </td><td>0.8229533881707795   
 0.7850790513833992   
 0.7562500000000001   
 0.7401585565882995                  </td><td>0.8118774429454041    
 0.7910099609128736    
 0.7584793846929769    
 0.7215357458075904               </td><td>0.8156446037639227    
 0.7825211437776882    
 0.7571459947915034    
 0.6923076923076923           </td></tr>
<tr><td>mitigation_adaptation            </td><td>Glove    
 Roberta    
 Distilbert    
 Bert              </td><td>0.8111111111111111    
 0.9    
 0.9055555555555556    
 0.8888888888888888                 </td><td>0.6197578522547815   
 0.649273803119957   
 0.6293103448275862   
 0.5309661436829067                  </td><td>0.7757575757575758    
 0.6121212121212121    
 0.5545454545454546    
 0.5151515151515151               </td><td>0.6405075187969925    
 0.6269000460617228    
 0.5701643489254109    
 0.5158687466379774           </td></tr>
<tr><td>monitor_inventory                </td><td>Glove    
 Roberta    
 Bert    
 Distilbert              </td><td>0.85    
 0.7944444444444444    
 0.7833333333333333    
 0.7333333333333333                 </td><td>0.8397950706175574   
 0.7202462380300958   
 0.7042374201589032   
 0.6705280172413793                  </td><td>0.7116977225672878    
 0.75    
 0.7261904761904763    
 0.7184265010351967               </td><td>0.7465449804432855    
 0.7317868793040958    
 0.7132235793945831    
 0.6790967166839994           </td></tr>
<tr><td>technology_innovation            </td><td>Glove    
 Roberta    
 Bert    
 Distilbert              </td><td>0.8222222222222222    
 0.8111111111111111    
 0.8111111111111111    
 0.8388888888888889                 </td><td>0.6451612903225807   
 0.5926773455377574   
 0.5926773455377574   
 0.6323529411764706                  </td><td>0.6361655773420479    
 0.5686274509803921    
 0.5686274509803921    
 0.5544662309368191               </td><td>0.6403596403596403    
 0.5762946552201607    
 0.5762946552201607    
 0.563216467241235           </td></tr>
</tbody>
</table>