To run this notebook, the `sentence-transformers` package (which provides `SentenceTransformer`) needs to be installed.

In [1]:
#!pip install sentence-transformers

## Import Libraries

In [2]:
'''basics'''
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('../..', 'src')))
sys.setrecursionlimit(20500)
import vectorize_embed as em
import pandas as pd
#import pickle5 as pickle
import pickle
import numpy as np

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB


'''Metrics/Evaluation'''
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled
from sklearn.pipeline import Pipeline


import warnings
warnings.filterwarnings('ignore')

import joblib



## Import data

In [3]:
# Read the processed records together with their encoded landscape labels.
landscapes_json = '../../data/processed/encoded_labels/main_landscapes_all.json'
df = pd.read_json(landscapes_json)

# Leave the column index as the cell's displayed value.
df.columns

Index(['PIMS_ID', 'all_text_clean', 'all_text_clean_spacy', 'forest', 'tundra',
       'conserved_areas', 'freshwater', 'grassland', 'desert', 'marine',
       'wetlands', 'human_altered_areas', 'no tag', 'category_1', 'category_2',
       'category_3', 'labels'],
      dtype='object')

# Recap of the previous comparison between different classifiers using TF-IDF features

<table>
<thead>
<tr><th>Category           </th><th style="text-align: right;">  #Inputs</th><th>Classifiers  </th><th>accuracy_score  </th><th>precision_score  </th><th>recall_score  </th><th>f1_score  </th></tr>
</thead>
<tbody>
<tr><td>forest             </td><td style="text-align: right;">      120</td><td>Stochastic Gradient Descent    
 Decision Tree    
 AdaBoost    
 Gaussian Naive Bayes    
 K Nearest Neighbor    
 Random Forest              </td><td>0.8055555555555556    
 0.8055555555555556    
 0.8222222222222222    
 0.7222222222222222    
 0.7944444444444444    
 0.8277777777777777                 </td><td>0.7319711538461539   
 0.7232142857142857   
 0.7625806451612904   
 0.6918160170447424   
 0.704045954045954   
 0.9088235294117647                  </td><td>0.7709247236357255    
 0.7193367257413581    
 0.6785400947534656    
 0.7685558869977189    
 0.6433584839445516    
 0.6219512195121951               </td><td>0.7462848858281986    
 0.7212266029470331    
 0.7031539888682746    
 0.6875000000000000    
 0.6607406652743111    
 0.6459166190748143           </td></tr>
<tr><td>tundra             </td><td style="text-align: right;">        3</td><td>Stochastic Gradient Descent    
 Random Forest    
 AdaBoost    
 Gaussian Naive Bayes    
 K Nearest Neighbor    
 Decision Tree              </td><td>0.9944444444444445    
 0.9944444444444445    
 0.9944444444444445    
 0.9944444444444445    
 0.9944444444444445    
 0.9888888888888889                 </td><td>0.49722222222222223   
 0.49722222222222223   
 0.49722222222222223   
 0.49722222222222223   
 0.49722222222222223   
 0.4972067039106145                  </td><td>0.5000000000000000    
 0.5000000000000000    
 0.5000000000000000    
 0.5000000000000000    
 0.5000000000000000    
 0.4972067039106145               </td><td>0.4986072423398329    
 0.4986072423398329    
 0.4986072423398329    
 0.4986072423398329    
 0.4986072423398329    
 0.4972067039106145           </td></tr>
<tr><td>conserved_areas    </td><td style="text-align: right;">      153</td><td>K Nearest Neighbor    
 Stochastic Gradient Descent    
 Gaussian Naive Bayes    
 AdaBoost    
 Decision Tree    
 Random Forest              </td><td>0.850000000000000    
 0.8388888888888889    
 0.8277777777777777    
 0.8333333333333334    
 0.8222222222222222    
 0.8277777777777777                 </td><td>0.7962962962962963   
 0.7813846382409746   
 0.7738669238187078   
 0.7736742424242424   
 0.7593582887700534   
 0.7695035460992907                  </td><td>0.8008021390374331    
 0.8165106951871658    
 0.8322192513368984    
 0.7897727272727273    
 0.7593582887700534    
 0.7476604278074866               </td><td>0.7984991085865916    
 0.795382384069617    
 0.7915966386554623    
 0.7809863724853991    
 0.7593582887700534    
 0.7572963333478318           </td></tr>
<tr><td>freshwater         </td><td style="text-align: right;">       58</td><td>AdaBoost    
 Stochastic Gradient Descent    
 K Nearest Neighbor    
 Decision Tree    
 Gaussian Naive Bayes    
 Random Forest              </td><td>0.9111111111111111    
 0.8944444444444445    
 0.8944444444444445    
 0.8722222222222222    
 0.6888888888888889    
 0.9000000000000000                 </td><td>0.7976190476190477   
 0.7394715111478117   
 0.7394715111478117   
 0.6473975126669738   
 0.5917065390749601   
 0.949438202247191                  </td><td>0.6875000000000000    
 0.5906250000000000    
 0.5906250000000000    
 0.6000000000000000    
 0.7156250000000000    
 0.5500000000000000               </td><td>0.725609756097561    
 0.6196196196196196    
 0.6196196196196196    
 0.616346955796497    
 0.572228823629265    
 0.5642818719741797           </td></tr>
<tr><td>grassland          </td><td style="text-align: right;">       48</td><td>Decision Tree    
 K Nearest Neighbor    
 Gaussian Naive Bayes    
 AdaBoost    
 Stochastic Gradient Descent    
 Random Forest              </td><td>0.9111111111111111    
 0.9277777777777778    
 0.8111111111111111    
 0.9111111111111111    
 0.9222222222222223    
 0.9222222222222223                 </td><td>0.6901893287435455   
 0.7685714285714286   
 0.5902777777777778   
 0.632183908045977   
 0.7134831460674158   
 0.7134831460674158                  </td><td>0.6901893287435455    
 0.601118760757315    
 0.7013769363166953    
 0.5593803786574871    
 0.5327022375215147    
 0.5327022375215147               </td><td>0.6901893287435455    
 0.6388331532643926    
 0.6051612903225807    
 0.5764705882352942    
 0.5421511627906976    
 0.5421511627906976           </td></tr>
<tr><td>desert             </td><td style="text-align: right;">       15</td><td>AdaBoost    
 K Nearest Neighbor    
 Decision Tree    
 Stochastic Gradient Descent    
 Gaussian Naive Bayes    
 Random Forest              </td><td>0.9777777777777777    
 0.9777777777777777    
 0.9611111111111111    
 0.9722222222222222    
 0.8888888888888888    
 0.9777777777777777                 </td><td>0.7443181818181819   
 0.7443181818181819   
 0.6370767960363336   
 0.6581920903954802   
 0.5437500000000001   
 0.4888888888888889                  </td><td>0.7443181818181819    
 0.7443181818181819    
 0.7357954545454546    
 0.6193181818181819    
 0.6988636363636364    
 0.5000000000000000               </td><td>0.7443181818181819    
 0.7443181818181819    
 0.6717895285230528    
 0.6357749898826386    
 0.5535714285714286    
 0.4943820224719101           </td></tr>
<tr><td>marine             </td><td style="text-align: right;">       85</td><td>Stochastic Gradient Descent    
 AdaBoost    
 K Nearest Neighbor    
 Random Forest    
 Decision Tree    
 Gaussian Naive Bayes              </td><td>0.8888888888888888    
 0.8777777777777778    
 0.8611111111111112    
 0.8833333333333333    
 0.7944444444444444    
 0.6833333333333333                 </td><td>0.7754168028767571   
 0.7448337825696316   
 0.7083333333333333   
 0.8591954022988506   
 0.5966666666666667   
 0.5937001594896332                  </td><td>0.7174193548387097    
 0.7109677419354838    
 0.7012903225806452    
 0.5967741935483871    
 0.6122580645161291    
 0.6819354838709677               </td><td>0.7410817031070195    
 0.7258377180836333    
 0.7047050331386573    
 0.6293754289636239    
 0.6029806259314456    
 0.5808652314228522           </td></tr>
<tr><td>wetlands           </td><td style="text-align: right;">       39</td><td>Stochastic Gradient Descent    
 Gaussian Naive Bayes    
 Random Forest    
 K Nearest Neighbor    
 AdaBoost    
 Decision Tree              </td><td>0.9388888888888889    
 0.8222222222222222    
 0.9333333333333333    
 0.9333333333333333    
 0.9111111111111111    
 0.9000000000000000                 </td><td>0.9691011235955056   
 0.5856524427952999   
 0.9664804469273742   
 0.9664804469273742   
 0.5657142857142857   
 0.5367464905037159                  </td><td>0.5769230769230769    
 0.6913864578535237    
 0.5384615384615384    
 0.5384615384615384    
 0.5264854905573468    
 0.5204974666052511               </td><td>0.6173913043478261    
 0.6012184990307394    
 0.5540875309661437    
 0.5540875309661437    
 0.5321637426900585    
 0.5235294117647059           </td></tr>
<tr><td>human_altered_areas</td><td style="text-align: right;">      222</td><td>Stochastic Gradient Descent    
 K Nearest Neighbor    
 Random Forest    
 AdaBoost    
 Decision Tree    
 Gaussian Naive Bayes              </td><td>0.7944444444444444    
 0.7500000000000000    
 0.7444444444444445    
 0.7000000000000000    
 0.6611111111111111    
 0.6277777777777778                 </td><td>0.7809806034482758   
 0.7341718485019786   
 0.7534097108565194   
 0.6772282683565229   
 0.636762360446571   
 0.6402439024390244                  </td><td>0.7755250297186633    
 0.7188614449874521    
 0.6840575881653679    
 0.669924712719588    
 0.635913353586052    
 0.6488574824990094               </td><td>0.7779925997533251    
 0.7242553191489363    
 0.6924676868221662    
 0.6726831896551724    
 0.636315458249147    
 0.6251748251748251           </td></tr>
<tr><td>no tag             </td><td style="text-align: right;">      516</td><td>Stochastic Gradient Descent    
 Gaussian Naive Bayes    
 AdaBoost    
 Random Forest    
 K Nearest Neighbor    
 Decision Tree              </td><td>0.8611111111111112    
 0.7500000000000000    
 0.8500000000000000    
 0.8611111111111112    
 0.8444444444444444    
 0.7833333333333333                 </td><td>0.7259615384615384   
 0.6437252685132844   
 0.6897590361445782   
 0.7382352941176471   
 0.6727272727272727   
 0.5900000000000000                  </td><td>0.7047930283224401    
 0.7461873638344226    
 0.6067538126361656    
 0.5980392156862745    
 0.6034858387799564    
 0.5980392156862745               </td><td>0.7144488863506567    
 0.653475935828877    
 0.6284119581007721    
 0.6234624717596853    
 0.6226415094339623    
 0.5935383011985409           </td></tr>
</tbody>
</table>

# Compare the performance of different embeddings

In [4]:
# Landscape categories; each one is a column of `df` and is treated as its
# own target further down.
categories = [
    'forest',
    'tundra',
    'conserved_areas',
    'freshwater',
    'grassland',
    'desert',
    'marine',
    'wetlands',
    'human_altered_areas',
    'no tag',
]

# Target frame: `df` restricted to the category columns, one column per label.
y = pd.DataFrame(data=df, columns=categories)

# Input documents: the cleaned text of every row, coerced to str, as a list.
X = df.all_text_clean.astype(str).tolist()


## Train multiple Embeddings with SGD / OneVsRest multi-label strategy

In [5]:
# Embeddings to compare: display name -> pretrained model identifier that is
# handed to em.get_embeddings.
embedding_dict = {'Glove' : 'average_word_embeddings_glove.6B.300d',
                  'Distilbert':'distilbert-base-nli-mean-tokens',
                  'Roberta' : 'roberta-base-nli-stsb-mean-tokens',
                  'Bert' : 'bert-base-nli-stsb-mean-tokens'}

# scikit-learn renamed the logistic-regression loss of SGDClassifier from
# 'log' to 'log_loss' in 1.1 and removed the old spelling in 1.3. Pick the
# spelling the installed version accepts so the cell runs on old and new
# installs alike.
import sklearn
_sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
_logistic_loss = 'log_loss' if _sklearn_major_minor >= (1, 1) else 'log'

# Linear classifier trained with SGD on the logistic loss, L1-regularised.
sgd_classifier = SGDClassifier(alpha=1e-06,
                               loss=_logistic_loss,
                               max_iter=1000,
                               penalty='l1',
                               random_state = 3,
                               tol=0.001)

# Shared estimator used by model_score_df; it is re-fitted for every
# (category, embedding) pair.
model = OneVsRestClassifier(sgd_classifier)

# Hold out 30% of the documents for evaluation. This is a plain shuffled random
# split (no `stratify` argument is passed), made reproducible by the fixed
# random_state.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .3, 
                                                    shuffle = True,  
                                                    random_state = 3)

# Drop label columns that contain missing values in the training part, and
# restrict the test labels to the same columns so both frames stay aligned.
y_train = y_train.dropna(axis=1)
y_test = y_test[y_train.columns]

# Embeddings already computed in this kernel session, keyed by
# (embedding model identifier, tuple of texts).
_EMBEDDING_CACHE = {}


def _cached_embeddings(embedding_model_name, texts):
    """Return em.get_embeddings(embedding_model_name, texts), computed at most once per distinct input."""
    # NOTE(review): assumes em.get_embeddings is deterministic for a given
    # (model identifier, texts) pair and that its result is reusable - confirm.
    try:
        key = (embedding_model_name, tuple(texts))
        hash(key)
    except TypeError:
        # Unhashable input: fall back to computing without caching.
        return em.get_embeddings(embedding_model_name, texts)
    if key not in _EMBEDDING_CACHE:
        _EMBEDDING_CACHE[key] = em.get_embeddings(embedding_model_name, texts)
    return _EMBEDDING_CACHE[key]


def model_score_df(embedding_dict, X_train, X_test, y_train, y_test, category):
    """Fit and score the module-level `model` once per embedding for one category.

    For every entry of `embedding_dict` the train and test texts are embedded,
    the global `model` is fitted on the training embeddings, the fitted model
    is written to '../<category>_<embedding name>model.sav' with joblib, and
    the predictions on the test embeddings are scored.

    Parameters
    ----------
    embedding_dict : dict
        Maps a display name to the model identifier passed to em.get_embeddings.
    X_train, X_test : sequence
        Texts to embed for fitting and for evaluation.
    y_train, y_test
        Targets for the given category, aligned with X_train / X_test.
    category : str
        Category name; only used to build the file name of the saved model.

    Returns
    -------
    pandas.DataFrame
        One row per embedding with the columns 'embedding_name',
        'accuracy_score', 'precision_score', 'recall_score' and 'f1_score'
        (precision/recall/F1 are macro-averaged), sorted by 'f1_score' in
        descending order. An empty `embedding_dict` yields an empty frame with
        these columns.
    """
    embedding_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k, v in embedding_dict.items():
        print ('processing :' + str(k))
        embedding_name.append(k)
        model.fit(_cached_embeddings(v, X_train), y_train)

        # save the model to disk
        filename = '../'+category+'_'+k+'model.sav'
        joblib.dump(model, filename)

        y_pred = model.predict(_cached_embeddings(v, X_test))
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))

    # Build the summary once, after all embeddings were scored. dtype=object
    # keeps the columns as plain Python objects, as the previous
    # list-of-lists + transpose construction did.
    model_comparison_df = pd.DataFrame({'embedding_name': embedding_name,
                                        'accuracy_score': ac_score_list,
                                        'precision_score': p_score_list,
                                        'recall_score': r_score_list,
                                        'f1_score': f1_score_list},
                                       dtype=object)
    return model_comparison_df.sort_values(by='f1_score', ascending=False)

      
# One summary record per category. Every cell of a record holds the values for
# all embeddings (in the order returned by model_score_df, i.e. best f1_score
# first), joined with line breaks so they stack inside one table cell.
lis = []
for category in categories:
    dff = model_score_df(embedding_dict,
                         X_train,
                         X_test,
                         y_train[category],
                         y_test[category],
                         category)
    dic = {
        'Category': category,
        'Classifiers': '    \n '.join(dff['embedding_name'].map(str).tolist()),
        'accuracy_score': '    \n '.join(dff['accuracy_score'].map(str).tolist()),
        'precision_score': '   \n '.join(dff['precision_score'].map(str).tolist()),
        'recall_score': '    \n '.join(dff['recall_score'].map(str).tolist()),
        'f1_score': '    \n '.join(dff['f1_score'].map(str).tolist()),
    }
    lis.append(dic)
    



processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert
processing :Glove
processing :Distilbert
processing :Roberta
processing :Bert


In [8]:
from tabulate import tabulate

# Column titles come from the keys of the first record; one table row is
# built from the values of each record in `lis`.
header = list(lis[0])
rows = [list(record.values()) for record in lis]

# Print the raw HTML markup (rather than rendering it) so that it can be
# copied into a markdown cell.
html_table = tabulate(rows, header, tablefmt='html')
print(html_table)

<table>
<thead>
<tr><th>Category           </th><th>Classifiers  </th><th>accuracy_score  </th><th>precision_score  </th><th>recall_score  </th><th>f1_score  </th></tr>
</thead>
<tbody>
<tr><td>forest             </td><td>Distilbert    
 Glove    
 Bert    
 Roberta              </td><td>0.8176795580110497    
 0.8232044198895028    
 0.7845303867403315    
 0.7513812154696132                 </td><td>0.6682168784029039   
 0.6727047146401985   
 0.6451219512195122   
 0.6362765330188679                  </td><td>0.6636865342163356    
 0.6536423841059602    
 0.6838852097130244    
 0.704083885209713               </td><td>0.665883537506293    
 0.6619981325863679    
 0.6583418033977058    
 0.6482705013602799           </td></tr>
<tr><td>tundra             </td><td>Roberta    
 Glove    
 Distilbert    
 Bert              </td><td>1.0    
 0.994475138121547    
 0.994475138121547    
 0.994475138121547                 </td><td>1.0   
 0.4972375690607735   
 0.4972375690607735   
 0.

### Copy-paste the output of the last cell here


In [None]:
<table>
<thead>
<tr><th>Category           </th><th>Classifiers  </th><th>accuracy_score  </th><th>precision_score  </th><th>recall_score  </th><th>f1_score  </th></tr>
</thead>
<tbody>
<tr><td>forest             </td><td>Distilbert    
 Glove    
 Bert    
 Roberta              </td><td>0.8176795580110497    
 0.8232044198895028    
 0.7845303867403315    
 0.7513812154696132                 </td><td>0.6682168784029039   
 0.6727047146401985   
 0.6451219512195122   
 0.6362765330188679                  </td><td>0.6636865342163356    
 0.6536423841059602    
 0.6838852097130244    
 0.704083885209713               </td><td>0.665883537506293    
 0.6619981325863679    
 0.6583418033977058    
 0.6482705013602799           </td></tr>
<tr><td>tundra             </td><td>Roberta    
 Glove    
 Distilbert    
 Bert              </td><td>1.0    
 0.994475138121547    
 0.994475138121547    
 0.994475138121547                 </td><td>1.0   
 0.4972375690607735   
 0.4972375690607735   
 0.4972375690607735                  </td><td>1.0    
 0.5    
 0.5    
 0.5               </td><td>1.0    
 0.4986149584487534    
 0.4986149584487534    
 0.4986149584487534           </td></tr>
<tr><td>conserved_areas    </td><td>Glove    
 Bert    
 Roberta    
 Distilbert              </td><td>0.8287292817679558    
 0.8397790055248618    
 0.8287292817679558    
 0.8121546961325967                 </td><td>0.7683023872679045   
 0.7584494773519164   
 0.7422194922194922   
 0.7182624113475178                  </td><td>0.8722856091277144    
 0.7730033124769967    
 0.7177033492822966    
 0.7265366212734634               </td><td>0.7896690032612362    
 0.7652189470859239    
 0.7284255360340739    
 0.7221921271217047           </td></tr>
<tr><td>freshwater         </td><td>Glove    
 Bert    
 Distilbert    
 Roberta              </td><td>0.8950276243093923    
 0.8397790055248618    
 0.8342541436464088    
 0.8287292817679558                 </td><td>0.746792130025663   
 0.6343672456575682   
 0.6078701155751238   
 0.5753105590062112                  </td><td>0.6717261904761904    
 0.6611607142857143    
 0.6166666666666667    
 0.5721726190476191               </td><td>0.6995194408038445    
 0.6454576156703816    
 0.6119210977701545    
 0.5736646151508245           </td></tr>
<tr><td>grassland          </td><td>Glove    
 Bert    
 Distilbert    
 Roberta              </td><td>0.9171270718232044    
 0.9171270718232044    
 0.8674033149171271    
 0.861878453038674                 </td><td>0.6906384505021521   
 0.6405038759689923   
 0.5557259713701431   
 0.5283357245337159                  </td><td>0.7620808678500987    
 0.6072485207100592    
 0.5806213017751479    
 0.5389546351084813               </td><td>0.7188567878223051    
 0.6208630079597822    
 0.563855421686747    
 0.5314279797038418           </td></tr>
<tr><td>desert             </td><td>Glove    
 Distilbert    
 Bert    
 Roberta              </td><td>0.9779005524861878    
 0.9723756906077348    
 0.9668508287292817    
 0.9392265193370166                 </td><td>0.49166666666666664   
 0.49162011173184356   
 0.49157303370786515   
 0.4913294797687861                  </td><td>0.49719101123595505    
 0.4943820224719101    
 0.49157303370786515    
 0.47752808988764045               </td><td>0.49441340782122906    
 0.4929971988795518    
 0.49157303370786515    
 0.4843304843304843           </td></tr>
<tr><td>marine             </td><td>Glove    
 Bert    
 Distilbert    
 Roberta              </td><td>0.8397790055248618    
 0.861878453038674    
 0.8453038674033149    
 0.8121546961325967                 </td><td>0.7260241346459193   
 0.7351282051282051   
 0.7042483660130718   
 0.6734929078014185                  </td><td>0.832282913165266    
 0.7140522875816994    
 0.7042483660130718    
 0.7284080298786181               </td><td>0.756144018583043    
 0.7236978689625695    
 0.7042483660130718    
 0.6921768707482994           </td></tr>
<tr><td>wetlands           </td><td>Roberta    
 Bert    
 Glove    
 Distilbert              </td><td>0.9226519337016574    
 0.9171270718232044    
 0.8839779005524862    
 0.8950276243093923                 </td><td>0.7487080103359174   
 0.7157142857142857   
 0.609923011120616   
 0.5433333333333333                  </td><td>0.6546184738955824    
 0.5909638554216867    
 0.6032128514056225    
 0.5182730923694779               </td><td>0.6876232741617357    
 0.6208630079597822    
 0.6063995029512271    
 0.5197598100823907           </td></tr>
<tr><td>human_altered_areas</td><td>Distilbert    
 Glove    
 Bert    
 Roberta              </td><td>0.7845303867403315    
 0.7513812154696132    
 0.712707182320442    
 0.7071823204419889                 </td><td>0.7650962176509621   
 0.7168488926268572   
 0.6816123188405797   
 0.6661059714045416                  </td><td>0.7220061128091136    
 0.7149208113364824    
 0.694984717977216    
 0.6646290636287857               </td><td>0.73539003636091    
 0.7158555729984302    
 0.685763888888889    
 0.665341008198151           </td></tr>
<tr><td>no tag             </td><td>Glove    
 Bert    
 Distilbert    
 Roberta              </td><td>0.8839779005524862    
 0.861878453038674    
 0.8729281767955801    
 0.856353591160221                 </td><td>0.6989708404802745   
 0.6632754342431761   
 0.6576346284935242   
 0.6177712800519818                  </td><td>0.7261208576998051    
 0.7137751786874593    
 0.6502599090318388    
 0.6177712800519818               </td><td>0.7111921586505585    
 0.6827900455660707    
 0.6538045738045738    
 0.6177712800519818           </td></tr>
</tbody>
</table>