In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('../..', 'src')))
sys.setrecursionlimit(20500)
import pandas as pd
import spacy

import matplotlib.pyplot as plt
import clean_dataset as clean
import make_dataset as mk
import vectorize_embed as em
import tools as tools

import pickle
import numpy as np

'''features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline

'''Metrics/Evaluation'''
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

import joblib
import seaborn as sns

[nltk_data] Downloading package stopwords to C:\Users\Jonas
[nltk_data]     Nothnagel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Jonas
[nltk_data]     Nothnagel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
'''import data'''

df = pd.read_json(os.path.abspath(os.path.join('../..', 'data/processed/encoded_labels'))+'/technical_team_all.json')  
'''lowercase data and str type data'''
df['all_text_clean_spacy'] = df['all_text_clean_spacy'].astype(str).apply(clean.basic)
df['all_text_clean'] = df['all_text_clean'].astype(str)

df.shape



(603, 15)

In [3]:
df.columns

Index(['PIMS_ID', 'all_text_clean', 'all_text_clean_spacy',
       'chemicals_and_waste_programme', 'climate_change_adaptation_programme',
       'climate_forest_of_climate_change_mitigation',
       'climate_strategies_and_policy_programme',
       'ecosystems_and_biodiversity_programme',
       'energy_program_of_climate_change_mitigation',
       'oceans_and_water_programme', 'environmental_governance_programme',
       'category_1', 'category_2', 'category_3', 'labels'],
      dtype='object')

In [4]:
'''remove small data points'''
df_trunc = mk.truncate(df, 500, save=True)

In [5]:
#Holding out 5 rows from the original dataframe for prediction at the end
df_holdout = df_trunc.iloc[:5]

df_trunc = df_trunc.iloc[5:]

In [6]:
#Creating the features (tf-idf weights) for the processed text
texts = df_trunc['all_text_clean'].astype('str')

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   min_df = 2, 
                                   max_df = .95)

X = tfidf_vectorizer.fit_transform(texts)
   

y = df_trunc['environmental_governance_programme'].values

#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=100, 
                   n_iter=10, 
                   random_state=3)

X = lsa.fit_transform(X)

In [7]:
#Preliminary model evaluation using default parameters

#Creating a dict of the models
model_dict = {'Dummy' : DummyClassifier(random_state=3),
              'Logistic Regression balanced' : LogisticRegression(random_state = 3, class_weight = "balanced"),
              'Logistic Regression' : LogisticRegression(random_state = 3),
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log'),
              'Stochastic Gradient Descent balanceed' : SGDClassifier(random_state=3, loss='log',class_weight = "balanced" ),
              'Random Forest': RandomForestClassifier(random_state=3),
              'Decsision Tree': DecisionTreeClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'Gaussian Naive Bayes': GaussianNB(),
              'K Nearest Neighbor': KNeighborsClassifier()}

#Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .3, 
                                                    shuffle = True, 
                                                    stratify = y, 
                                                    random_state = 3)

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(X_train, y_train)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df

model_score_df(model_dict)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
0,Dummy,0.988024,0.494012,0.5,0.496988
2,Logistic Regression,0.988024,0.494012,0.5,0.496988
3,Stochastic Gradient Descent,0.988024,0.494012,0.5,0.496988
5,Random Forest,0.988024,0.494012,0.5,0.496988
7,AdaBoost,0.988024,0.494012,0.5,0.496988
9,K Nearest Neighbor,0.988024,0.494012,0.5,0.496988
4,Stochastic Gradient Descent balanceed,0.982036,0.493976,0.49697,0.495468
6,Decsision Tree,0.982036,0.493976,0.49697,0.495468
8,Gaussian Naive Bayes,0.982036,0.493976,0.49697,0.495468
1,Logistic Regression balanced,0.958084,0.493827,0.484848,0.489297


In [8]:
def binary_log_classifier(dataframe, category):
    X_train, X_test, y_train, y_test = train_test_split(dataframe['all_text_clean'],
                                                        dataframe[category].values,
                                                        test_size = .3,
                                                        random_state = 1,
                                                        shuffle = True)
    print('training size:', len(X_train))
    print('test size:', len(X_test))
    print('distribution of tagged projects:', dataframe[category].value_counts())
    #if weighted == True:
        #class_weights = tools.get_class_weights(y_train)
        #print(class_weights)
    #else: 
        #class_weights = None
        

    '''extract features using tfidf vecorization:'''
    vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = 0.01, max_df = 0.95)
    vect = vectorizer.fit(X_train)
    X_train = vect.transform(X_train)
    X_test = vect.transform(X_test)
    
    "feature reduction"
    #lsa = TruncatedSVD(n_components=100, 
                   #n_iter=10, 
                   #random_state=3)

    #X_train = lsa.fit_transform(X_train)
    #X_test = lsa.fit_transform(X_test)    
    
    
    # set pipeline
    pipe = Pipeline([('classifier' , LogisticRegression())])

    # Create param grid.
    param_grid = [
        {'classifier' : [LogisticRegression(class_weight = "balanced")],
         'classifier__penalty' : ['l1', 'l2'],
        'classifier__C' : np.logspace(-4, 4, 20),
        'classifier__solver' : ['liblinear ','lbfgs']}
    ]

    # Create grid search object
    clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1, scoring = 'accuracy')

    # Fit on data
    best_clf = clf.fit(X_train, y_train)
    print('')
    print('Training accuracy:', best_clf.score(X_train, y_train).round(3))
    print('Test accuracy:', best_clf.score(X_test, y_test).round(3))
    y_hat = best_clf.predict(X_test)
    print('recall', recall_score(y_test, y_hat))

    from sklearn.metrics import classification_report
    print(classification_report(y_test, y_hat))    
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
    specificity = tn / (tn+fp)
    print('specificity is:', specificity)

    return best_clf, vectorizer, y_train

In [9]:
best_clf, vectorizer, y_train = binary_log_classifier(df_trunc, 'chemicals_and_waste_programme')

# precision 0.33 on class 1: 33% of all items predicted as 1 are truly 1.
# recall 0.6 on class 1: 60% of all true 1 are predicted as 1. 

training size: 387
test size: 167
distribution of tagged projects: 0    515
1     39
Name: chemicals_and_waste_programme, dtype: int64
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:    5.4s



Training accuracy: 1.0
Test accuracy: 0.988
recall 0.875
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       151
           1       1.00      0.88      0.93        16

    accuracy                           0.99       167
   macro avg       0.99      0.94      0.96       167
weighted avg       0.99      0.99      0.99       167

specificity is: 1.0


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    5.8s finished
