In [50]:
from nltk.corpus import stopwords
from ftfy import fix_text
import spacy
import re
import json
import pickle
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, make_scorer

## NLP-pipeline Classifier: CountVectorizer, TF-idf, SVC

In [51]:
class NLPModel:
    
    LABEL_DICT = { # mapping the model's output to text labels
        0: 'low',
        1: 'medium',
        2: 'high',
        3: 'very_high'    
    }
#     0.17222840499877926 0.8060877380371094 2.8519862670898437
    QUANTILE_DICT = { # mapping the model's output to text labels
        0: [0, 0.17222840499877926],
        1: [0.17222840499877927, 0.8060877380371094],
        2: [0.8060877380371095, 2.8519862670898437],
        3: [2.8519862670898437, 300]   # we set a random high threshold, higer than out current 150sth max 
    }
    
    _scoring = {'accuracy' : make_scorer(accuracy_score), 
       'precision' : make_scorer(precision_score, average = 'macro'),
       'recall' : make_scorer(recall_score, average = 'macro'), 
       'f1_score' : make_scorer(f1_score, average = 'macro')
    }

    
    def __init__(self, load_model=False, model_version="1"):
        """NLPModel's constructor
        
        Args:
            model_version (str): verion's name
            load_model (bool): if the model exists and must be loaded
            
        """
        self.model_version = model_version     
        
        if load_model:
            self._retrieve_model("./")
            

            
    @classmethod
    def map_value_to_range(cls, df, target_column):
        '''
        Labels the target colum of the input df according to the QUANTILE_DICT and LABEL_DICT
        '''
        def convert_value(v):
            # return the key for which v is in the range of the key's value
            for key, value in cls.QUANTILE_DICT.items():
                if v >= value[0] and v <= value[1]:
                    return key
            raise ValueError(f"Invalid value: {v}")
            
        for i in range(len(df)):
            value = df[target_column][i]
            df[target_column][i] = convert_value(value)
            
        return df
            
        
    def find_best_parameters(self, x, y):
        
        text_nlp = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('svc', SVC())])

        # Create grid of parameters to test through cross-validation
        parameters = {
            'vect__ngram_range': [(1,1),(1,2),(1,3)],
            'vect__max_df': [0.8],
            'vect__max_features': [10000], # we limit it to the best 10k features since speeches are long (performance reasons)
            'tfidf__use_idf': [True],
            'svc__kernel': ['rbf'],
            'svc__gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4, "scale"],
            'svc__C':  [1, 10, 100]
        }

        gs_nlp = GridSearchCV(text_nlp, parameters, cv=5, verbose=2)
        gs_nlp.fit(x, y)
        
        output_dict = {}
        for param_name in sorted(parameters.keys()):
            output_dict[param_name] = gs_nlp.best_params_[param_name]
            
        with open(f"gs_nlp_{self.model_version}.json", "w") as write_file:
            json.dump(output_dict, write_file, indent=4)
       
    
    def train_model(self, x, y, vect_params: dict, svc_params: dict):
        
        """This method trains the speech model and overwrites the existing one if the same type and version is provided (if same version applies)"""
        
        class NpEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, np.integer):
                    return int(obj)
                if isinstance(obj, np.floating):
                    return float(obj)
                if isinstance(obj, np.ndarray):
                    return obj.tolist()
                return json.JSONEncoder.default(self, obj)
            
        text_nlp = Pipeline([
            ('vect', CountVectorizer(max_features=10000, **vect_params)),
            ('tfidf', TfidfTransformer()),
            ('svc', SVC(**svc_params))
        ])
        
        # Train the model
        print(' Training the model...')
        results = cross_validate(text_nlp, x, y, cv=5, scoring=self._scoring, error_score="raise")
        text_nlp.fit(x, y)
        

        # Save model
        print(' Saving the model...')
        filename = f"nlp_model_{self.model_version}.pkl"
        pickle.dump(text_nlp, open(filename, 'wb'))
        print(' Saving validation scores...')
        with open(f"nlp_model_{self.model_version}_validation.json", "w") as write_file:
            json.dump(results, write_file, indent=4, cls=NpEncoder)
        

    def _retrieve_model(self, path_to_model):
        """This method gets the model from the current directory"""
    
        print('Loading model from local machine...')
        path_to_binary_model = f'./nlp_model_{self.model_version}.pkl'
        model = pickle.load(open(path_to_binary_model, 'rb'))
        self.model = model
        print('Model loaded')
    

    def predict(self, x):
        
        """This method predicts using the loaded model"""
        y_pred = self.model.predict(x)
        y_pred_relabeled = [LABEL_DICT[label] for label in y_pred]
        return 

        

## Remap the renewable energy/capita to classes (percentile ranges)

In [52]:
renewable_energy = pd.read_csv('per-capita-renewables (1).csv')
renewable_energy = renewable_energy[~renewable_energy.Code.isna()].reset_index(drop=True)
renewable_energy.rename(columns={"Renewables per capita (kWh - equivalent)" : 'renewable_capita'}, inplace=True)
renewable_energy.head()

Unnamed: 0,Entity,Code,Year,renewable_capita
0,Algeria,DZA,1965,0.09424
1,Algeria,DZA,1966,0.081358
2,Algeria,DZA,1967,0.091327
3,Algeria,DZA,1968,0.121841
4,Algeria,DZA,1969,0.075913


In [53]:
renewable_energy.renewable_capita.max()

153.88340625

In [54]:
renewable_energy.index

RangeIndex(start=0, stop=4292, step=1)

In [55]:
p_25 = renewable_energy.renewable_capita.quantile(0.25)
p_50 = renewable_energy.renewable_capita.quantile(0.50)
p_75 = renewable_energy.renewable_capita.quantile(0.75)

print(p_25, p_50, p_75)

0.17222840499877926 0.8060877380371094 2.8519862670898437


In [56]:
labels = ['low', 'medium', 'high', 'very_high']

In [57]:
renewable_energy_relabeled = NLPModel.map_value_to_range(renewable_energy, "renewable_capita")
renewable_energy_relabeled

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_column][i] = convert_value(value)


Unnamed: 0,Entity,Code,Year,renewable_capita
0,Algeria,DZA,1965,0.0
1,Algeria,DZA,1966,0.0
2,Algeria,DZA,1967,0.0
3,Algeria,DZA,1968,0.0
4,Algeria,DZA,1969,0.0
...,...,...,...,...
4287,World,OWID_WRL,2017,2.0
4288,World,OWID_WRL,2018,2.0
4289,World,OWID_WRL,2019,2.0
4290,World,OWID_WRL,2020,2.0


## Matchnig renewable energy entries to the speeches

In [58]:
cleaned_speeches = pd.read_csv('cleaned_speeches.csv', index_col=0)
cleaned_speeches[:1]

Unnamed: 0,Session,Year,ISO-alpha3 Code,Speech,Cleaned_Speech
0,25,1970,AUT,155.\t May I begin by expressing to Ambassado...,begin expressing ambassador hambro behalf dele...


In [59]:
renewable_energy_relabeled[:1]

Unnamed: 0,Entity,Code,Year,renewable_capita
0,Algeria,DZA,1965,0.0


In [60]:
data = pd.merge(cleaned_speeches, renewable_energy_relabeled,
               left_on=['ISO-alpha3 Code', 'Year'], right_on=['Code', 'Year'],
               how='inner').drop(columns=['ISO-alpha3 Code'])

In [61]:
data.head()

Unnamed: 0,Session,Year,Speech,Cleaned_Speech,Entity,Code,renewable_capita
0,25,1970,155.\t May I begin by expressing to Ambassado...,begin expressing ambassador hambro behalf dele...,Austria,AUT,3.0
1,25,1970,"33.\t Mr. President, I take great pleasure in...",president take pleasure following tradition co...,Mexico,MEX,2.0
2,25,1970,1. The delegation of Algeria is very pleased ...,delegation algeria see session assembly held p...,Algeria,DZA,0.0
3,25,1970,"176.\t Mr. President, the delegation of Ceylon...",president delegation ceylon see preside sessio...,Sri Lanka,LKA,1.0
4,25,1970,36.\t May I say to the delegation of Malaysia ...,say delegation malaysia delegation shares grie...,Philippines,PHL,1.0


In [62]:
# Our X is just trained speeches, and y is renewable_capita, so we can drop the other columns as well
# saving this separately so that we can retrieve the other feature vals for each row in case we need to
model_data = data.drop(columns=['Session', 'Year', 'Speech', 'Entity', 'Code'])
model_data.head()

Unnamed: 0,Cleaned_Speech,renewable_capita
0,begin expressing ambassador hambro behalf dele...,3.0
1,president take pleasure following tradition co...,2.0
2,delegation algeria see session assembly held p...,0.0
3,president delegation ceylon see preside sessio...,1.0
4,say delegation malaysia delegation shares grie...,1.0


## Finding best param combination

In [63]:
# instantiating model and setting load to false, since we're creating a new one, not using a saved one
model = NLPModel(model_version=1, load_model=False)

# separating X and y and bringing them into shape
X = model_data.Cleaned_Speech.values
y = model_data.renewable_capita.values

# using grid search with 5 folds to find best param values 
# for the Vectorizer, tf-idf, and the SVC classifier int the pipeline
model.find_best_parameters(X, y)

 Training the model...
 Saving the model...
 Saving validation scores...


## Training the model and saving validation scores

In [None]:
# inserting the JSON param combo saved by find_best_parameters() in a file in the current directory
model.train_model(
    vect_params={"max_df": 0.8, "ngram_range": (1,2)}, 
    svc_params={"kernel": "rbf", "gamma": 1, "C": 10},
    x=X, y=y
)