In [4]:
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('../..', 'src')))
sys.setrecursionlimit(20500)
import pandas as pd
import spacy

import matplotlib.pyplot as plt
import clean_dataset as clean
import make_dataset as mk
import vectorize_embed as em
import tools as tools
import models as m

import pickle
import numpy as np

'''features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline

'''Metrics/Evaluation'''
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

import joblib
import seaborn as sns

In [6]:
# Load the processed, label-encoded project data.
# The path is resolved relative to the notebook's working directory; the whole
# path is built with os.path.join instead of concatenating a '/'-prefixed file name.
data_path = os.path.abspath(
    os.path.join('../..', 'data', 'processed', 'encoded_labels', 'technical_team_all.json')
)
df = pd.read_json(data_path)

# Force both text columns to str. The spaCy-cleaned column is additionally passed
# through the project helper clean.basic (presumably lowercases the text, going by
# the original note on this cell -- confirm against clean_dataset.py).
df['all_text_clean_spacy'] = df['all_text_clean_spacy'].astype(str).apply(clean.basic)
df['all_text_clean'] = df['all_text_clean'].astype(str)

# Show the available columns as the cell output.
df.columns

Index(['Unnamed: 0', 'PIMS_ID', 'all_text_clean', 'all_text_clean_spacy',
       'chemicals_and_waste_programme_x', 'persistent_organic_pollutants',
       'ozone_depleting_substances', 'heavy_metals', 'waste_management',
       'cooling_alternatives', 'climate_change_adaptation_programme_x',
       'cross_sectoral_climate_resilient_livelihoods',
       'fostering_food_security_and_resilient_agricultural_systems',
       'mainstreaming_integrated_policy_and_planning_for_climate_resilient_sustainable_development',
       'ecosystem_based_adaptation', 'urban_resilience',
       'climate_resilient_integrated_water_resource_and_coastal_management',
       'climate_forest_of_climate_change_mitigation_x', 'carbon_sequestration',
       'climate_financing_for_redd_redd', 'chemicals_and_waste_programme_y',
       'climate_change_adaptation_programme_y',
       'climate_forest_of_climate_change_mitigation_y',
       'climate_strategies_and_policy_programme',
       'ecosystems_and_biodiversity_

In [None]:
# Bar chart of how many rows carry each programme label, largest count first.
programme_columns = [
    'chemicals_and_waste_programme',
    'climate_change_adaptation_programme',
    'climate_forest_of_climate_change_mitigation',
    'climate_strategies_and_policy_programme',
    'ecosystems_and_biodiversity_programme',
    'energy_program_of_climate_change_mitigation',
    'oceans_and_water_programme',
    'environmental_governance_programme',
]
label_counts = df[programme_columns].sum(axis=0)
label_counts.sort_values(ascending=False).plot.bar()

In [None]:
# Label columns for which a separate binary classifier is trained below.
categories = [
    'chemicals_and_waste_programme',
    'climate_change_adaptation_programme',
    'climate_forest_of_climate_change_mitigation',
    'climate_strategies_and_policy_programme',
    'ecosystems_and_biodiversity_programme',
    'energy_program_of_climate_change_mitigation',
    'oceans_and_water_programme',
    'environmental_governance_programme',
]

In [None]:
# Train and score every candidate classifier for each sufficiently populated
# category, saving the fitted TF-IDF vectorizer and LSA model per category.

# Candidate estimators, keyed by the label handed to m.model_score_df.
# The key spellings ('SDG', 'SDG_balanceed', 'Decsision_Tree') are kept verbatim:
# how m.model_score_df uses them is not visible here, so renaming could break
# whatever it writes or reports under those names.
# NOTE(review): loss='log' was deprecated in scikit-learn 1.1 and removed in 1.3
# in favour of loss='log_loss'. It is left unchanged because 'log_loss' is not
# accepted by older releases -- switch it when the environment is upgraded.
model_dict = {
              'LR_balanced' : LogisticRegression(random_state = 3, class_weight = "balanced"),
              'LR' : LogisticRegression(random_state = 3),
              'SDG' : SGDClassifier(random_state=3, loss='log'),
              'SDG_balanceed' : SGDClassifier(random_state=3, loss='log',class_weight = "balanced" ),
              'RF': RandomForestClassifier(random_state=3),
              'Decsision_Tree': DecisionTreeClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'GNB': GaussianNB(),
              'KNB': KNeighborsClassifier()}

# Directory that receives the per-category vectorizer / LSA artefacts.
MODEL_DIR = '../../models/tf_idf/transformed_sectors/'


def _artifact_path(category, suffix):
    """Return the file path of a per-category artefact inside MODEL_DIR.

    Category names longer than 50 characters are cut to their first 20
    characters before being used as the file-name stem.
    """
    stem = category[0:20] if len(category) > 50 else category
    return MODEL_DIR + stem + '_' + suffix


# Only categories with more than 20 positive rows are modelled; the "no tag"
# placeholder is never modelled.
eligible_categories = [
    name for name in categories
    if name != "no tag" and df[name].sum(axis=0) > 20
]

if eligible_categories:
    # joblib.dump does not create missing directories.
    os.makedirs(MODEL_DIR, exist_ok=True)

    # The features depend only on the text column, not on the category, and both
    # transformers are deterministic here (fixed random_state), so they are fit
    # once instead of once per category.
    # NOTE(review): both transformers are fit on ALL rows before the train/test
    # split below, so held-out rows influence the features used for evaluation.
    # Confirm this is intended (e.g. because the saved artefacts are meant to
    # cover the full corpus) or move the fit after the split.
    texts = df['all_text_clean'].astype('str')

    # TF-IDF weights over unigrams and bigrams.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                       min_df = 2,
                                       max_df = .95)
    X_tfidf = tfidf_vectorizer.fit_transform(texts)

    # Dimensionality reduction: project the TF-IDF matrix onto 100 LSA components
    # (unsupervised, so identical for every category).
    lsa = TruncatedSVD(n_components=100,
                       n_iter=10,
                       random_state=3)
    X = lsa.fit_transform(X_tfidf)

for category in eligible_categories:
    print('processing:', category)
    print('_____')

    y = df[category].values

    # Save the vectorizer under this category's name.
    joblib.dump(tfidf_vectorizer, _artifact_path(category, 'vectorizer.sav'))

    # Save the LSA model under this category's name.
    if len(category) > 50:
        print('long')
    joblib.dump(lsa, _artifact_path(category, 'lsa.sav'))

    # Train/test split with stratified sampling for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size = .3,
                                                        shuffle = True,
                                                        stratify = y,
                                                        random_state = 3)

    m.model_score_df(model_dict, category, 'transformed_sectors', X_train, X_test, y_train, y_test)