In [4]:
'''basics'''
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join('../..', 'src')))
sys.setrecursionlimit(20500)
import vectorize_embed as em
import make_dataset as mk
import clean_dataset as clean
import visualize as vis
import pandas as pd
import pickle 
import numpy as np
import matplotlib.pyplot as plt
import tools as tools


'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB


'''Metrics/Evaluation'''
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled
from sklearn.pipeline import Pipeline

import operator    
import joblib

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to C:\Users\Jonas
[nltk_data]     Nothnagel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Jonas
[nltk_data]     Nothnagel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jonas
[nltk_data]     Nothnagel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Jonas
[nltk_data]     Nothnagel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
# Category labels scored by the prediction loop further down. Each label is
# also spliced into the saved-model file names that predict_text loads, so
# the spelling here has to match the files on disk.
categories = [
    'food_and_agricultural_commodities_strategy',
    'green_recovery',
    'health',
    'human_rights',
    'leaving_no_one_behind',
    'multi_stakeholder_collaboration',
    'nature_based_solution',
    'plastic',
    'poverty_reduction',
    'public_private_partnership',
    'sids',
    'south_south_cooperation',
    'structural_system_transformation',
]

In [8]:
"""function that pre-processes untext data into right format"""

def predict_text(input_string, category, spacy = False, basic_clean = True, tfidf = False, context_emb = False, transf = False, hat = None):
    """Clean one text and score it with the saved binary model(s) of a category.

    Parameters
    ----------
    input_string : str
        Raw text to classify.
    category : str
        Category label; it is spliced into the file names of the saved
        vectorizer / LSA / classifier artefacts that are loaded.
    spacy : bool, default False
        If truthy, apply ``clean.spacy_clean`` to the text (after the basic
        cleaning step, when both are enabled).
    basic_clean : bool, default True
        If truthy, apply ``clean.basic`` to the text.
    tfidf : bool, default False
        If truthy, predict with the saved TF-IDF vectorizer -> LSA -> classifier
        chain under ``../../models/tf_idf/hot_topics/``.
    context_emb : bool, default False
        If truthy, predict with the saved classifier under
        ``../../models/contextual_emb/hot_topics/`` on embeddings obtained from
        ``em.get_embeddings("roberta-base-nli-stsb-mean-tokens", ...)``.
    transf : bool, default False
        Placeholder for a transformer-based prediction; not implemented, the
        flag currently has no effect.
    hat : list, optional
        Accumulator that receives one confidence percentage per prediction.
        When omitted, the module-level list named ``hat`` is used (and created
        if missing), which is what existing callers rely on.

    Returns
    -------
    tuple
        ``(y_hat, y_prob, hat)``: the outputs of the classifier's ``predict``
        and ``predict_proba`` from the last classifier that ran, and the
        accumulator list. ``y_hat`` and ``y_prob`` are ``None`` when neither
        ``tfidf`` nor ``context_emb`` is enabled.

    Notes
    -----
    If both ``tfidf`` and ``context_emb`` are enabled, both classifiers run,
    two values are appended to ``hat`` and the returned ``y_hat``/``y_prob``
    come from the embedding classifier.
    """
    if hat is None:
        # Backward compatibility: callers historically kept a global list
        # called `hat` that this function appended to. Reuse that list.
        hat = globals().setdefault('hat', [])

    print('______')
    input_df = pd.DataFrame([input_string], columns=['input_text'])

    # Normalise the input with the project's cleaning helpers. The cleaned
    # series is always defined, even when both cleaning steps are disabled.
    if basic_clean:
        input_df['input_text'] = input_df['input_text'].apply(clean.basic)
    if spacy:
        input_df['input_text'] = input_df['input_text'].apply(clean.spacy_clean)
    clean_df = pd.Series(input_df['input_text'])

    # Defined up front so the final return cannot hit an unbound local when
    # no classifier is selected.
    y_hat = None
    y_prob = None

    # NOTE(review): joblib.load unpickles the files; only load model
    # artefacts that come from a trusted source.
    if tfidf:
        tfidf_dir = '../../models/tf_idf/hot_topics/'
        tfidf_vectorizer = joblib.load(tfidf_dir + category + '_' + 'vectorizer.sav')
        lsa = joblib.load(tfidf_dir + category + '_' + 'lsa.sav')
        vector_df = lsa.transform(tfidf_vectorizer.transform(clean_df))

        clf = joblib.load(tfidf_dir + category + '_' + 'model.sav')
        y_hat = clf.predict(vector_df)
        y_prob = clf.predict_proba(vector_df)
        _report_prediction(category, y_hat, y_prob, hat, shown_digits=0, show_label=True)

    if context_emb:
        clf = joblib.load('../../models/contextual_emb/hot_topics/' + category + '_' + 'Roberta_' + 'model.sav')

        # Embed once and reuse the result for both predict calls.
        embeddings = em.get_embeddings("roberta-base-nli-stsb-mean-tokens", clean_df.tolist())
        y_hat = clf.predict(embeddings)
        y_prob = clf.predict_proba(embeddings)
        print(y_hat)
        print(y_prob)
        _report_prediction(category, y_hat, y_prob, hat, shown_digits=1, show_label=False)

    # `transf`: transformer-based prediction is not implemented yet (it would
    # need pytorch and the simpletransformers library), so the flag is a no-op.

    print('______')
    return y_hat, y_prob, hat


def _confidence_percent(probability, ndigits=0):
    """Convert a probability in [0, 1] to a percentage rounded to `ndigits`.

    Scaling before rounding avoids artefacts such as 56.99999999999999 that
    appear when an already-rounded probability is multiplied by 100.
    """
    return round(float(probability) * 100.0, ndigits)


def _report_prediction(category, y_hat, y_prob, hat, shown_digits=0, show_label=True):
    """Print the YES/NO verdict for a single-row prediction and record it in `hat`.

    Only the labels 0 and 1 are handled; any other label prints and records
    nothing.
    """
    label = y_hat[0]
    # assumes predict_proba columns are ordered [class 0, class 1], i.e. the
    # classifier was trained on 0/1 labels -- TODO confirm against clf.classes_
    if label == 1:
        verdict, column = "YES. Confidence:", 1
    elif label == 0:
        verdict, column = "NO Confidence:", 0
    else:
        return

    print(category)
    if show_label:
        print(y_hat)
    print(verdict, _confidence_percent(y_prob[0][column], shown_digits), "%")
    # The stored value is always a whole-number percentage.
    hat.append(_confidence_percent(y_prob[0][column]))

In [9]:
# Read the text to classify interactively from the user.
input_string =  input("Enter your text: ")
# predict_text appends one confidence value per prediction to the global list
# `hat`; start from an empty list so re-running this cell does not pile new
# values on top of those from an earlier run.
hat = []
# Score the text against every category with the TF-IDF models only. After the
# loop, `prediction` and `probability` hold the result for the last category;
# the per-category confidences are collected in `hat`.
for category in categories: 
    prediction, probability, hat = predict_text(input_string, category, tfidf =True, context_emb = False)

Enter your text: hi
______
food_and_agricultural_commodities_strategy
[0]
NO Confidence: 56.99999999999999 %
______
______
green_recovery
[0]
NO Confidence: 98.0 %
______
______
health
[0]
NO Confidence: 100.0 %
______
______
human_rights
[0]
NO Confidence: 100.0 %
______
______
leaving_no_one_behind
[0]
NO Confidence: 99.0 %
______
______
multi_stakeholder_collaboration
[0]
NO Confidence: 54.0 %
______
______
nature_based_solution
[0]
NO Confidence: 62.0 %
______
______
plastic
[1]
YES. Confidence: 100.0 %
______
______
poverty_reduction
[0]
NO Confidence: 63.0 %
______
______
public_private_partnership
[1]
YES. Confidence: 99.0 %
______
______
sids
[0]
NO Confidence: 65.0 %
______
______
south_south_cooperation
[0]
NO Confidence: 80.0 %
______
______
structural_system_transformation
[0]
NO Confidence: 80.0 %
______


In [10]:
# Pair each category with the confidence recorded at the same position in
# `hat`; zip stops at the shorter of the two sequences.
d = {label: confidence for label, confidence in zip(categories, hat)}
d

{'food_and_agricultural_commodities_strategy': 56.99999999999999,
 'green_recovery': 98.0,
 'health': 100.0,
 'human_rights': 100.0,
 'leaving_no_one_behind': 99.0,
 'multi_stakeholder_collaboration': 54.0,
 'nature_based_solution': 62.0,
 'plastic': 100.0,
 'poverty_reduction': 63.0,
 'public_private_partnership': 99.0,
 'sids': 65.0,
 'south_south_cooperation': 80.0,
 'structural_system_transformation': 80.0}