In [None]:
import pickle
import pandas as pd
from preprocessing import clean_sentence
from utils import carbon_class_filter, get_sum_probs, get_majority_pred_soft
import json
import pandas as pd
import ast


In [None]:
# Locations of the pickled vectorizer/model pairs that
# text_classification_pipeline unpickles on every call. Each classifier has its
# own fitted vectorizer (``*_VECT``) and estimator (``*_MODEL``).
# DATA_FOLDER is a relative path, so it is resolved against the kernel's
# current working directory.
DATA_FOLDER = "../data/"
# LR pair (used in the "LOG REG" step of the pipeline)
LOGREG_VECT = DATA_FOLDER + "saved_models/carbonclass_models/model_LR_vectorizer.pkl"
LOGREG_MODEL = DATA_FOLDER + "saved_models/carbonclass_models/model_LR.pkl"
# SVM pair
SVM_VECT = DATA_FOLDER + "saved_models/carbonclass_models/model_SVM_vectorizer.pkl"
SVM_MODEL = DATA_FOLDER + "saved_models/carbonclass_models/model_SVM.pkl"
# NB pair
NB_VECT = DATA_FOLDER + "saved_models/carbonclass_models/model_NB_vectorizer.pkl"
NB_MODEL = DATA_FOLDER + "saved_models/carbonclass_models/model_NB.pkl"
# RF pair
RF_VECT = DATA_FOLDER + "saved_models/carbonclass_models/model_RF_vectorizer.pkl"
RF_MODEL = DATA_FOLDER + "saved_models/carbonclass_models/model_RF.pkl"
# CB pair
CB_VECT = DATA_FOLDER + "saved_models/carbonclass_models/model_CB_vectorizer.pkl"
CB_MODEL = DATA_FOLDER + "saved_models/carbonclass_models/model_CB.pkl"

In [None]:
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code; only point this at
    # model files that were produced by a trusted training run.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def text_classification_pipeline(df):
    """Score sentences with five pickled classifiers and return the ensemble prediction.

    For each vectorizer/model pair (LR, NB, SVM, RF, CB) the chosen text column
    is vectorised, ``predict_proba`` is called, and the first five probability
    columns are stored on ``df`` as ``<prefix>_prob_0`` .. ``<prefix>_prob_4``.
    ``carbon_class_filter`` is then applied to every row, and the results are
    combined by ``get_sum_probs`` followed by ``get_majority_pred_soft``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``sentence`` and a ``cleaned_sentence`` column. The probability
        columns are added to this frame in place, so the caller's frame is
        modified.

    Returns
    -------
    The value returned by ``get_majority_pred_soft``, or an empty list when
    ``df`` has no rows (no model is loaded in that case).
    """
    if len(df) == 0:
        return []

    # (column prefix, vectorizer path, model path, text column fed to the vectorizer)
    model_specs = (
        ("lr", LOGREG_VECT, LOGREG_MODEL, "cleaned_sentence"),
        ("nb", NB_VECT, NB_MODEL, "cleaned_sentence"),
        # NOTE(review): the SVM is the only model fed the raw ``sentence``
        # column instead of ``cleaned_sentence`` - kept as in the original;
        # confirm this matches how the SVM vectorizer was fitted.
        ("svm", SVM_VECT, SVM_MODEL, "sentence"),
        ("rf", RF_VECT, RF_MODEL, "cleaned_sentence"),
        ("cb", CB_VECT, CB_MODEL, "cleaned_sentence"),
    )
    # Number of probability columns kept per model.
    n_classes = 5

    for prefix, vect_path, model_path, text_column in model_specs:
        vectorizer = _load_pickle(vect_path)
        model = _load_pickle(model_path)
        probabilities = model.predict_proba(vectorizer.transform(df[text_column]))
        for class_idx in range(n_classes):
            column_name = "{}_prob_{}".format(prefix, class_idx)
            df[column_name] = [row[class_idx] for row in probabilities]

    # WORD HEURISTICS: one heuristic result per row.
    heu_preds = list(df.apply(carbon_class_filter, axis=1))

    # VOTING CLASSIFIER: merge model probabilities with the heuristics.
    df = get_sum_probs(df, heu_preds)
    return get_majority_pred_soft(df)

In [None]:
def text_except_relevance_pipeline(json_path):
    """Enrich one report JSON with classification, mined text, word clouds and sentiment.

    The report at ``json_path`` is loaded, its sentences are cleaned and
    classified, and the results of the rule-mining, word-cloud and sentiment
    pipelines are attached to the report. The enriched report is written next
    to the input as ``<name>_all.json``; the input file is left untouched.

    Parameters
    ----------
    json_path : str
        Path to a JSON file with ``company``, ``year`` and ``text_output``
        keys. ``text_output`` is either a mapping with a ``sentence`` entry or
        the string repr of a dict of dicts, which is parsed and flattened to
        a dict of lists.

    Returns
    -------
    str
        Path of the JSON file that was written.
    """
    # NOTE(review): rule_mining_pipeline, word_cloud_pipeline and
    # sentiment_analysis_pipeline are not imported in the cells visible here;
    # make sure they are in scope before running this on a fresh kernel.

    # Context manager so the input handle is closed even if parsing fails.
    with open(json_path) as in_file:
        report = json.load(in_file)

    print("Text Classification Pipeline")
    print(report['company'])

    text_output = report['text_output']
    if isinstance(text_output, str):
        # Stringified dict of dicts: parse it and keep only the values of each
        # inner dict, so every key maps to a plain list.
        parsed = ast.literal_eval(text_output)
        text_output = {key: list(column.values()) for key, column in parsed.items()}
        report['text_output'] = text_output

    # Built outside any ``finally`` so a parsing error surfaces as itself
    # rather than being masked by a NameError on an unset variable.
    df = pd.DataFrame(text_output['sentence'], columns=['sentence'])
    df['cleaned_sentence'] = df['sentence'].apply(clean_sentence)

    # Text Classification
    print("Generating Text Classification Predictions")
    text_class_pred = text_classification_pipeline(df)
    report['text_output']['carbon_class'] = text_class_pred

    # Rule Mining
    print("Generating Rule Mining Text")
    report['text_output']['mined_text'] = rule_mining_pipeline(df)

    # Word Cloud
    print("Generating Word Clouds")
    report['wordcloud_img_path'] = word_cloud_pipeline(
        df.sentence, text_class_pred, report['company'], report['year']
    )

    # Sentiment Analysis
    print("Generating Sentiments")
    report['sentiment_score'] = sentiment_analysis_pipeline(df, text_class_pred)

    print('Writing Data')
    # Strip the extension only when it is really ".json"; a blind [:-5] would
    # chop arbitrary characters off other paths.
    suffix = ".json"
    stem = json_path[:-len(suffix)] if json_path.endswith(suffix) else json_path
    output_path = stem + "_all.json"

    # Write to the path that is returned; previously the input file was
    # overwritten and the returned path never existed on disk.
    with open(output_path, 'w') as out_file:
        json.dump(report, out_file)

    return output_path


In [None]:
# Run the pipeline on a single report. The path is relative to the kernel's
# working directory; as the last expression in the cell, the call's return
# value (the output path) is displayed.
file_path = "../data/sustainability_reports/new/Canada Pension2017_text_output.json"
text_except_relevance_pipeline(file_path)