In [1]:
import os
import sklearn
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import log_loss, confusion_matrix, plot_roc_curve
import re
import matplotlib.pyplot as plt
%matplotlib inline
# Pretty matplotlib plots. Matplotlib 3.6 renamed its bundled seaborn styles to
# 'seaborn-v0_8*' and newer releases no longer ship the old names, so prefer the
# new name when it is available and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

import plotly.express as px

# import dataframe_image as dfi
# X columns are
#  website_name | text

# Y columns are
# is_positive_sentiment

# Paths
# All input CSVs are read from data/data_reviews (a relative path).

DATA_DIR = os.path.join("data", "data_reviews")
X_TRAIN  = os.path.join(DATA_DIR, "x_train.csv")  # training features (website_name | text)
Y_TRAIN  = os.path.join(DATA_DIR, "y_train.csv")  # training labels (is_positive_sentiment)
X_TEST   = os.path.join(DATA_DIR, "x_test.csv")   # test features (website_name | text)

# Output directory for figures; in this notebook it is only referenced by the
# commented-out image export in concat_results_and_save.
plots_dir = "figures"

In [2]:
# UTILITIES:
def compute_accuracy(TP, TN, FP, FN):
    '''Compute the accuracy of a model from its confusion-matrix counts.

    This is most easily used by unpacking the result of calc_TP_TN_FP_FN(),
    e.g. compute_accuracy(*calc_TP_TN_FP_FN(ytrue_N, yhat_N)).

    Args
    ----
    TP, TN, FP, FN : int
        Number of true positives, true negatives, false positives and
        false negatives.

    Returns
    -------
    acc : float
        (TP + TN) / (TP + TN + FP + FN), or 0.0 when all four counts are zero.
    '''
    total = TP + TN + FP + FN
    # With no examples at all the ratio is undefined. Report 0 instead of
    # raising ZeroDivisionError, the same convention compute_TPR_TNR_PPV_NPV
    # uses for an empty denominator.
    if total == 0:
        return 0.0
    return (TP + TN) / total

def compute_TPR_TNR_PPV_NPV(TP, TN, FP, FN):
    '''Compute the four standard rates derived from confusion-matrix counts.

    Args
    ----
    TP, TN, FP, FN : int
        Number of true positives, true negatives, false positives and
        false negatives.

    Returns
    -------
    TPR : true positive rate,        TP / (TP + FN)
    TNR : true negative rate,        TN / (TN + FP)
    PPV : positive predictive value, TP / (TP + FP)
    NPV : negative predictive value, TN / (TN + FN)
        Any rate whose denominator is zero is reported as 0.
    '''
    def _ratio(numerator, denominator):
        # Only the "nothing to divide by" case is mapped to 0. Other problems
        # (e.g. a non-numeric count) now surface instead of being silently
        # turned into a 0 by a bare except.
        return numerator / denominator if denominator != 0 else 0

    TPR = _ratio(TP, TP + FN)
    TNR = _ratio(TN, TN + FP)
    PPV = _ratio(TP, TP + FP)
    NPV = _ratio(TN, TN + FN)

    return TPR, TNR, PPV, NPV

def calc_TP_TN_FP_FN(ytrue_N, yhat_N):
    ''' Count true/false positives and negatives for hard binary predictions

    Args
    ----
    ytrue_N : 1D array of floats
        Each entry is the 'true' binary label (0 or 1) of one example.
        An entry equal to 0 counts as negative; anything else counts as positive.
    yhat_N : 1D array of floats
        Each entry is the predicted binary label (0 or 1) of one example.
        Needs to be same size as ytrue_N (the two are walked in lockstep).

    Returns
    -------
    TP : int
        Number of true positives
    TN : int
        Number of true negatives
    FP : int
        Number of false positives
    FN : int
        Number of false negatives
    '''
    tally = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}
    for actual, predicted in zip(ytrue_N, yhat_N):
        if actual == 0:
            # Truly negative: correct only when the prediction is exactly 0.
            outcome = "TN" if predicted == 0 else "FP"
        else:
            # Truly positive: correct only when the prediction is exactly 1.
            outcome = "TP" if predicted == 1 else "FN"
        tally[outcome] += 1

    return tally["TP"], tally["TN"], tally["FP"], tally["FN"]


def calc_perf_metrics_for_threshold(ytrue_N, yproba1_N, thresh=0.5):
    ''' Compute performance metrics for a given probabilistic classifier and threshold

    Args
    ----
    ytrue_N : 1D array of floats
        Each entry represents the binary value (0 or 1) of 'true' label of one example
        One entry per example in current dataset
    yproba1_N : 1D array of floats
        Each entry represents a probability (between 0 and 1) that correct label is positive (1)
        One entry per example in current dataset
        Needs to be same size as ytrue_N
    thresh : float
        Scalar threshold for converting probabilities into hard decisions
        Calls an example "positive" if yproba1 >= thresh

    Returns
    -------
    acc    : accuracy of the thresholded predictions
    error  : 1 - acc
    l_loss : log loss of the raw probabilities (independent of thresh)
    tpr    : true positive rate of the thresholded predictions
    tnr    : true negative rate of the thresholded predictions
    ppv    : positive predictive value of the thresholded predictions
    npv    : negative predictive value of the thresholded predictions
    '''
    # Turn each probability into a hard 0/1 decision.
    yhat_N = [1 if proba >= thresh else 0 for proba in yproba1_N]

    TP, TN, FP, FN = calc_TP_TN_FP_FN(ytrue_N, yhat_N)
    acc = compute_accuracy(TP, TN, FP, FN)
    l_loss = log_loss(ytrue_N, yproba1_N)
    tpr, tnr, ppv, npv = compute_TPR_TNR_PPV_NPV(TP, TN, FP, FN)

    return acc, 1 - acc, l_loss, tpr, tnr, ppv, npv

# You can use this function later to make printing results easier; don't change it.
def metrics_to_dataframe(model_name, ytrue_N, yproba1_N, thresh=0.5):
    ''' Collect perf. metrics for a given probabilistic classifier and threshold
    into a one-row DataFrame.

    The row holds the model name followed by each metric from
    calc_perf_metrics_for_threshold, rounded to 3 decimals and stored as a string.
    '''
    n_decimals = 3
    header = ["Model", "Accuracy", "Error", "Log Loss", "TPR", "TNR", "PPV", "NPV"]

    metrics = calc_perf_metrics_for_threshold(ytrue_N, yproba1_N, thresh)
    row = [model_name] + [str(round(value, n_decimals)) for value in metrics]

    return pd.DataFrame([row], columns=header)


# You can use this function later to make printing results easier; don't change it.
def print_perf_metrics_for_threshold(ytrue_N, yproba1_N, thresh=0.5):
    ''' Pretty print perf. metrics for a given probabilistic classifier and threshold

    Prints one line per metric (accuracy, error, log loss, TPR, TNR, PPV, NPV),
    each formatted to 3 decimals.
    '''
    metrics = calc_perf_metrics_for_threshold(ytrue_N, yproba1_N, thresh)

    # One template per metric, in the order the metrics are returned.
    templates = (
        "ACC: {:.3f}",
        "ERR: {:.3f}",
        "L_L: {:.3f}",
        "TPR: {:.3f}",
        "TNR: {:.3f}",
        "PPV: {:.3f}",
        "NPV: {:.3f}",
    )
    for template, value in zip(templates, metrics):
        print(template.format(value))

def calc_confusion_matrix_for_threshold(ytrue_N, yproba1_N, thresh=0.5):
    ''' Compute the confusion matrix for a given probabilistic classifier and threshold
    
    Args
    ----
    ytrue_N : 1D array of floats
        Each entry represents the binary value (0 or 1) of 'true' label of one example
        One entry per example in current dataset
    yproba1_N : 1D array-like of floats (list, NumPy array or pandas Series)
        Each entry represents a probability (between 0 and 1) that correct label is positive (1)
        One entry per example in current dataset
        Needs to be same size as ytrue_N
    thresh : float
        Scalar threshold for converting probabilities into hard decisions
        Calls an example "positive" if yproba1 >= thresh
        Default value reflects a majority-classification approach (class is the one that gets
        highest probability)

    Returns
    -------
    cm_df : Pandas DataFrame
        2x2 table with true labels as rows and predicted labels as columns.
        Can be printed like print(cm_df) to easily display results
    '''
    # np.asarray allows plain Python lists to be thresholded too; a bare
    # `list >= float` comparison raises TypeError.
    yhat_N = (np.asarray(yproba1_N) >= thresh).astype(int)
    # Pin the label set so the result is always 2x2, even when only one class
    # shows up in the data; otherwise the DataFrame construction below fails
    # on a 1x1 matrix.
    cm = confusion_matrix(ytrue_N, yhat_N, labels=[0, 1])
    cm_df = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm_df.columns.name = 'Predicted'
    cm_df.index.name = 'True'
    return cm_df

def concat_results_and_save(results, filename):
    '''Stack per-model result frames, sort them by accuracy and print the table.

    Args
    ----
    results : sequence of pandas DataFrames
        Result frames such as those built by metrics_to_dataframe; each must
        have an "Accuracy" column.
    filename : str
        Name for an exported image of the table. Currently unused: the image
        export depends on dataframe_image, whose import is commented out.

    Returns
    -------
    final_df : pandas DataFrame
        All rows of `results`, sorted ascending by "Accuracy", with a fresh
        0..n-1 index.

    Raises
    ------
    IndexError
        If `results` is empty.
    '''
    if len(results) == 0:
        raise IndexError("results must contain at least one DataFrame")

    # A single concat replaces the pairwise concat-in-a-loop, which re-copied
    # the accumulated frame on every iteration.
    final_df = pd.concat(list(results), ignore_index=True)
    final_df = final_df.sort_values(by=["Accuracy"], ignore_index=True)
    print(final_df)
    # dfi.export(final_df.style.background_gradient(), os.path.join(plots_dir, filename))

    return final_df

In [3]:
# Load data: training features, training labels and test features, in that order.
x_train, y_train, x_test = map(pd.read_csv, (X_TRAIN, Y_TRAIN, X_TEST))


In [4]:

# Some simple text processing to get started
def process_text(text):
    '''Lowercase `text` and strip every character that is neither a word
    character nor whitespace (i.e. punctuation and symbols).'''
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise the review text of both splits (lowercase, punctuation removed).
x_train['text'] = x_train['text'].apply(process_text)
x_test['text'] = x_test['text'].apply(process_text)

# Whole-sentence TextBlob polarity, stored as an extra column on each frame.
x_train['sentiment'] = x_train['text'].apply(lambda review: TextBlob(review).sentiment.polarity)
x_test['sentiment'] = x_test['text'].apply(lambda review: TextBlob(review).sentiment.polarity)

In [5]:
# NOTE: This whole block creates the Bag of Words charts.
# It is left commented out so the graphs are not regenerated on subsequent
# full runs.


# def vectorizer_test(x_train, y_train, ngram_ranges):
#     results = {}

#     for range in ngram_ranges:
#         # Creating the vectorizer that changes the words into columns of numbers
#         bigram_vectorizer = CountVectorizer(ngram_range=range,
#                                             token_pattern=r'\b\w+\b', min_df=1)

#         # We use the entire corpus from train and test to fit the model
#         corpus = pd.concat([x_train.text, x_test.text])
#         # Fit the corpus to the vectorizer
#         X_2 = bigram_vectorizer.fit(corpus)

#         # Transform the big X datasets into the vectorized representations
#         x_train_trans = bigram_vectorizer.transform(x_train.text).toarray()
        
#         logreg = LogisticRegression()

#         logreg.fit(x_train_trans, y_train['is_positive_sentiment'])

#         acc = logreg.score(x_train_trans, y_train['is_positive_sentiment'])
#         results[str(range)] = [acc, x_train_trans.shape[1]]

#     df = pd.DataFrame.from_dict(data=results, orient='index', columns=["acc", "n_features"])
#     acc_fig = px.bar(df, x=df.index, y="acc",text_auto='.3%', labels={"index": "N_gram Range", "acc": "Accuracy"})
#     acc_fig.write_image("figures/bag_of_words/accuracy_bar.png")

#     feat_fig = px.bar(df, x=df.index, y="n_features", text_auto='.3s', labels={"index": "N_gram Range", "n_features": "Number of Features"})
#     feat_fig.write_image("figures/bag_of_words/features_bar.png")




In [6]:
# ngram_ranges = [(1,1), (1,2), (1,3), (1,4), (1,5)]

# vectorizer_test(x_train, y_train, ngram_ranges)

In [7]:
# Bag-of-words featurizer that turns the text into count columns for every
# 1-, 2- and 3-word n-gram.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    token_pattern=r'\b\w+\b',
    min_df=1,
)

# The vocabulary is learned from the train and test text together.
corpus = pd.concat([x_train.text, x_test.text])
# X_2 is used further down as the fitted vectorizer itself
# (get_feature_names_out is called on it), not as a feature matrix.
X_2 = bigram_vectorizer.fit(corpus)

# Dense count matrices for both splits, one column per vocabulary entry.
x_train_trans, x_test_trans = (
    bigram_vectorizer.transform(frame.text).toarray() for frame in (x_train, x_test)
)


In [9]:
# Split the vocabulary by TextBlob polarity: column indices of n-grams with a
# non-zero polarity (plus that polarity as their weight) versus indices of
# n-grams whose polarity is exactly zero.

feature_list = X_2.get_feature_names_out()

important_features, not_important_features, sentiment_weights = [], [], []
for i, feature in enumerate(feature_list):
    sentiment = TextBlob(feature).sentiment.polarity
    if sentiment == 0:
        not_important_features.append(i)
        continue
    # sentiment_weights stays aligned with important_features: entry k is the
    # polarity of column important_features[k].
    important_features.append(i)
    sentiment_weights.append(sentiment)



In [10]:
# Drop the columns whose n-gram carries no sentiment, keeping only the
# "important" features. np.delete returns a new array and never modifies its
# input, so the big count matrices do not need to be copied first. Casting to
# float lets these columns hold fractional sentiment weights instead of counts.
train_sentiments = np.delete(x_train_trans, not_important_features, 1).astype(float)
test_sentiments  = np.delete(x_test_trans, not_important_features, 1).astype(float)


In [12]:


def assign_weights(features, weights=None):
    '''Replace every non-zero entry of `features` with the weight of its column.

    The matrix is modified in place; nothing is returned.

    Args
    ----
    features : 2D NumPy array (or list of lists)
        One row per example, one column per feature. Zero entries are left
        untouched.
    weights : sequence of floats, optional
        weights[j] is written into every non-zero cell of column j.
        Defaults to the module-level `sentiment_weights`.

    Raises
    ------
    IndexError
        If a non-zero entry sits in a column that has no weight.
    '''
    if weights is None:
        weights = sentiment_weights

    if isinstance(features, np.ndarray) and features.ndim == 2:
        # Vectorised path: look up the weight of each non-zero cell's column
        # and write them all back in one assignment, instead of visiting
        # every cell in a Python loop.
        nonzero_rows, nonzero_cols = np.nonzero(features)
        features[nonzero_rows, nonzero_cols] = np.asarray(weights)[nonzero_cols]
        return

    # Generic fallback (e.g. list of lists): same element-by-element walk as
    # before, so non-array inputs keep working.
    for i, row in enumerate(features):
        for j, value in enumerate(row):
            if value != 0:
                features[i][j] = weights[j]


def print_non_0s(features):
    '''Debug helper: print the row, column and value of every non-zero entry
    of a 2D matrix, one entry per line.'''
    for row_idx, row in enumerate(features):
        for col_idx, value in enumerate(row):
            if value == 0:
                continue
            print("row:", row_idx, "col:", col_idx, "weight:", value)

# Overwrite the non-zero counts with the per-feature sentiment weights.
# assign_weights writes into the arrays it is given, so both matrices are
# changed in place.
assign_weights(train_sentiments)
assign_weights(test_sentiments)
# Debug aid (disabled): print_non_0s(train_sentiments)


In [13]:
# Append the whole-sentence polarity (computed earlier and stored in the
# 'sentiment' column) as one extra, final column of each sentiment matrix.
train_sentiments = np.column_stack((train_sentiments, x_train['sentiment'].to_numpy()))
test_sentiments  = np.column_stack((test_sentiments, x_test['sentiment'].to_numpy()))



In [15]:
# Final design matrices: the raw n-gram counts followed by the sentiment
# columns. NOTE: this rebinds x_train_trans / x_test_trans, so running the cell
# a second time would append the sentiment columns again.
x_train_trans = np.concatenate((x_train_trans, train_sentiments), axis=1)
x_test_trans  = np.concatenate((x_test_trans, test_sentiments), axis=1)

In [17]:
# NAIVE Check using simple LogisticRegression solver

# logreg = LogisticRegression()
# logreg.fit(x_train_trans, y_train['is_positive_sentiment'])
# acc = logreg.score(x_train_trans, y_train['is_positive_sentiment'])


In [18]:
# See results
# print(acc)

# res_df = metrics_to_dataframe("Logistic Regression | Features Include Sentiment Weights",
#                       y_train.is_positive_sentiment,
#                       logreg.predict_proba(x_train_trans)[:, 1])

# concat_results_and_save([res_df], "part2_results.png")

# test_pred  = logreg.predict_proba(x_test_trans)[:, 1]
# np.savetxt("yproba1_test.txt", test_pred)

In [19]:
# Logistic regression hyperparameter search
# Using L2 as regularization penalty

# model = LogisticRegression()
# solvers = ['lbfgs', 'liblinear', 'sag', 'saga']
# penalty = ['l2']
# c_values = [0.01, 0.1, 1.0, 10, 100]

# # define grid search
# grid = dict(solver=solvers,penalty=penalty,C=c_values)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
# grid_search = GridSearchCV(
#     estimator=model, 
#     param_grid=grid, 
#     n_jobs=-1, 
#     cv=cv, 
#     scoring='accuracy',
#     error_score=0, 
#     return_train_score=True)
# grid_result = grid_search.fit(x_train_trans, y_train.is_positive_sentiment)






In [20]:
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means  = grid_result.cv_results_['mean_test_score']
# stds   = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

# L2_grid_search_df = pd.DataFrame.from_dict(grid_result.cv_results_)
# L2_grid_search_df.to_csv("result_csvs/L2_grid_search_results.csv")

In [21]:
# Let's try L1 as the regularization penalty
# L1_model   = LogisticRegression()
# L1_solvers = ['liblinear', 'saga']
# L1_penalty = ['l1']
# c_values   = [0.01, 0.1, 1.0, 10, 100]

# # define grid search
# L1_grid = dict(solver=L1_solvers, penalty=L1_penalty, C=c_values)
# L1_cv   = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# L1_grid_search = GridSearchCV(
#     estimator=L1_model, 
#     param_grid=L1_grid, 
#     n_jobs=-1, 
#     cv=L1_cv, 
#     scoring='accuracy',
#     error_score=0,
#     return_train_score=True)
# L1_grid_result = L1_grid_search.fit(x_train_trans, y_train.is_positive_sentiment)

# # summarize results
# print("Best: %f using %s" % (L1_grid_result.best_score_, L1_grid_result.best_params_))
# L1_means  = L1_grid_result.cv_results_['mean_test_score']
# L1_stds   = L1_grid_result.cv_results_['std_test_score']
# L1_params = L1_grid_result.cv_results_['params']
# for mean, stdev, param in zip(L1_means, L1_stds, L1_params):
#     print("%f (%f) with: %r" % (mean, stdev, param))


In [22]:
# summarize results
# print("Best: %f using %s" % (L1_grid_result.best_score_, L1_grid_result.best_params_))
# L1_means  = L1_grid_result.cv_results_['mean_test_score']
# L1_stds   = L1_grid_result.cv_results_['std_test_score']
# L1_params = L1_grid_result.cv_results_['params']
# for mean, stdev, param in zip(L1_means, L1_stds, L1_params):
#     print("%f (%f) with: %r" % (mean, stdev, param))


# L1_grid_search_df = pd.DataFrame.from_dict(L1_grid_result.cv_results_)
# L1_grid_search_df.to_csv("result_csvs/L1_grid_search_results.csv")

In [23]:
# Let's try an MLP

# Things to modify
# * Alpha for sure
# * Random state, since class
# * Activation function 

# from sklearn.neural_network import MLPClassifier
# mlp_model         = MLPClassifier()
# mlp_solver        = ['lbfgs']
# mlp_random_states = [1,2,3,4,5]
# mlp_activation_functions = ["identity", "logistic", "tanh", "relu"]
# alphas = np.logspace(-1, 1, 5)

# # define grid search
# mlp_grid = dict(
#     solver=mlp_solver,
#     random_state=mlp_random_states, 
#     activation=mlp_activation_functions, 
#     alpha=alphas, 
# )
# mlp_cv          = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
# mlp_grid_search = GridSearchCV(
#     estimator=mlp_model, 
#     param_grid=mlp_grid, 
#     n_jobs=4, 
#     cv=mlp_cv, 
#     scoring='accuracy',
#     error_score=0,
#     return_train_score=True)
# mlp_grid_result = mlp_grid_search.fit(x_train_trans, y_train.is_positive_sentiment)



In [24]:
# summarize results
# print("Best: %f using %s" % (mlp_grid_result.best_score_, mlp_grid_result.best_params_))
# mlp_means  = mlp_grid_result.cv_results_['mean_test_score']
# mlp_stds   = mlp_grid_result.cv_results_['std_test_score']
# mlp_params = mlp_grid_result.cv_results_['params']
# for mean, stdev, param in zip(mlp_means, mlp_stds, mlp_params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

# mlp_grid_search_results_df = pd.DataFrame.from_dict(mlp_grid_search.cv_results_)
# mlp_grid_search_results_df.to_csv("results_csvs/mlp_grid_search_results.csv")

In [25]:
# AdaBoost hyperparameter search
from sklearn.ensemble import AdaBoostClassifier

# Model with default hyperparameters. It is seeded so that repeated runs of
# the search produce the same scores (the CV splitter below is seeded too).
ada_model = AdaBoostClassifier(random_state=1)
# define the grid of values to search
ada_grid = dict(
    # Number of trees, i.e. number of weak learners: five log-spaced values
    # from 10 to 10000 truncated to ints, roughly [10, 56, 316, 1778, 10000].
    n_estimators    = [int(x) for x in np.logspace(1, 4, 5)],
    # The depth of the weak learners is deliberately not searched here.
    # Lower rates are better for more trees, larger for fewer.
    learning_rate   = [float(x) for x in np.logspace(-3, 1, 5)]     # [0.001, 0.01, 0.1, 1.0, 10.0]
)

# define the evaluation procedure
ada_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
# define the grid search procedure
ada_grid_search = GridSearchCV(
    estimator=ada_model, 
    param_grid=ada_grid, 
    n_jobs=2, 
    cv=ada_cv, 
    scoring='accuracy',
    # A parameter combination whose fit fails is scored 0 rather than
    # aborting the whole search.
    error_score=0,
    return_train_score=True
    )
# execute the grid search
ada_grid_result = ada_grid_search.fit(x_train_trans, y_train.is_positive_sentiment)
# summarize the best score and configuration



In [None]:
# Summarize the best score and configuration.
print("Best: %f using %s" % (ada_grid_result.best_score_, ada_grid_result.best_params_))

# Pull out the per-combination mean / std test scores and their parameters.
ada_means, ada_stds, ada_params = (
    ada_grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
# One line per evaluated parameter combination.
for mean, stdev, param in zip(ada_means, ada_stds, ada_params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Full cross-validation results as a table.
ada_grid_results_df = pd.DataFrame(ada_grid_result.cv_results_)