In [1]:
import sys
sys.path.append('../')
%matplotlib inline
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier

In [2]:
from mycode.utils import classes_def, clean_tweets

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josehuillca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [3]:
# Location of the tweet CSV, relative to the notebook's working directory.
file_csv = "../dataset/Corona_NLP_test.csv"
# The file is decoded with the "latin" codec rather than pandas' default.
dataset = pd.read_csv(filepath_or_buffer=file_csv, encoding="latin")

## Pipeline

### Define output classes

In [4]:
# Map every sentiment label to its class id with the project helper
# `classes_def`; the helper is passed directly instead of via a lambda.
dataset['class'] = dataset['Sentiment'].apply(classes_def)
# Relative frequency of each class (last expression -> displayed by the cell).
dataset['class'].value_counts(normalize=True)

0    0.429963
2    0.407056
1    0.162981
Name: class, dtype: float64

In [5]:
# Preview the first five rows, including the freshly added `class` column.
dataset.head(n=5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,class
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive,2
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive,2
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative,0
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1


### Text Cleaning

In [6]:
# Raw tweet texts as an array, in the same row order as `dataset`.
tweet_texts = dataset['OriginalTweet'].values
# `clean_tweets` is the project helper from mycode.utils.
preprocessed_tweets = clean_tweets(tweet_texts)

100%|██████████| 3798/3798 [00:01<00:00, 2745.37it/s]


### Converting Text to Numerical Vector

In [7]:
# Terms that occur in fewer than 10 documents are dropped from the vocabulary.
tf_idf_vect = TfidfVectorizer(min_df=10)
# Learn the vocabulary/IDF weights and vectorise the tweets in a single pass.
# NOTE(review): the vectorizer is fit on every tweet, i.e. before the
# train/test split performed further down - confirm this is intended.
final_tf_idf = tf_idf_vect.fit_transform(preprocessed_tweets)
print("The shape of out text TFIDF vectorizer: ", final_tf_idf.shape)

The shape of out text TFIDF vectorizer:  (3798, 1124)


### Select Attributes

In [8]:
# Dense TF-IDF feature matrix built from OriginalTweet.
X = final_tf_idf.toarray()
# Target vector: the class id of every row, in dataset order.
class_labels = dataset["class"].tolist()
y = np.asarray(class_labels)

### Train and Test Split

In [9]:
# Hold out 30% of the rows for testing.
# - random_state pins the shuffle so "Restart & Run All" reproduces the same
#   partition (and therefore comparable scores) on every run.
# - stratify=y keeps the class proportions equal in both partitions; the
#   classes are imbalanced (see the value_counts output above).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

## Stacking

In [10]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold

In [11]:
def _base_estimators():
    """Return fresh, unfitted (name, estimator) pairs for the base classifiers.

    Single source of truth for the hyper-parameters shared by
    `get_stacking` (level-0 learners) and `get_models` (stand-alone
    baselines), so the two can no longer drift apart.

    Compared with the previous copy-pasted definitions, only arguments that
    newer library releases reject or ignore were touched:
      * RandomForestClassifier: `min_impurity_split` is dropped (it was only
        ever passed as None) and `max_features='auto'` is spelled 'sqrt',
        which is what 'auto' meant for classifiers.
      * XGBClassifier: `missing=None`, `nthread=None` and `seed=None` are
        dropped (they were passed as None), and the deprecated `silent=True`
        is expressed as `verbosity=0`.
    """
    return [
        ('rfc', RandomForestClassifier(
            bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=50, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)),
        ('mnb', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)),
        ('xgbc', XGBClassifier(
            base_score=0.5, booster='gbtree', colsample_bylevel=1,
            colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
            max_depth=7, min_child_weight=1, n_estimators=500,
            n_jobs=1, objective='multi:softprob', random_state=0,
            reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
            subsample=1, verbosity=0)),
        ('sgdc', SGDClassifier(class_weight='balanced', penalty='l1')),
    ]


# get a stacking ensemble of models
def get_stacking(n_splits=10):
    """Build an unfitted StackingClassifier over the base estimators.

    Parameters
    ----------
    n_splits : int, default 10
        Number of stratified folds the ensemble uses internally to produce
        the base-model predictions the meta learner is trained on.

    Returns
    -------
    StackingClassifier
        Level 0 is the list from `_base_estimators()`; level 1 (the meta
        learner) is an L1-penalised, class-balanced SGDClassifier.
    """
    cv = StratifiedKFold(n_splits=n_splits)
    # define the base models
    level0 = _base_estimators()
    # define meta learner model
    level1 = SGDClassifier(class_weight='balanced', penalty='l1')
    # define the stacking ensemble
    return StackingClassifier(estimators=level0, final_estimator=level1, cv=cv)


# get a list of models to evaluate
def get_models():
    """Return a dict mapping model name to an unfitted estimator.

    Insertion order is 'rfc', 'mnb', 'xgbc', 'sgdc', 'stacking'. The
    stacking entry is built by `get_stacking()` with its default fold count
    and receives its own fresh base-estimator instances.
    """
    models = dict(_base_estimators())
    models['stacking'] = get_stacking()
    return models

# evaluate a given model using cross-validation
def evaluate_model_train(model, X_, y_, n_splits=10):
    """Cross-validate `model` and print its out-of-fold accuracy.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    X_ : array-like
        Feature matrix.
    y_ : array-like
        Class labels aligned with the rows of `X_`.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    ndarray
        The out-of-fold prediction for every row of `X_`, as returned by
        `cross_val_predict`.
    """
    cv = StratifiedKFold(n_splits=n_splits)
    # n_jobs=-1: folds are fitted in parallel on all available cores.
    predict = cross_val_predict(model, X_, y_, cv=cv, n_jobs=-1)
    # Format the percentage instead of rounding-then-multiplying, which
    # printed binary-float artefacts such as "63.019999999999996%".
    print(f'Training Data CV Score: {accuracy_score(y_, predict) * 100:.2f}%')
    return predict

def evaluate_model_test(model, X_tr, y_tr, X_te, y_te):
    """Fit `model` on the training split and print its test-split accuracy.

    Parameters
    ----------
    model : estimator
        Classifier exposing `fit` and `predict`; it is fitted in place.
    X_tr, y_tr : array-like
        Training features and labels.
    X_te, y_te : array-like
        Held-out features and labels.

    Returns
    -------
    The predictions of the fitted model for `X_te`.
    """
    # Fit model
    model.fit(X_tr, y_tr)
    # Make predictions
    y_pred = model.predict(X_te)
    # Format the percentage instead of rounding-then-multiplying, which can
    # print binary-float artefacts (e.g. "63.019999999999996%").
    print(f'Testing Data Accuracy Score: {accuracy_score(y_te, y_pred) * 100:.2f}%')
    return y_pred
    

### Data Train

In [None]:
# get the models to evaluate
models = get_models()
# Out-of-fold predictions on the training split, one entry per model, in the
# iteration order of `models`. Previously the list was overwritten on every
# iteration, so only the last model's predictions were kept.
results_train = list()
for name, model in models.items():
    print('='*50)
    print(name)
    print('-'*50)
    results_train.append(evaluate_model_train(model, X_train, y_train))

rfc
--------------------------------------------------
Training Data CV Score: 61.17%
mnb
--------------------------------------------------
Training Data CV Score: 60.46%
xgbc
--------------------------------------------------
Training Data CV Score: 63.019999999999996%
sgdc
--------------------------------------------------
Training Data CV Score: 63.73%
stacking
--------------------------------------------------


### Data Test

In [None]:
# Fit each model on the training split and predict the test split.
# `results`, `names` and `times` are parallel lists: entry i holds the
# predictions, the name and the elapsed wall-clock seconds of model i.
results = []
names = []
times = []
for name, model in models.items():
    print('=' * 50, name, '-' * 50, sep='\n')
    start_time = time.time()
    predict = evaluate_model_test(model, X_train, y_train, X_test, y_test)
    results.append(predict)
    names.append(name)
    times.append(time.time() - start_time)
    

## Metrics

In [None]:
def classification_report(y_actual, list_y_predict, target_names, names_models, times):
    """Print a scikit-learn classification report for each prediction set.

    `list_y_predict`, `names_models` and `times` are read as parallel
    sequences: entry i supplies the predictions, the label printed in the
    heading, and the duration printed next to it. Every report compares its
    predictions against the same `y_actual` and is rendered with
    `target_names`.
    """
    for idx, y_predicted in enumerate(list_y_predict):
        print('=' * 50)
        print(names_models[idx], ': ', times[idx], ' seconds.')
        print('-' * 50)
        report = metrics.classification_report(
            y_actual, y_predicted, target_names=target_names)
        print(report)

def plot_save_confussionmatrix(y_actual, list_y_pred, names_models,
                               labels=(0, 1, 2),
                               output_dir='../dataset/images_result/'):
    """Plot, save and show one confusion-matrix heatmap per model.

    Parameters
    ----------
    y_actual : array-like
        True labels every prediction set is compared against.
    list_y_pred : sequence of array-like
        One prediction vector per model.
    names_models : sequence of str
        Model names, parallel to `list_y_pred`; each name is printed and
        used as the PNG file name.
    labels : sequence, default (0, 1, 2)
        Class labels, in the order used for the matrix rows and columns.
        Passing them explicitly keeps the matrix the same shape even when a
        class is missing from a given prediction vector.
    output_dir : str, default '../dataset/images_result/'
        Directory the PNG files are written to; created if it is missing.

    Each model is drawn on its own figure. Previously a single figure was
    cleared once and then reused, so the heatmaps and colorbars of earlier
    models accumulated in the images saved for later ones.
    """
    labels = list(labels)
    os.makedirs(output_dir, exist_ok=True)
    # Global seaborn styling; setting it once before the first plot is enough.
    sns.set(font_scale=1., color_codes=True, palette="deep")

    for i, y_pred in enumerate(list_y_pred):
        name = names_models[i]
        cm = confusion_matrix(y_actual, y_pred, labels=labels)
        cm_df = pd.DataFrame(cm, index=labels, columns=labels)
        print("Accuracy:", name, accuracy_score(y_actual, y_pred))

        fig, ax = plt.subplots()
        sns.heatmap(cm_df, annot=True, annot_kws={"size": 12}, fmt="d",
                    cmap="YlGnBu", ax=ax)
        ax.set_title("Confusion Matrix")
        ax.set_xlabel("Predicted Value")
        ax.set_ylabel("True Value")
        fig.savefig(os.path.join(output_dir, name + '.png'), format='png', dpi=120)
        # Show the figure in the notebook, then release it so figures do not
        # pile up in memory across models.
        plt.show()
        plt.close(fig)

In [None]:
# `results` holds the predictions for X_test, so they must be scored against
# y_test (y_train has a different number of rows).
# Display names have to be strings listed in sorted label order;
# `unique()` yields integers in order of first appearance instead.
class_names = [str(label) for label in sorted(dataset['class'].unique())]
classification_report(y_test, results, class_names, names, times)

In [None]:
# `results` holds the predictions for X_test, so the confusion matrices must
# be computed against y_test (y_train has a different number of rows).
plot_save_confussionmatrix(y_test, results, names)