In [26]:
import sys
sys.path.append('../')
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier

In [3]:
from mycode.utils import classes_def, clean_tweets

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josehuillca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [4]:
file_csv = "../dataset/Corona_NLP_test.csv"
dataset = pd.read_csv(file_csv,encoding="latin")

## Pipeline

### Define output classes

In [5]:
dataset['class'] = dataset['Sentiment'].apply(lambda x:classes_def(x))
dataset["class"].value_counts(normalize= True)

0    0.429963
2    0.407056
1    0.162981
Name: class, dtype: float64

In [6]:
dataset.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,class
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive,2
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive,2
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative,0
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1


### Text Cleaning

In [7]:
preprocessed_tweets = clean_tweets(dataset['OriginalTweet'].values)

100%|██████████| 3798/3798 [00:01<00:00, 2659.46it/s]


### Converting Text to Numerical Vector

In [8]:
tf_idf_vect = TfidfVectorizer(min_df=10)
tf_idf_vect.fit(preprocessed_tweets)
final_tf_idf = tf_idf_vect.transform(preprocessed_tweets)
print("The shape of out text TFIDF vectorizer: ",final_tf_idf.get_shape())

The shape of out text TFIDF vectorizer:  (3798, 1124)


### Select Atrubuttes

In [9]:
X = final_tf_idf  # OriginalTweet
y = dataset["class"].tolist()

### Train and Test Split

In [10]:
#X_train, X_test, y_train, y_test = train_test_split(X.tocsr(), y, test_size= 0.3, stratify=y,  random_state=42)

## Stacking

In [11]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [16]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('rfc', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=50, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)))
    level0.append(('mnb', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)))
    level0.append(('sgdc', SGDClassifier(class_weight='balanced', penalty='l1')))
    level0.append(('xgbc', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
           colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
           max_depth=7, min_child_weight=1, missing=None, n_estimators=500,
           n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
           reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
           silent=True, subsample=1)))
    #level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = SGDClassifier(class_weight='balanced', penalty='l1')
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

# get a list of models to evaluate
def get_models():
    models = dict()
    models['rfc'] = RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=50, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
    models['mnb'] = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
    models['sgdc'] = SGDClassifier(class_weight='balanced', penalty='l1')
    models['xgbc'] = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
           colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
           max_depth=7, min_child_weight=1, missing=None, n_estimators=500,
           n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
           reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
           silent=True, subsample=1)
    #models['bayes'] = GaussianNB()
    models['stacking'] = get_stacking()
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X_, y_, cv=5):
    #cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    predict = cross_val_predict(model, X_, y_, cv=5, n_jobs=-1)
    return predict

In [17]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    predict = evaluate_model(model, X, y)
    results.append(predict)
    names.append(name)

## Metrics

In [24]:
def classification_report(y_actual, list_y_predict, target_names, names_models):
    for i in range(len(list_y_predict)):
        print('='*50)
        print(names_models[i])
        print('-'*50)
        print(metrics.classification_report(y_actual, list_y_predict[i], 
                                    target_names= target_names))

def plot_save_confussionmatrix(y_actual, list_y_pred, names_models):
    for i in range(len(list_y_predict)):
        cm=confusion_matrix(y_actual, list_y_predict[i])
        cm_df=pd.DataFrame(cm,index=[0,1,2],columns=[0,1,2])
        print("Accuracy:",names_models[i], accuracy_score(y_test, y_pred))

        sns.set(font_scale=1.4,color_codes=True,palette="deep")
        sns.heatmap(cm_df,annot=True,annot_kws={"size":16},fmt="d",cmap="YlGnBu")
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted Value")
        plt.ylabel("True Value")

In [25]:
classification_report(y, results, dataset['class'].unique(), names)

rfc
------------------------------
              precision    recall  f1-score   support

           0       0.64      0.65      0.64      1633
           2       0.44      0.47      0.46       619
           1       0.66      0.63      0.65      1546

    accuracy                           0.61      3798
   macro avg       0.58      0.58      0.58      3798
weighted avg       0.62      0.61      0.61      3798

mnb
------------------------------
              precision    recall  f1-score   support

           0       0.61      0.72      0.66      1633
           2       0.53      0.15      0.24       619
           1       0.62      0.68      0.65      1546

    accuracy                           0.61      3798
   macro avg       0.59      0.52      0.52      3798
weighted avg       0.60      0.61      0.59      3798

sgdc
------------------------------
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      1633
           2       0.46

### Curve ROC

In [30]:

fpr, tpr, _ = roc_curve(truth,  pred)
auc = metrics.roc_auc_score(truth,  pred)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

ValueError: multilabel-indicator format is not supported

In [31]:
print(results[0][:10])

['0' '2' '1' '0' '0' '2' '2' '0' '2' '2']
