# Building and Evaluating Models

## Import Libraries

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('seaborn')

import joblib
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.metrics import accuracy_score, auc, confusion_matrix, classification_report

from cf_matrix import make_confusion_matrix

import warnings
warnings.filterwarnings("ignore")

print('All Libraries Imported')

All Libraries Imported


## Import Datasets

The datasets for all six classes of tweets are imported for modeling. 

In [83]:
# Load the combined tweet dataset and renumber its rows 0..n-1, discarding the
# index that was stored in the pickle.
# NOTE(review): unpickling can execute arbitrary code, so 'all_data.pkl' must
# come from a trusted source.
all_data = pd.read_pickle('all_data.pkl').reset_index(drop=True)

In [84]:
# Preview the first rows, the per-column dtypes / non-null counts, and summary statistics.
display(all_data.head())
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" under the report.
all_data.info()
display(round(all_data.describe(), 2))

Unnamed: 0,Class,Created At,Lemmatized,Sentiment,Sentiment_TextBlob,Sentiment_VADER,Subjectivity,Tweet_Strings,Tweets,cleaned_tweets
0,Luxury,2019-12-13 14:47:03,"[polo, anyone, world, snow, polo, championship...",1,0.0,0.4404,0.0,polo anyone world snow polo championship held ...,Polo anyone?\r\n#StRegis World Snow #Polo Cham...,"[polo, anyone, world, snow, polo, championship..."
1,Luxury,2019-12-13 13:05:34,"[fantastic, night, celebrating, th, anniversary]",1,0.4,0.8074,0.9,fantastic night celebrating th anniversary,Fantastic night at the @TheStRegisMC celebrati...,"[fantastic, night, celebrating, th, anniversary]"
2,Luxury,2019-12-13 09:16:46,"[exceptional, tropical, sunshine, ensures, ult...",1,0.413333,0.8689,0.8,exceptional tropical sunshine ensures ultimate...,It's the exceptional tropical sunshine that en...,"[exceptional, tropical, sunshine, ensures, ult..."
3,Luxury,2019-12-13 04:05:43,"[birthday, sagittarius, aspen, ilovemylife, ha...",1,0.0,0.0,0.0,birthday sagittarius aspen ilovemylife happybi...,This is 38. 🎂🏹👀😬🍺🍰💩 @stregisaspen #birthday #🎂...,"[birthday, sagittarius, aspen, ilovemylife, ha..."
4,Luxury,2019-12-13 00:14:04,"[stunning, overwater, bar, see, today, called,...",1,0.341667,0.9359,0.5,stunning overwater bar see today called whale ...,Stunning overwater bar you will see today! It'...,"[stunning, overwater, bar, see, today, called,..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19936 entries, 0 to 19935
Data columns (total 10 columns):
Class                 19936 non-null object
Created At            19936 non-null object
Lemmatized            19936 non-null object
Sentiment             19936 non-null int64
Sentiment_TextBlob    19936 non-null float64
Sentiment_VADER       19936 non-null float64
Subjectivity          19936 non-null float64
Tweet_Strings         19936 non-null object
Tweets                19936 non-null object
cleaned_tweets        19936 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 1.5+ MB


None

Unnamed: 0,Sentiment,Sentiment_TextBlob,Sentiment_VADER,Subjectivity
count,19936.0,19936.0,19936.0,19936.0
mean,0.87,0.18,0.29,0.35
std,0.33,0.31,0.43,0.32
min,0.0,-1.0,-0.97,0.0
25%,1.0,0.0,0.0,0.0
50%,1.0,0.05,0.36,0.35
75%,1.0,0.35,0.64,0.6
max,1.0,1.0,0.99,1.0


In [85]:
#missing data
total = all_data.isnull().sum().sort_values(ascending=False)
percent = (all_data.isnull().sum()/all_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
cleaned_tweets,0,0.0
Tweets,0,0.0
Tweet_Strings,0,0.0
Subjectivity,0,0.0
Sentiment_VADER,0,0.0
Sentiment_TextBlob,0,0.0
Sentiment,0,0.0
Lemmatized,0,0.0
Created At,0,0.0
Class,0,0.0


## Defining Useful Functions

These functions are used to create models and gather metrics on the classification models. 

In [86]:
def fit_best_model(classifier, param_grid, dataset, cv=5, verbose=5):
    """
    Grid-search a classifier on one or more datasets and tabulate the results.

    For every dataset a new GridSearchCV (scored on accuracy) is fitted on the
    training split, and the fitted search is then used to predict the test split.

    INPUTS:
    classifier  = The estimator handed to GridSearchCV.
    param_grid  = The parameter grid searched by GridSearchCV.
    dataset     = A dataset dict, or an iterable of such dicts. Each dict must provide
                  the keys 'name', 'X_train', 'y_train', 'X_test' and 'y_test'.
    cv          = Cross-validation setting forwarded to GridSearchCV (default 5).
    verbose     = Verbosity forwarded to GridSearchCV (default 5).

    RETURNS:
    DataFrame with one row per dataset, sorted best-first by 'Test Score' and then
    by 'Best Training Score', with the columns:
    Dataset             = The 'name' of the dataset the row was fitted on.
    Best Training Score = The search's best_score_ on the training data, rounded to 2 decimals.
    Test Score          = Accuracy of the search's predictions on the test data, rounded to 4 decimals.
    Best Parameters     = The search's best_params_.

    SIDE EFFECTS:
    Sets the pandas option 'display.max_colwidth' to 2000 for the whole session.
    """
    # Iterate over the argument that was passed in rather than a notebook-level
    # global; a single dataset dict is accepted as shorthand for a one-element list.
    dataset_list = [dataset] if isinstance(dataset, dict) else list(dataset)

    results = []
    for data in dataset_list:
        gs = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=cv,
                          scoring='accuracy', verbose=verbose)
        gs.fit(data['X_train'], data['y_train'])

        y_pred_test = gs.predict(data['X_test'])
        results.append({
            'Dataset': data['name'],
            'Best Training Score': round(gs.best_score_, 2),
            'Test Score': round(accuracy_score(data['y_test'], y_pred_test), 4),
            'Best Parameters': gs.best_params_,
        })

    # Widen column display so the 'Best Parameters' dicts are not truncated when shown.
    pd.set_option('display.max_colwidth', 2000)

    # Naming the columns explicitly keeps the sort below valid even when no
    # datasets were supplied and `results` is empty.
    columns = ['Dataset', 'Best Training Score', 'Test Score', 'Best Parameters']
    df = pd.DataFrame(results, columns=columns)
    df = df.sort_values(by=['Test Score', 'Best Training Score'], ascending=False)

    return df

In [87]:
def plot_feature_importances(model, feature_names=None):
    """
    Draw a horizontal bar chart of a fitted model's feature importances.

    INPUTS:
    model          = Fitted estimator exposing a feature_importances_ attribute.
    feature_names  = Optional sequence of tick labels, one per importance value.
                     When omitted, the columns of the notebook-level X_train are used
                     if it has a matching number of columns (the original behaviour);
                     otherwise the bars are labelled by position 0..n-1.

    RETURNS:
    None. The chart is drawn on a new 14x10 matplotlib figure.

    RAISES:
    ValueError if feature_names is given but its length differs from the number
    of importances.
    """
    importances = model.feature_importances_
    # Take the bar count from the model itself so it always matches the importances.
    n_features = len(importances)

    if feature_names is None:
        # X_train is a Series of tweet text in this notebook and has no .columns,
        # so only fall back to it when it is frame-like and the sizes agree.
        fallback_columns = getattr(globals().get('X_train'), 'columns', None)
        if fallback_columns is not None and len(fallback_columns) == n_features:
            feature_names = fallback_columns.values
        else:
            feature_names = np.arange(n_features)
    elif len(feature_names) != n_features:
        raise ValueError(
            "feature_names has %d entries but the model reports %d importances"
            % (len(feature_names), n_features)
        )

    plt.figure(figsize=(14,10))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Features")

In [88]:
def feature_importances(model):
    """
    Plot the feature importances of a model as a horizontal bar chart.

    Reads the notebook-level tf_idf_X_train (for the number of features) and
    tf_idf_df_train (for the tick labels), so both must exist before the call.

    INPUTS:
    model  = Fitted estimator exposing a feature_importances_ attribute.

    RETURNS:
    None. The chart is drawn on a new 8x8 matplotlib figure.
    """
    feature_count = tf_idf_X_train.shape[1]
    bar_positions = range(feature_count)

    plt.figure(figsize=(8, 8))
    plt.barh(bar_positions, model.feature_importances_, align='center')

    # Tick labels are the column labels of the TF-IDF training frame.
    tick_labels = tf_idf_df_train.columns.values
    plt.yticks(np.arange(feature_count), tick_labels)

    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

In [89]:
def plot_roc_curve(fpr, tpr):
    """
    Plot a ROC curve with its area under the curve (AUC) shown in the legend.

    INPUTS:
    fpr  = False positive rates, plotted on the x axis.
    tpr  = True positive rates, plotted on the y axis (same length as fpr).

    RETURNS:
    None. The figure is drawn and shown with plt.show().

    SIDE EFFECTS:
    Applies the seaborn "darkgrid" style, which stays active for later plots.
    """
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})

    plt.figure(figsize=(10,6))
    lw = 2
    # Keep the score under its own name: assigning to `auc` inside this function
    # would make `auc` a local variable, and calling it on the right-hand side
    # would raise UnboundLocalError instead of reaching sklearn.metrics.auc.
    auc_score = round(auc(fpr, tpr), 2)
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='AUC = %0.2f' % auc_score)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    # Ticks every 0.05 on both axes.
    plt.yticks([i/20.0 for i in range(21)])
    plt.xticks([i/20.0 for i in range(21)])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

# Creating Datasets

## Split Datasets into Training and Validation Sets

To check that the models generalize rather than overfit, the data is split into a training set used to fit the models and a held-out test set used to evaluate them. The `train_test_split()` function is applied to the combined data covering all six classes of tweets.

In [90]:
# Features are the 'Tweet_Strings' text column; the target is the tweet's 'Class' label.
X, y = all_data['Tweet_Strings'], all_data['Class']

In [91]:
# Hold out 20% of the tweets as the test set; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified, and the class supports printed in the
# reports further down are heavily imbalanced (e.g. only 16 'Select' tweets in the
# test set) — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

## Count Vectorizer

Since most algorithms cannot interpret text, the tweets are converted into an array of numbers. This process is called vectorization.

In [92]:
# Bag-of-words features: the vocabulary is learned from the training tweets only,
# then re-used to encode the test tweets.
count_vec = CountVectorizer()
bow_X_train = count_vec.fit_transform(X_train).toarray()
bow_X_test = count_vec.transform(X_test)
# Wrap the vectorized counts (not the raw tweet text in X_train) in a DataFrame,
# mirroring how tf_idf_df_train is built from tf_idf_X_train.
bow_df_train = pd.DataFrame(bow_X_train)

## TF_IDF Vectorizer

In [93]:
# TF-IDF features: the vectorizer is fitted on the training tweets only and then
# re-used to transform the test tweets.
vectorizer = TfidfVectorizer()
# The training matrix is densified with .toarray(); the test matrix on the next
# line is kept exactly as transform() returns it.
tf_idf_X_train = vectorizer.fit_transform(X_train).toarray()
tf_idf_X_test = vectorizer.transform(X_test)
# No column names are passed, so the frame's columns are the default integer labels.
tf_idf_df_train = pd.DataFrame(tf_idf_X_train)

In [94]:
# Bundle each vectorization with the shared labels, in the layout that
# fit_best_model reads: name / X_train / y_train / X_test / y_test.
tf_idf = dict(
    name='tf_idf',
    X_train=tf_idf_X_train,
    y_train=y_train,
    X_test=tf_idf_X_test,
    y_test=y_test,
)
bag_of_words = dict(
    name='bag_of_words',
    X_train=bow_X_train,
    y_train=y_train,
    X_test=bow_X_test,
    y_test=y_test,
)

In [95]:
# Every grid search below is run once per entry in this list.
datasets = [tf_idf, bag_of_words]

# Building Classification Models

Three classification models have been chosen:

- Multinomial Naïve Bayes - This version of Naive Bayes, created specially for text, explicitly models the word counts and adjusts the underlying calculations.

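- Logistic Regression - Logistic regression is used to describe data and to explain the relationship between one categorical dependent variable and one or more nominal, ordinal, interval or ratio-level independent variables. The classic form models a binary outcome; the multinomial variant is used here because there are six tweet classes.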

- Random Forest - The Random Forest algorithm is an ensemble of Decision Trees. A random forest builds decision trees on randomly selected data samples, gets a prediction from each tree and selects the final class by majority vote.

GridSearchCV will be used to tune the models by finding the best-performing parameters.

## Multinomial Naïve Bayes

In [16]:
# Base estimator and search grid for Multinomial Naive Bayes.
naive = MultinomialNB()
naive_param_grid = {
    'alpha': [1, 0.8, 0.5, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'fit_prior': [True, False],
}

In [None]:
# Grid-search Naive Bayes; the returned results table is shown as the cell output.
fit_best_model(naive, naive_param_grid, datasets)

### Best Model Parameters - Multinomial Naïve Bayes

Final model constructed with best parameters for Multinomial Naïve Bayes.

* **Alpha**: 0.1
* **Fit_Prior**: True
* **Dataset**: TF_IDF

In [17]:
# Final Naive Bayes model, fitted on the TF-IDF training matrix with the
# parameters listed in the markdown above.
nb = MultinomialNB(alpha = 0.1, fit_prior = True)

nb.fit(tf_idf_X_train, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [18]:
# Predict on both splits so training and test metrics can be compared.
nb_train_preds = nb.predict(tf_idf_X_train)
nb_preds = nb.predict(tf_idf_X_test)

In [19]:
# Training-set confusion matrix and per-class precision / recall / F1.
print('Training Data - Multinomial Naïve Bayes\n')
print(confusion_matrix(y_train, nb_train_preds))
print(classification_report(y_train, nb_train_preds))

Training Data - Multinomial Naïve Bayes

[[  537   127     0   279     0     0]
 [   21  2477     0   320    30     0]
 [    0    20   504   309     0     0]
 [    2   149    25 10561     4     1]
 [   24   141     0   120   224     0]
 [    0     1     1    58     0    13]]
              precision    recall  f1-score   support

 Booking.com       0.92      0.57      0.70       943
     Expedia       0.85      0.87      0.86      2848
      Luxury       0.95      0.61      0.74       833
     Premium       0.91      0.98      0.94     10742
   Priceline       0.87      0.44      0.58       509
      Select       0.93      0.18      0.30        73

    accuracy                           0.90     15948
   macro avg       0.90      0.61      0.69     15948
weighted avg       0.90      0.90      0.89     15948



In [20]:
# Same report on the held-out test set.
print('Testing Data - Multinomial Naïve Bayes\n')
print(confusion_matrix(y_test, nb_preds))
print(classification_report(y_test, nb_preds))

Testing Data - Multinomial Naïve Bayes

[[  69   50    0   97    0    0]
 [   9  503    0  210    8    0]
 [   0    7   42  133    0    0]
 [   4   80    9 2640    2    0]
 [   5   36    0   39   29    0]
 [   0    1    0   15    0    0]]
              precision    recall  f1-score   support

 Booking.com       0.79      0.32      0.46       216
     Expedia       0.74      0.69      0.71       730
      Luxury       0.82      0.23      0.36       182
     Premium       0.84      0.97      0.90      2735
   Priceline       0.74      0.27      0.39       109
      Select       0.00      0.00      0.00        16

    accuracy                           0.82      3988
   macro avg       0.66      0.41      0.47      3988
weighted avg       0.81      0.82      0.80      3988



In [21]:
# Save the fitted Naive Bayes model to 'nb.pkl'.
# NOTE(review): the fitted TfidfVectorizer is not saved anywhere in this notebook.
joblib.dump(nb, 'nb.pkl')

['nb.pkl']

## Logistic Regression

In [29]:
# Base estimator and search grid for multinomial logistic regression.
log_reg = LogisticRegression(random_state=42, multi_class='multinomial')
log_param_grid = {
    'C': [0.01, 0.1, 1, 2, 10],
    'solver': ['lbfgs', 'saga', 'newton-cg', 'sag'],
    'class_weight': ['balanced', None],
}

In [None]:
# Grid-search logistic regression; the returned results table is shown as the cell output.
fit_best_model(log_reg, log_param_grid, datasets)

### Best Model Parameters - Logistic Regression

Final model constructed with best parameters for Multinomial Logistic Regression

* **C**: 2
* **Solver**: Newton-cg
* **Class_Weight**: None
* **Dataset**: TF_IDF

In [30]:
# Final logistic regression model, fitted on the TF-IDF training matrix with the
# parameters listed in the markdown above. This rebinds the `log_reg` name that
# held the grid-search base estimator.
log_reg = LogisticRegression(C= 2, class_weight= None , solver= 'newton-cg', random_state=42, multi_class = 'multinomial')

log_reg.fit(tf_idf_X_train, y_train)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Predict on both splits so training and test metrics can be compared.
log_train_preds = log_reg.predict(tf_idf_X_train)
log_preds = log_reg.predict(tf_idf_X_test)

In [32]:
# Training-set confusion matrix and per-class precision / recall / F1.
print('Training Data - Multinomial Logistic Regression\n')
print(confusion_matrix(y_train, log_train_preds))
print(classification_report(y_train, log_train_preds))

Training Data - Multinomial Logistic Regression

[[  526    92     1   311    13     0]
 [   19  2410     8   393    18     0]
 [    2    10   368   453     0     0]
 [    5    78     9 10648     2     0]
 [   17   133     2   151   206     0]
 [    0     0     1    64     1     7]]
              precision    recall  f1-score   support

 Booking.com       0.92      0.56      0.70       943
     Expedia       0.89      0.85      0.87      2848
      Luxury       0.95      0.44      0.60       833
     Premium       0.89      0.99      0.94     10742
   Priceline       0.86      0.40      0.55       509
      Select       1.00      0.10      0.17        73

    accuracy                           0.89     15948
   macro avg       0.92      0.56      0.64     15948
weighted avg       0.89      0.89      0.88     15948



In [33]:
# Same report on the held-out test set.
print('Testing Data - Multinomial Logistic Regression\n')
print(confusion_matrix(y_test, log_preds))
print(classification_report(y_test, log_preds))

Testing Data - Multinomial Logistic Regression

[[  86   32    1   93    4    0]
 [  19  480    0  222    9    0]
 [   1    9   36  136    0    0]
 [   3   59    6 2664    3    0]
 [   6   33    0   41   29    0]
 [   0    1    0   15    0    0]]
              precision    recall  f1-score   support

 Booking.com       0.75      0.40      0.52       216
     Expedia       0.78      0.66      0.71       730
      Luxury       0.84      0.20      0.32       182
     Premium       0.84      0.97      0.90      2735
   Priceline       0.64      0.27      0.38       109
      Select       0.00      0.00      0.00        16

    accuracy                           0.83      3988
   macro avg       0.64      0.42      0.47      3988
weighted avg       0.82      0.83      0.80      3988



In [34]:
# Save the fitted logistic regression model to 'log_reg.pkl'.
joblib.dump(log_reg, 'log_reg.pkl')

['log_reg.pkl']

## Random Forest

In [None]:
# Base estimator and search grid for the random forest.
rf = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 5, 10],
    'class_weight': ['balanced', None],
    'bootstrap': [True, False],
}

In [None]:
# Grid-search the random forest; the returned results table is shown as the cell output.
fit_best_model(rf, rf_param_grid, datasets)

### Best Model Parameters - Random Forest

Final model constructed with best parameters for Random Forest.

* **Bootstrap**: True
* **Class Weight**: None
* **Criterion**: Gini
* **Max Depth**: None
* **Number of Estimators**: 50
* **Dataset**: TF_IDF

In [35]:
# Final random forest, fitted on the TF-IDF training matrix with the parameters
# listed in the markdown above. This rebinds the `rf` name that held the
# grid-search base estimator.
rf = RandomForestClassifier(random_state=42, bootstrap=True, class_weight=None, criterion='gini', max_depth=None, n_estimators=50)

rf.fit(tf_idf_X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [36]:
# Predict on both splits so training and test metrics can be compared.
rf_train_preds = rf.predict(tf_idf_X_train)
rf_preds = rf.predict(tf_idf_X_test)

In [37]:
# Training-set confusion matrix and per-class precision / recall / F1.
print('Training Data - Random Forest\n')
print(confusion_matrix(y_train, rf_train_preds))
print(classification_report(y_train, rf_train_preds))

Training Data - Random Forest

[[  888    28     0     5    22     0]
 [   35  2760     5    18    30     0]
 [    8     2   795    28     0     0]
 [   54    17    16 10650     1     4]
 [   14    24     0     4   467     0]
 [    0     0     0    12     0    61]]
              precision    recall  f1-score   support

 Booking.com       0.89      0.94      0.91       943
     Expedia       0.97      0.97      0.97      2848
      Luxury       0.97      0.95      0.96       833
     Premium       0.99      0.99      0.99     10742
   Priceline       0.90      0.92      0.91       509
      Select       0.94      0.84      0.88        73

    accuracy                           0.98     15948
   macro avg       0.94      0.93      0.94     15948
weighted avg       0.98      0.98      0.98     15948



In [38]:
# Same report on the held-out test set.
print('Testing Data - Random Forest\n')
print(confusion_matrix(y_test, rf_preds))
print(classification_report(y_test, rf_preds))

Testing Data - Random Forest

[[ 117   34    1   60    4    0]
 [  53  454    5  206   12    0]
 [   1   12   51  115    1    2]
 [  68   68   20 2569    3    7]
 [  10   45    0   26   28    0]
 [   0    0    0   13    0    3]]
              precision    recall  f1-score   support

 Booking.com       0.47      0.54      0.50       216
     Expedia       0.74      0.62      0.68       730
      Luxury       0.66      0.28      0.39       182
     Premium       0.86      0.94      0.90      2735
   Priceline       0.58      0.26      0.36       109
      Select       0.25      0.19      0.21        16

    accuracy                           0.81      3988
   macro avg       0.59      0.47      0.51      3988
weighted avg       0.80      0.81      0.80      3988



In [39]:
# Save the fitted random forest to 'rf.pkl'.
joblib.dump(rf, 'rf.pkl')

['rf.pkl']

## Comparing Models

In [40]:
# Parallel lists: model_names[i] is the display name of models[i], so the two
# must be kept in the same order.
models = [nb, rf ,log_reg]
model_names = ['Multinomial Naïve Bayes','Random Forest','Multinomial Logistic Regression']

In [41]:
# Pick the fitted model with the highest accuracy on the TF-IDF test set.
# On ties the earliest model in `models` wins, because the comparison is strict.
best_acc = 0.0
best_clf = 0
best_pipe = ''
for index, val in enumerate(models):
    # Score each model once: every .score() call re-predicts the whole test set.
    acc = val.score(tf_idf_X_test, y_test)
    if acc > best_acc:
        best_acc = acc
        best_pipe = val
        best_clf = index
print('Classifier with best accuracy: %s' % model_names[best_clf])

Classifier with best accuracy: Multinomial Logistic Regression


The classifier with the best accuracy is **Multinomial Logistic Regression** at **83%**. It will be used for interpretation. 

In [45]:
# Print the fitted logistic regression coefficients.
# NOTE(review): [:20] slices the first axis of coef_, and the printed output has
# only 6 rows, so the slice is a no-op here. If the first 20 coefficients per
# class were intended, that would be coef_[:, :20] — confirm the intent.
print(log_reg.coef_[:20])

[[-0.21687636 -0.09190841 -0.17157429 ... -0.11389845 -0.03006101
  -0.02046255]
 [-0.72196841  0.29761311  0.44871866 ... -0.04551712 -0.05790089
  -0.02768475]
 [-0.13060844 -0.09007998 -0.0329394  ... -0.02977021 -0.02492691
  -0.01993557]
 [ 1.38385824 -0.01962936 -0.21720199 ...  0.23690397  0.14433794
   0.07672821]
 [-0.29213407 -0.083384   -0.02038606 ... -0.03777292 -0.02750856
  -0.00631981]
 [-0.02227096 -0.01261137 -0.00661694 ... -0.00994527 -0.00394056
  -0.00232553]]


In [97]:
#Pickle X_test and y_test as binary
pickle.dump(X_test, open( "X_test.pkl", "wb" ))
pickle.dump(y_test, open( "y_test.pkl", "wb" ))
pickle.dump(tf_idf_X_test, open("tf_idf_X_test.pkl", "wb"))

# Future Work

1. Gather More Tweets
2. Topic Modeling with Latent Dirichlet Allocation (LDA)
3. Different Algorithms
    * KNN
    * XGBoost
    * Support Vector Machines
    * Neural Networks
        * RNN/LSTM