## Import Libraries

In [106]:
import pandas as pd

import pickle

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.metrics import accuracy_score

from keras import models
from keras import layers
from keras import optimizers

Using TensorFlow backend.


## Split Data into Training & Test Set

In [5]:
## Import saved dataframe using pickle
df = pd.read_pickle('cleaned_labelled_tweets')

In [17]:
df.head()

Unnamed: 0,network,datetime,original_tweet,subject,sentiment,lemmatized_tweets_tokens,lemmatized_tweets_string
0,@VodafoneUK,2019-12-04 08:05:14,@VodafoneUK Plus £2.28 package &amp; posting !...,device,0.0,"[plus, £2.28, package, posting]",plus £2.28 package posting
1,@VodafoneUK,2019-12-04 08:04:05,I have repeatedly asked how to get a refund so...,customer service,-0.3,"[repeatedly, asked, get, refund, use, another,...",repeatedly asked get refund use another provid...
2,@VodafoneUK,2019-12-04 08:01:19,"I have supplied visa details twice, I have bee...",customer service,-0.3,"[supplied, visa, detail, twice, subjected, hor...",supplied visa detail twice subjected horrendou...
3,@VodafoneUK,2019-12-04 07:57:42,@VodafoneIN promised yesterday I’d receive no ...,customer service,-0.25,"[promised, yesterday, receive, call, would, ge...",promised yesterday receive call would get emai...
4,@VodafoneUK,2019-12-04 07:57:16,@VodafoneUK you send texts about rewards - thi...,promotion,-0.155556,"[send, text, reward, morning, lindt, take, app...",send text reward morning lindt take app never ...


In [18]:
X = df['lemmatized_tweets_string']
y = df['subject']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=213)

## Vectorize Data

In [30]:
vectorizer = TfidfVectorizer()

In [54]:
tf_idf_X_train = vectorizer.fit_transform(X_train)

In [55]:
tf_idf_X_test = vectorizer.transform(X_test)

## Model Data

In [64]:
## Function to identify the optimat dataset and parameters for a given classifier and parameter grid
def best_model_parameters_dataset(classifier, param_grid, datasets):
    
    ## Create a list to contain the dataset, optimal parameters, and score for training and test set
    score_parameters = []
    
    ## Create a for loop which iterates through each dataset and identifies the optimal parameters for the given classifier
    for data in datasets:
        
        gs = GridSearchCV(classifier, param_grid, scoring='accuracy', cv=3)
        gs.fit(data['X_train'], data['y_train'])
        y_test_preds = gs.predict(data['X_test'])
        test_score = accuracy_score(y_test_preds, data['y_test'])
        score_parameters.append({'Dataset':data['name'], 'Training Score':round(gs.best_score_,2), 'Test Score': round(test_score,2), 'Parameters':gs.best_params_})
     
    ## Generate a dataframe that contains the optimal parameters for each dataset
    df = pd.DataFrame(score_parameters)
    df.sort_values(by=['Test Score', 'Training Score'], inplace=True, ascending=False)
    
    return df

In [57]:
datasets = [{'name': 'tf_idf','X_train': tf_idf_X_train, 'y_train':y_train, 'X_test': tf_idf_X_test, 'y_test':y_test}]

### K Nearest Neighbours

In [72]:
## Specify the classifier, in this case K nearest neighbours
knn = KNeighborsClassifier()

## Define the parameter grid
knn_param_grid = {'n_neighbors':[5,20,40,50,60],
              'metric': ['manhattan', 'euclidean','minkowski'],
              'weights': ['uniform', 'distance']
             }

In [73]:
best_model_parameters_dataset(knn, knn_param_grid, datasets)

Unnamed: 0,Dataset,Parameters,Test Score,Training Score
0,tf_idf,"{'metric': 'manhattan', 'n_neighbors': 5, 'wei...",0.44,0.4


### Multinomial Naive Bayes Classifier

In [97]:
## Specify the classifier, in this case naive bayes
nb = MultinomialNB()

## Create a parameter grid to identify optimal parameters
nb_param_grid = {'alpha':[0.5,0.8,1]}

In [98]:
best_model_parameters_dataset(nb, nb_param_grid, datasets)

Unnamed: 0,Dataset,Parameters,Test Score,Training Score
0,tf_idf,{'alpha': 0.5},0.66,0.59


### Multinomial Logistic Regression

In [83]:
## Specify the classifier, in this case LogisticRegression()
logreg = LogisticRegression(random_state=55, max_iter=15000, multi_class='multinomial')

## Create a parameter grid to identify optimal parameters
logreg_param_grid = {'C':[1,2,10],
                     'class_weight': ['balanced', None],
                     'solver':['newton-cg', 'sag', 'saga','lbfgs']}

In [84]:
best_model_parameters_dataset(logreg, logreg_param_grid, datasets)



Unnamed: 0,Dataset,Parameters,Test Score,Training Score
0,tf_idf,"{'C': 2, 'class_weight': 'balanced', 'solver':...",0.76,0.71


### Random Forest

In [86]:
## Define the classifier to be used, in this case RandomForestClassifier(), specify a random_state
## so that the results are reproducible
forest = RandomForestClassifier(random_state=55)

## Specify the parameter grid to be assessed
forest_param_grid = {'n_estimators': [4,10,50,75,150],
                    'criterion': ['gini', 'entropy'],
                  'max_depth':[None, 5, 10, 15],
                  'class_weight': ['balanced', None],
                  'bootstrap': [True, False]
             }

In [87]:
best_model_parameters_dataset(forest, forest_param_grid, datasets)

Unnamed: 0,Dataset,Parameters,Test Score,Training Score
0,tf_idf,"{'bootstrap': False, 'class_weight': 'balanced...",0.75,0.68


### Support Vector Machine

In [88]:
## Specify the classification model, in this case a support vector machine
svm = SVC(gamma='auto', random_state=55)

## Specify the parameter grid to be used during the GridSearchCV
svm_param_grid = {'C':[1,5,10],
                'class_weight':['balanced', None]}

In [90]:
best_model_parameters_dataset(svm, svm_param_grid, datasets)

Unnamed: 0,Dataset,Parameters,Test Score,Training Score
0,tf_idf,"{'C': 1, 'class_weight': None}",0.4,0.37


### XG Boost

In [101]:
## Specify the classifier, in this case XG Boost
boost = xgb.XGBClassifier()

## Fit the model using the training data
boost.fit(tf_idf_X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [103]:
## Create predictions for the training and test datasets
boost_preds_train = boost.predict(tf_idf_X_train)
boost_preds_test = boost.predict(tf_idf_X_test)

In [104]:
accuracy_score(y_train, boost_preds_train)

0.7803149606299212

In [105]:
accuracy_score(y_test, boost_preds_test)

0.6992125984251969

### Neural Network

In [108]:
tf_idf_X_train.shape

(2540, 4589)

In [126]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [146]:
le = LabelEncoder()

In [149]:
le.fit(df['subject'])
tweet_category = le.transform(df['subject'])

In [150]:
tweet_category

array([3, 2, 2, ..., 6, 6, 4])

In [151]:
tweet_onehot = to_categorical(tweet_category)

In [152]:
tweet_onehot.shape

(3175, 7)

In [154]:
tweets = df['lemmatized_tweets_string']

In [155]:
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(tweets)

In [156]:
tokenized_X_train = tokenizer.texts_to_matrix(tweets,mode='binary')

In [157]:
tokenized_X_train.shape

(3175, 2000)

In [158]:
model = models.Sequential()

In [159]:
model.add(layers.Dense(50, activation='relu', input_shape=(2000,))) #2 hidden layers
model.add(layers.Dense(25, activation='relu'))
model.add(layers.Dense(7, activation='softmax'))

In [160]:
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [161]:
history = model.fit(tokenized_X_train,
                    tweet_onehot,
                    epochs=120,
                    batch_size=256)

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120
