# BT5153 TOPICS IN BUSINESS ANALYTICS - Group Project 
# Understanding Customer in the Airline Industry on Social Media
# Group 7 - Team NLP
Zhang Kang En	A0186050L 

Chee Wai Kin Simon	A0186100U

Toh Jing Xiang Joshua	A0186795E

Jesisca Tandi	A0185994E

Su Yixi Jessie	A0054353L


In [1]:
# Load libraries
import numpy as np
import pandas as pd
import glob, random, spacy, re, inspect, pickle, os
from textblob import TextBlob
from random import sample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline, make_union
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, LSTM, Activation, Flatten, Conv1D, GlobalMaxPooling1D, Dropout, Concatenate
from keras.utils import to_categorical, np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
from IPython.core.display import display, HTML
from gensim.models import KeyedVectors
# Inject CSS so the notebook container uses 98% of the browser width.
display(HTML("<style>.container { width:98% !important; }</style>"))
# NOTE(review): -1 is presumably the "no truncation" value of the pandas release this
# notebook was written for; newer pandas releases expect None here -- confirm against
# the installed version.
pd.set_option('display.max_colwidth', -1)
# spaCy pipeline used by wordEmbFun_Convert for lemmas and coarse POS tags.
nlp = spacy.load('en_core_web_sm')


Using TensorFlow backend.


## Load annotated data

In [2]:
# Load manually labelled data
# path = 'labelled tweets/' # Path to the annotated data
path = 'annotated/' # Path to the annotated data

# Sort the matches: glob returns files in arbitrary, OS-dependent order, and the row
# order of the concatenated frame determines which rows the seeded train/test split picks.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail early with a readable message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No annotated CSV files found in '{}'".format(path))
li = []

# Binary annotation columns: t_* = topic, f_* = feedback type, s_* = sentiment.
y_variables = ['t_luggages','t_flight_delays', 't_flight_bookings', 't_club', 't_customer_service','t_in-flight','t_seatings',
               'f_enquiry','f_compliment', 'f_complaint', 'f_info', 
               's_positive', 's_negative', 's_neutral']
allvars =  ['text', 'emoji'] + y_variables

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    df = df[allvars]  # keep only the shared columns so every file concatenates cleanly
    li.append(df)

labelled_tweets = pd.concat(li, axis=0, ignore_index=True)
# Missing annotations (and missing emoji) become 0, which processEmoji treats as "no emoji".
labelled_tweets = labelled_tweets.fillna(0)
print('Total annotated data ',labelled_tweets.shape[0])


# Collapse the f_* indicator columns into one 'feedback' label.
# np.select takes the FIRST matching condition, so the priority is
# enquiry > compliment > complaint; everything else (including f_info) becomes 'info'.
labelled_tweets['feedback'] = np.select(
    [labelled_tweets['f_enquiry'] == 1,
     labelled_tweets['f_compliment'] == 1,
     labelled_tweets['f_complaint'] == 1],
    ['enquiry', 'compliment', 'complaint'],
    default='info')
feedbackLabels = ['complaint','compliment', 'enquiry', 'info']


# Collapse the t_* indicator columns into one 'topic' label; the first flagged topic
# in the order below wins, and tweets without any topic flag become 'others'.
topicPriority = ['t_luggages', 't_flight_delays', 't_flight_bookings', 't_club',
                 't_customer_service', 't_in-flight', 't_seatings']
labelled_tweets['topic'] = np.select(
    [labelled_tweets[col] == 1 for col in topicPriority],
    topicPriority,
    default='others')
topicLabels = ['others', 't_club', 't_customer_service', 't_flight_bookings', 't_flight_delays', 't_in-flight', 't_luggages', 't_seatings']

# Collapse the s_* indicator columns into one 'sentiment' label
# (positive > negative; everything else, including s_neutral, becomes 'neutral').
labelled_tweets['sentiment'] = np.select(
    [labelled_tweets['s_positive'] == 1,
     labelled_tweets['s_negative'] == 1],
    ['positive', 'negative'],
    default='neutral')
sentimentLabels = ['negative', 'neutral', 'positive']


display(pd.Series(labelled_tweets['topic']).value_counts())
display(pd.Series(labelled_tweets['feedback']).value_counts())
display(pd.Series(labelled_tweets['sentiment']).value_counts())

# Folder to save the best model
outFolder = 'bestModels'
os.makedirs(outFolder, exist_ok=True)

Total annotated data  3000


others                1664
t_flight_bookings     318 
t_customer_service    296 
t_flight_delays       288 
t_in-flight           177 
t_luggages            138 
t_club                64  
t_seatings            55  
Name: topic, dtype: int64

info          1566
complaint     775 
enquiry       384 
compliment    275 
Name: feedback, dtype: int64

neutral     1734
negative    910 
positive    356 
Name: sentiment, dtype: int64

### Train-test split

In [3]:
# Train-test split (80-20), stratified on the topic label with a fixed seed.
(X_train_withEmoji,
 X_test_withEmoji,
 labelled_tweets_Y_train,
 labelled_tweets_Y_test) = train_test_split(
    labelled_tweets[['text','emoji']],
    labelled_tweets[['topic', 'feedback', 'sentiment']],
    test_size=0.2,
    stratify=labelled_tweets[['topic']],
    random_state=10)

# Text-only views for the models that do not use the emoji column.
X_train = X_train_withEmoji['text']
X_test = X_test_withEmoji['text']

# Report the size and the class balance of every target in each partition.
print('\nTrain size =', X_train.shape[0])
for _target in ['topic', 'feedback', 'sentiment']:
    display(pd.Series(labelled_tweets_Y_train[_target]).value_counts())

print('\nTest size =', X_test.shape[0])
for _target in ['topic', 'feedback', 'sentiment']:
    display(pd.Series(labelled_tweets_Y_test[_target]).value_counts())



Train size = 2400


others                1331
t_flight_bookings     255 
t_customer_service    237 
t_flight_delays       230 
t_in-flight           142 
t_luggages            110 
t_club                51  
t_seatings            44  
Name: topic, dtype: int64

info          1262
complaint     609 
enquiry       300 
compliment    229 
Name: feedback, dtype: int64

neutral     1398
negative    712 
positive    290 
Name: sentiment, dtype: int64


Test size = 600


others                333
t_flight_bookings     63 
t_customer_service    59 
t_flight_delays       58 
t_in-flight           35 
t_luggages            28 
t_club                13 
t_seatings            11 
Name: topic, dtype: int64

info          304
complaint     166
enquiry       84 
compliment    46 
Name: feedback, dtype: int64

neutral     336
negative    198
positive    66 
Name: sentiment, dtype: int64

In [4]:
# Parameters setting
# Scoring string passed as GridSearchCV(scoring=...) by the model functions below.
evaluationMethod = 'f1_macro'
# Metric name passed to Keras model.compile(metrics=[...]) by createNNModel / createCNNModel.
evaluationMethod_nn = 'accuracy'

## Create functions for feature extraction and models

In [5]:
# General function for dense NN
def createNNModel(inpLayer, outLayer, n_neuron=[10], rate=0.8):
    """Build and compile a dense feed-forward classifier.

    Each entry of ``n_neuron`` adds a Dropout layer followed by a ReLU Dense layer of
    that width; a softmax Dense layer of size ``outLayer`` closes the network.

    Parameters
    ----------
    inpLayer : int
        Number of input features.
    outLayer : int
        Number of output classes.
    n_neuron : list of int
        Width of each hidden layer, in order. The default list is never mutated.
    rate : float
        Dropout rate applied in front of every hidden layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model (adam, categorical cross-entropy, metric
    taken from the module-level ``evaluationMethod_nn``).
    """
    model = Sequential()
    layerBefore = inpLayer
    for width in n_neuron:
        model.add(Dropout(rate=rate, input_shape=(layerBefore,)))
        model.add(Dense(width, activation='relu'))
        # The next block's input size is the width just added. The original assigned the
        # whole `n_neuron` list here, yielding a malformed input_shape for 2+ hidden layers.
        layerBefore = width
    model.add(Dense(outLayer, activation="softmax"))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[evaluationMethod_nn])
    return model

class kerasDenseNNWrapper():
    """Minimal estimator-style wrapper around ``createNNModel``.

    Exposes ``set_params`` / ``fit`` / ``predict`` so the dense network can be the last
    step of a scikit-learn pipeline. Labels are converted to one-hot vectors with the
    label encoder supplied as ``dummyEnc``; predictions are mapped back through
    ``dummyEnc.classes_``.
    """

    def __init__(self, dummyEnc=None, n_neuron=[10], rate=0.8, validation_split=0.2, epochs=10, batch_size=50, verbose=0):
        # Store every constructor argument under its own name (explicit assignments
        # replace the previous frame-introspection loop; the resulting attributes are the same).
        self.dummyEnc = dummyEnc
        self.n_neuron = n_neuron
        self.rate = rate
        self.validation_split = validation_split
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose

    def set_params(self, **params):
        """Set attributes from keyword arguments and return ``self``.

        Keys of the form ``prefix__name`` set the attribute ``name``; plain keys set
        the attribute of the same name.
        """
        if not params:
            return self
        for key, value in params.items():
            key, delim, sub_key = key.partition('__')
            if delim:
                setattr(self, sub_key, value)
            else:
                setattr(self, key, value)
        return self

    def fit(self, X, y):
        """Build a fresh network sized to ``X`` and train it on one-hot encoded ``y``.

        Raises
        ------
        ValueError
            If no label encoder was supplied (previously this surfaced as an
            unbound-variable error further down).
        """
        if self.dummyEnc is None:
            raise ValueError("kerasDenseNNWrapper.fit requires a fitted label encoder in 'dummyEnc'")

        y_enc = self.dummyEnc.transform(y)
        # Size the one-hot matrix from the encoder, not from the labels present in this
        # call, so the output layer always lines up with dummyEnc.classes_ in predict().
        y_dummy = np_utils.to_categorical(y_enc, num_classes=len(self.dummyEnc.classes_))

        inpLayer = X.shape[1]
        outLayer = y_dummy.shape[1]
        self.model = createNNModel(inpLayer, outLayer, n_neuron=self.n_neuron, rate=self.rate)
        self.model.fit(X, y_dummy, validation_split=self.validation_split, epochs=self.epochs, batch_size=self.batch_size, verbose=self.verbose)
        return self

    def predict(self, X):
        """Return the class label with the highest predicted probability for each row."""
        predicted_Y = self.model.predict(X)
        predicted_class = self.dummyEnc.classes_[np.argmax(predicted_Y, axis=1)]
        return predicted_class

In [6]:
# General function for CNN
def createCNNModel(sequenceLength, outLayer, vocabsize=1000, embeddingdim=20, filtersizes=[2,3,4,5], numfilters=3):
    """Build and compile a multi-branch 1-D convolutional text classifier.

    Token indices are embedded, passed through one Conv1D + global-max-pool branch per
    kernel size in ``filtersizes``, concatenated (when there is more than one branch)
    and fed to a softmax layer with ``outLayer`` units.

    Returns the compiled Keras ``Model`` (adam, categorical cross-entropy, metric taken
    from the module-level ``evaluationMethod_nn``).
    """
    tokens_in = Input(shape=(sequenceLength,))
    embedded = Embedding(vocabsize, embeddingdim, input_length=sequenceLength, name="embedding")(tokens_in)

    def _pooled_branch(kernel_width):
        # One convolution over the embedded sequence, reduced to its per-filter maximum.
        feature_maps = Conv1D(filters=numfilters,
                              kernel_size=kernel_width,
                              padding="valid",
                              activation="relu",
                              strides=1)(embedded)
        return GlobalMaxPooling1D()(feature_maps)

    branches = [_pooled_branch(width) for width in filtersizes]

    # Concatenate needs at least two inputs; a single branch is used as-is.
    if len(branches) > 1:
        merged = Concatenate()(branches)
    else:
        merged = branches[0]

    # Multi-class problem: softmax over the output classes.
    class_probs = Dense(outLayer, activation="softmax")(merged)
    cnn = Model(tokens_in, class_probs)
    cnn.compile(optimizer="adam", loss="categorical_crossentropy", metrics=[evaluationMethod_nn])
    return cnn


class kerasCNNWrapper():
    """Minimal estimator-style wrapper around ``createCNNModel``.

    Handles tokenisation and padding of raw texts, one-hot encoding of the labels via
    the label encoder supplied as ``dummyEnc``, and mapping predictions back through
    ``dummyEnc.classes_``.
    """

    def __init__(self, dummyEnc=None, vocabsize=1000, embeddingdim=20, filtersizes=[2,3,4,5], numfilters=3, tokTrain=None, validation_split=0.2, epochs=10, batch_size=50, verbose=0):
        # Store every constructor argument under its own name (explicit assignments
        # replace the previous frame-introspection loop; the resulting attributes are the same).
        self.dummyEnc = dummyEnc
        self.vocabsize = vocabsize
        self.embeddingdim = embeddingdim
        self.filtersizes = filtersizes
        self.numfilters = numfilters
        self.tokTrain = tokTrain  # stored for interface compatibility; not read by this class
        self.validation_split = validation_split
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose

    def set_params(self, **params):
        """Set attributes from keyword arguments and return ``self``.

        Keys of the form ``prefix__name`` set the attribute ``name``; plain keys set
        the attribute of the same name.
        """
        if not params:
            return self
        for key, value in params.items():
            key, delim, sub_key = key.partition('__')
            if delim:
                setattr(self, sub_key, value)
            else:
                setattr(self, key, value)

        return self

    def transformX(self, X, fitting=None, vocabsize=1000):
        """Convert texts to a post-padded matrix of token indices.

        With ``fitting=True`` a new tokenizer is fitted on ``X`` and the padding length
        is set to the longest training sequence; otherwise the stored tokenizer and
        length are reused.
        """
        if fitting is True:
            # Per the Keras Tokenizer docs, num_words caps the vocabulary at the
            # (num_words - 1) most frequent words.
            tok = Tokenizer(num_words=vocabsize)
            tok.fit_on_texts(X)
            self.tok = tok
            # Tokenise once and reuse the result for both the length and the padding
            # (the original converted the training texts twice).
            X_seq = self.tok.texts_to_sequences(X)
            self.sequence_length = max([len(ele) for ele in X_seq])
        else:
            # Convert string to index with the tokenizer fitted during training
            X_seq = self.tok.texts_to_sequences(X)

        # Padding
        X_seq_padded = pad_sequences(X_seq, maxlen=self.sequence_length, padding='post')

        return X_seq_padded

    def fit(self, X, y):
        """Fit the tokenizer on ``X``, build a fresh CNN and train it on one-hot ``y``.

        Raises
        ------
        ValueError
            If no label encoder was supplied (previously this surfaced as an
            unbound-variable error further down).
        """
        if self.dummyEnc is None:
            raise ValueError("kerasCNNWrapper.fit requires a fitted label encoder in 'dummyEnc'")

        y_enc = self.dummyEnc.transform(y)
        # Size the one-hot matrix from the encoder, not from the labels present in this
        # call, so the output layer always lines up with dummyEnc.classes_ in predict().
        y_dummy = np_utils.to_categorical(y_enc, num_classes=len(self.dummyEnc.classes_))

        X_trf = self.transformX(X, fitting=True, vocabsize=self.vocabsize)
        inpLayer = X_trf.shape[1]
        outLayer = y_dummy.shape[1]

        self.model = createCNNModel(inpLayer, outLayer, vocabsize=self.vocabsize, embeddingdim=self.embeddingdim, 
                                   filtersizes=self.filtersizes, numfilters=self.numfilters)
        self.model.fit(X_trf, y_dummy, validation_split=self.validation_split, epochs=self.epochs, batch_size=self.batch_size, verbose=self.verbose)
        return self

    def predict(self, X):
        """Return the class label with the highest predicted probability for each text."""
        X_trf = self.transformX(X, vocabsize=self.vocabsize)
        predicted_Y = self.model.predict(X_trf)
        predicted_class = self.dummyEnc.classes_[np.argmax(predicted_Y, axis=1)]
        return predicted_class



In [7]:
# Feature engineering : convert emojis into text
def processEmoji(emoji):
    """Turn a string of emoji into a list of 'U+XXXX' code-point tokens.

    Parameters
    ----------
    emoji : str or 0
        Whitespace-separated emoji characters, or 0 when the tweet has none
        (the value left behind by ``fillna(0)``).

    Returns
    -------
    list of str
        One upper-case token per whitespace-separated chunk, e.g. ``['U+1F600']``.
        Variation selector U+FE0F is dropped. Returns ``[]`` when ``emoji`` is 0.
    """
    if emoji == 0:
        return []

    # Decode the escaped bytes instead of using str(bytes).strip("b'"): strip() removes
    # every trailing 'b' / "'" character, which truncated code points whose hex form
    # ends in 'b' (e.g. U+1F44B came out as 'U+1F44').
    escaped = emoji.encode('unicode-escape').decode('ascii')

    # Drop the variation selector while the escape is still lower-case.
    escaped = re.sub(r"\\ufe0f", "", escaped)
    escaped = escaped.upper()

    # '\U0001F600' -> 'U+1F600' (astral plane, 3 padding zeros) and '\U2764' -> 'U+2764' (BMP).
    escaped = re.sub(r"\\U(?:000)?", "U+", escaped)
    return escaped.split()

# Emoji lookup table read from the working directory; mapEmoji reads its `Code` and
# `Description` columns.
emojiDictionary = pd.read_csv("full_emoji_list.csv")

def mapEmoji(emojiList):
    """Translate emoji code tokens into their textual descriptions.

    Each code is looked up in ``emojiDictionary.Code``; the first matching
    ``Description`` is appended, preceded by a single space. Codes without a match are
    skipped. An empty list yields an empty string.
    """
    if len(emojiList) == 0:
        return ''

    pieces = []
    for code in emojiList:
        matches = emojiDictionary[emojiDictionary.Code == code].Description.values
        if len(matches) > 0:
            pieces.append(' ' + matches[0])
    return ''.join(pieces)

def makeFeatureEmoji(X):
    """Return, for every row of ``X``, the description text of its emoji as an array.

    ``X`` must expose an ``emoji`` column; each value is passed through
    ``processEmoji`` and then ``mapEmoji``.
    """
    def _describe(raw_emoji):
        return mapEmoji(processEmoji(raw_emoji))

    described = X['emoji'].apply(_describe)
    return described.values

In [8]:
# Feature engineering : get total number of words, total number of nouns, adj, adv, verb

def countWord(data, typeOfPOS):
    """Count the items of ``data`` whose second element is contained in ``typeOfPOS``.

    ``data`` is an iterable of indexable pairs such as ``(word, tag)``.
    """
    return sum(1 for pair in data if pair[1] in typeOfPOS)

# Tag groups counted per tweet by combineFeatures: each key becomes one feature column
# and each value lists the tags (as found in TextBlob's `.tags` output) that count toward it.
posDictionary = {'Noun': ['NN', 'NNS', 'NNP', 'NNPS'],
                'Adj': ['JJ', 'JJR', 'JJS'],
                'Adv': ['RB', 'RBR', 'RBS'],
                'Verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']}

def combineFeatures(X):
    """Compute hand-crafted TextBlob features for every row of ``X``.

    ``X`` must expose a ``text`` column. The returned 2-D array has one row per tweet
    and these columns, in order: one tag count per key of ``posDictionary``, the word
    count ('len'), the sentiment polarity and the sentiment subjectivity.
    """
    blobs = X.text.apply(TextBlob)

    columns = {}
    ordered_names = []
    for pos_name, tag_set in posDictionary.items():
        # Bind tag_set as a default so each lambda keeps its own tag list.
        columns[pos_name] = blobs.apply(lambda blob, tags=tag_set: countWord(blob.tags, tags))
        ordered_names.append(pos_name)

    columns['len'] = blobs.apply(lambda blob: len(blob.words))
    columns['polarity'] = blobs.apply(lambda blob: blob.sentiment.polarity)
    columns['subjectivity'] = blobs.apply(lambda blob: blob.sentiment.subjectivity)
    ordered_names += ['len', 'polarity', 'subjectivity']

    # Pass the column order explicitly so it never depends on dict ordering.
    feature_frame = pd.DataFrame(columns, columns=ordered_names)
    return feature_frame.values
    

In [9]:
# Feature engineering : Use GoogleNews pre-trained word vectors
# wordEmbTrainedModel = KeyedVectors.load_word2vec_format('c:/Users/Kang En/Desktop/GoogleNews-vectors-negative300.bin.gz', binary=True)
# Pre-trained word vectors, loaded once from the working directory and shared by
# wordEmbFun_Convert.
wordEmbTrainedModel = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Length of a single word vector; wordEmbFun_Convert / wordEmbFun size their arrays from it.
Nwordemb = 300
def wordEmbFun_Convert(sentence):
    """Average the word vectors of a sentence separately for nouns, adjectives and verbs.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``; a token
    contributes when its coarse POS is NOUN, ADJ or VERB and its lemma is present in
    ``wordEmbTrainedModel``. A POS group without any contributing token stays all zeros.

    Returns a column array of shape ``(3 * Nwordemb, 1)``: NOUN mean, then ADJ, then VERB.
    """
    tracked_pos = ['NOUN', 'ADJ', 'VERB']
    vector_sums = {pos: np.zeros((Nwordemb, 1)) for pos in tracked_pos}
    token_counts = {pos: 0 for pos in tracked_pos}

    for token in nlp(sentence):
        pos = token.pos_
        if pos in vector_sums and str(token.lemma_) in wordEmbTrainedModel:
            vec = wordEmbTrainedModel[token.lemma_]
            vector_sums[pos] += vec.reshape(vec.shape[0], 1)
            token_counts[pos] += 1

    averaged = []
    for pos in tracked_pos:
        if token_counts[pos] != 0:
            averaged.append(vector_sums[pos] / token_counts[pos])
        else:
            averaged.append(vector_sums[pos])

    return np.concatenate(averaged)

def wordEmbFun(inp):
    """Stack the ``wordEmbFun_Convert`` features of every sentence in ``inp``.

    Returns a float array of shape ``(inp.shape[0], 3 * Nwordemb)``, one row per sentence.
    """
    n_rows = inp.shape[0]
    feature_matrix = np.zeros((n_rows, Nwordemb * 3))
    row = 0
    for sentence in inp:
        feature_matrix[row, :] = wordEmbFun_Convert(sentence).flatten()
        row += 1
    return feature_matrix


In [10]:
# Function to print performance metrics
def getPerformanceMetrics(y_test, pred_y_test, labelList=None):
    """Print accuracy, micro/macro F1 and the confusion matrix for a set of predictions.

    ``labelList`` is forwarded to ``confusion_matrix`` as its ``labels`` argument.

    Returns ``(accuracy, macro_f1)``.
    """
    accuracy = accuracy_score(y_test, pred_y_test)
    print('Test accuracy\t{:.03f}'.format(accuracy))

    # Compute and report each F1 flavour in turn: micro first, then macro.
    f1_templates = [('micro', 'F1 micro\t{:.03f}'), ('macro', 'F1 macro\t{:.03f}')]
    f1_values = {}
    for averaging, template in f1_templates:
        f1_values[averaging] = f1_score(y_test, pred_y_test, average=averaging)
        print(template.format(f1_values[averaging]))

    matrix = confusion_matrix(y_test, pred_y_test, labels=labelList)
    print('Confusion matrix\n', matrix)

    return accuracy, f1_values['macro']

### [Model 1] Word embedding + GLM
Vectorizer method: use the pre-trained word-embedding vectors from GoogleNews-vectors-negative300.bin and average them separately over the nouns, verbs and adjectives in each tweet.
Model: GLM (logistic regression). Each part of speech yields a 300-dimensional vector; concatenated, they form a feature vector of shape (900,).

In [11]:
def modelOne(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):

    print('='*5, "\tModel 1\t", '='*5)

    # GLM + mean(NOUN, ADJ, VERB) word emb)
    glm = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=2000, multi_class='multinomial')
    wordEmb = FunctionTransformer(wordEmbFun, validate=False)
    pipe = make_pipeline(wordEmb, glm)

    # create a grid of parameters to search
    param_grid = {}
    param_grid['logisticregression__C'] = [0.1, 0.3, 0.5, 0.7, 0.9] 

    if tunedParms is None:

        # Start cross validation to tune hyperparameters
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)

        # examine the best score
        print(grid_t.best_score_)
        print(grid_t.best_params_)

        pred_y_test = grid_t.predict(X_test)
        
    else:
        
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)

    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_
            
    return test_acc, test_f1_macro

### [Model 2] BoW + Naive bayes


In [12]:
def modelTwo(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):

    print('='*5, "\tModel 2\t", '='*5)

    # import and instantiate Multinomial Naive Bayes (with the default parameters)
    nb_t = MultinomialNB()
    vect_t = CountVectorizer() 
    pipe = make_pipeline(vect_t, nb_t)

    # create a grid of parameters to search
    param_grid = {}
    param_grid['countvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b', '\\b[^\\d\\W]+\\b', '([a-z ]+)']
    param_grid['countvectorizer__stop_words'] = ["english"]
    param_grid['countvectorizer__ngram_range'] = [(1, 1),(1,2)]
    param_grid['countvectorizer__max_df'] = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    param_grid['countvectorizer__min_df'] = [3, 4, 5]
    param_grid['multinomialnb__alpha'] = [0.2, 0.5, 0.7, 1]

    if tunedParms is None:

        # Start cross validation to tune hyperparameters
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)

        # examine the best score
        print(grid_t.best_score_)
        print(grid_t.best_params_)

        pred_y_test = grid_t.predict(X_test)
        
    else:
        
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))

        pred_y_test = pipe.predict(X_test)

    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_
            
    return test_acc, test_f1_macro

### [Model 3] TF-IDF + Naive bayes


In [13]:
def modelThree(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):

    print('='*5, "\tModel 3\t", '='*5)

    # import and instantiate Multinomial Naive Bayes (with the default parameters)
    nb_t = MultinomialNB()
    vect_t = TfidfVectorizer() 
    pipe = make_pipeline(vect_t, nb_t)
    pipe.steps

    # create a grid of parameters to search
    param_grid = {}
    param_grid['tfidfvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b', '\\b[^\\d\\W]+\\b', '([a-z ]+)']
    param_grid['tfidfvectorizer__stop_words'] = ["english"]
    param_grid['tfidfvectorizer__ngram_range'] = [(1, 1),(1,2)]
    param_grid['tfidfvectorizer__max_df'] = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    param_grid['tfidfvectorizer__min_df'] = [3, 4, 5]
    param_grid['multinomialnb__alpha'] = [0.2, 0.5, 0.7, 1]

    if tunedParms is None:

        # pass the pipeline (instead of the model) to GridSearchCV
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)

        # examine the score for each combination of parameters
        print(grid_t.best_score_)
        print(grid_t.best_params_)

        pred_y_test = grid_t.predict(X_test)
        
    else:

        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)

    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_

    return test_acc, test_f1_macro

### [Model 4] BoW + GLM


In [14]:
def modelFour(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):
    
    print('='*5, "\tModel 4\t", '='*5)

    # GLM+CountVect
    glm = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=2000, multi_class='multinomial')
    vect_t = CountVectorizer() 
    pipe = make_pipeline(vect_t, glm)
    pipe.steps

    # create a grid of parameters to search
    param_grid = {}
    param_grid['countvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b', '\\b[^\\d\\W]+\\b', '([a-z ]+)']
    param_grid['countvectorizer__stop_words'] = ["english"]
    param_grid['countvectorizer__ngram_range'] = [(1, 1),(1,2)]
    param_grid['countvectorizer__max_df'] = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    param_grid['countvectorizer__min_df'] = [3, 4, 5]
    param_grid['logisticregression__C'] = [0.1, 0.3, 0.5, 0.7, 0.9] 

    if tunedParms is None:

        # pass the pipeline (instead of the model) to GridSearchCV
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)

        # examine the score for each combination of parameters
        print(grid_t.best_score_)
        print(grid_t.best_params_)
        pred_y_test = grid_t.predict(X_test)
        
    else:
        
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)

    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_
    
    return test_acc, test_f1_macro

### [Model 5] TF-IDF + GLM


In [15]:
def modelFive(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):
    
    print('='*5, "\tModel 5\t", '='*5)

    # GLM+TFIDFVect
    glm = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=2000, multi_class='multinomial')
    vect_t = TfidfVectorizer() 
    pipe = make_pipeline(vect_t, glm)
    pipe.steps

    # create a grid of parameters to search (and specify the pipeline step along with the parameter)
    param_grid = {}
    param_grid['tfidfvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b', '\\b[^\\d\\W]+\\b', '([a-z ]+)']
    param_grid['tfidfvectorizer__stop_words'] = ["english"]
    param_grid['tfidfvectorizer__ngram_range'] = [(1, 1),(1,2)]
    param_grid['tfidfvectorizer__max_df'] = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    param_grid['tfidfvectorizer__min_df'] = [3, 4, 5]
    param_grid['logisticregression__C'] = [0.1, 0.3, 0.5, 0.7, 0.9] 

    if tunedParms is None:

        # pass the pipeline (instead of the model) to GridSearchCV
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)

        # examine the score for each combination of parameters
        print(grid_t.best_score_)
        print(grid_t.best_params_)

        pred_y_test = grid_t.predict(X_test)
        
    else:
        
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)

    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_
    
    return test_acc, test_f1_macro

### [Model 6] BoW + XGBoost


In [16]:
def modelSix(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):

    print('='*5, "\tModel 6\t", '='*5)

    # XGB+CountVect
    nclass = np.unique(y_train).shape[0]
    xgb = XGBClassifier(objective='multi:softmax', num_class=nclass, random_state=42, n_jobs=4)
    vect_t = CountVectorizer()
    pipe = make_pipeline(vect_t, xgb)

    # create a grid of parameters to search (and specify the pipeline step along with the parameter)
    param_grid = {}
    param_grid['countvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b']
    param_grid['countvectorizer__stop_words'] = ["english"]
    param_grid['countvectorizer__ngram_range'] = [(1, 1),(1,2)]
    param_grid['countvectorizer__max_df'] = [0.5]
    param_grid['countvectorizer__min_df'] = [3, 4, 5]
    param_grid['xgbclassifier__learning_rate'] = [0.1, 0.05] 
    param_grid['xgbclassifier__max_depth'] = [3,5] 
    param_grid['xgbclassifier__n_estimators'] = [50,100,200] 

    if tunedParms is None:
        # pass the pipeline (instead of the model) to GridSearchCV
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)
        
        # examine the score for each combination of parameters
        print(grid_t.best_score_)
        print(grid_t.best_params_)
        pred_y_test = grid_t.predict(X_test)
        
    else:
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)
        
    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_

    return test_acc, test_f1_macro

### [Model 7] BoW + 1-hidden-layer NN

In [17]:
def modelSeven(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):
    
    print('='*5, "\tModel 7\t", '='*5)

    enc = LabelEncoder()
    enc.fit(y_train)
    vect = CountVectorizer()
    nn = kerasDenseNNWrapper(dummyEnc=enc)
    pipe = make_pipeline(vect, nn)

    # create a grid of parameters to search (and specify the pipeline step along with the parameter)
    param_grid = {}
    param_grid['countvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b']
    param_grid['countvectorizer__stop_words'] = ["english"]
    param_grid['countvectorizer__ngram_range'] = [(1, 1),(1,2)]
    param_grid['countvectorizer__max_df'] = [0.5]
    param_grid['countvectorizer__min_df'] = [3, 4, 5]
    param_grid['kerasdensennwrapper__n_neuron'] = [[10], [20], [40]] # One layer
    param_grid['kerasdensennwrapper__rate'] = [0.3, 0.5, 0.8] 

    if tunedParms is None:
        # pass the pipeline (instead of the model) to GridSearchCV
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)
        
        # examine the score for each combination of parameters
        print(grid_t.best_score_)
        print(grid_t.best_params_)
        pred_y_test = grid_t.predict(X_test)
        
    else:
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)
        
    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_

    return test_acc, test_f1_macro

### [Model 8] CNN

In [18]:
def modelEight(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):
    
    print('='*5, "\tModel 8\t", '='*5)
    
    enc = LabelEncoder()
    enc.fit(y_train)
    nn = kerasCNNWrapper(dummyEnc=enc)
    pipe = make_pipeline(nn)

    # create a grid of parameters to search (and specify the pipeline step along with the parameter)
    param_grid = {}
    param_grid['kerascnnwrapper__vocabsize'] = [1000, 1200, 1500] # One layer
    param_grid['kerascnnwrapper__embeddingdim'] = [15, 20] 
    param_grid['kerascnnwrapper__filtersizes'] = [[2, 3], [2,3,4]]
    param_grid['kerascnnwrapper__numfilters'] = [2,3]

    if tunedParms is None:
        # pass the pipeline (instead of the model) to GridSearchCV
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)
        
        # examine the score for each combination of parameters
        print(grid_t.best_score_)
        print(grid_t.best_params_)
        pred_y_test = grid_t.predict(X_test)
        
    else:
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)
        
    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_

    return test_acc, test_f1_macro

### [Model 9] Manual features + GLM

In [19]:
def modelNine(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):
    # input is X_train_withEmoji

    print('='*5, "\tModel 9\t", '='*5)

    # GLM + manual features
    glm = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=2000, multi_class='multinomial')
    manualFun = FunctionTransformer(combineFeatures, validate=False)
    manualEmoji = FunctionTransformer(makeFeatureEmoji, validate=False)
    vect_t = CountVectorizer()
    featuresUnion = make_union(manualFun, make_pipeline(manualEmoji, vect_t))
    pipe = make_pipeline(featuresUnion, glm)

    # create a grid of parameters to search
    param_grid = {}
    param_grid['logisticregression__C'] = [0.1, 0.3, 0.5, 0.7, 0.9] 
    param_grid['featureunion__pipeline__countvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b']
    param_grid['featureunion__pipeline__countvectorizer__stop_words'] = ["english"]
    param_grid['featureunion__pipeline__countvectorizer__ngram_range'] = [(1, 1), (1,2)]
      
    #param_grid['featureunion__pipeline__countvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b', '\\b[^\\d\\W]+\\b', '([a-z ]+)']
    #param_grid['featureunion__pipeline__countvectorizer__stop_words'] = ["english"]
    #param_grid['featureunion__pipeline__countvectorizer__ngram_range'] = [(1, 1),(1,2)]
    #param_grid['featureunion__pipeline__countvectorizer__max_df'] = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    #param_grid['featureunion__pipeline__countvectorizer__min_df'] = [3, 4, 5]

    if tunedParms is None:

        # Start cross validation to tune hyperparameters
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)

        # examine the best score
        print(grid_t.best_score_)
        print(grid_t.best_params_)

        pred_y_test = grid_t.predict(X_test)
        
    else:
        
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)

    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_
            
    return test_acc, test_f1_macro

### [Model 10] Manual features + XGBoost

In [20]:
def modelTen(X_train, y_train, X_test, y_test, tunedParms=None, saveModel=False, labelList=None):
    # input is X_train_withEmoji
    
    print('='*5, "\tModel 10\t", '='*5)
    # XGB + manual features
    nclass = np.unique(y_train).shape[0]
    xgb = XGBClassifier(objective='multi:softmax', num_class=nclass, random_state=42, n_jobs=4)
    manualFun = FunctionTransformer(combineFeatures, validate=False)
    manualEmoji = FunctionTransformer(makeFeatureEmoji, validate=False)
    vect_t = CountVectorizer()
    featuresUnion = make_union(manualFun, make_pipeline(manualEmoji, vect_t))
    pipe = make_pipeline(featuresUnion, xgb)

    # create a grid of parameters to search
    param_grid = {}
    param_grid['xgbclassifier__learning_rate'] = [0.1, 0.05] 
    param_grid['xgbclassifier__max_depth'] = [3,5] 
    param_grid['xgbclassifier__n_estimators'] = [50,100,200] 
    param_grid['featureunion__pipeline__countvectorizer__token_pattern'] = ['(?u)\\b\\w\\w+\\b']
    param_grid['featureunion__pipeline__countvectorizer__stop_words'] = ["english"]
    param_grid['featureunion__pipeline__countvectorizer__ngram_range'] = [(1, 1)]
    
    if tunedParms is None:

        # Start cross validation to tune hyperparameters
        grid_t = GridSearchCV(pipe, param_grid, cv=5, scoring=evaluationMethod, return_train_score=True)
        %time grid_t.fit(X_train, y_train)

        # examine the best score
        print(grid_t.best_score_)
        print(grid_t.best_params_)

        pred_y_test = grid_t.predict(X_test)
        
    else:
        
        pipe.set_params(**tunedParms)
        pipe.fit(X_train, y_train)
        if saveModel:
            pickle.dump(pipe, open(saveModel, 'wb'))
        pred_y_test = pipe.predict(X_test)

    test_acc, test_f1_macro = getPerformanceMetrics(y_test, pred_y_test, labelList=labelList)

    if tunedParms is None:
        return grid_t.best_params_
            
    return test_acc, test_f1_macro

# <u>Problem 1</u>  --  Predicting the type of feedback

In [21]:
# Target labels for Problem 1 (type of feedback); only the targets are set in this cell
y_train, y_test = labelled_tweets_Y_train.feedback, labelled_tweets_Y_test.feedback

In [22]:
### Hyperparameter tuning using 5-fold cross validation ###
# Set `tuning = True` to re-run the grid searches; each model prints its best parameters.
# NOTE: the hard-coded dictionaries further down are assigned unconditionally and
# therefore overwrite whatever is found here -- copy the printed values into them.
tuning = False
if tuning:
    modelOneBestParms = modelOne(X_train, y_train, X_test, y_test)
    print('modelOneBestParms = {}\n\n'.format(modelOneBestParms))

    modelTwoBestParms = modelTwo(X_train, y_train, X_test, y_test)
    print('modelTwoBestParms = {}\n\n'.format(modelTwoBestParms))

    modelThreeBestParms = modelThree(X_train, y_train, X_test, y_test)
    print('modelThreeBestParms = {}\n\n'.format(modelThreeBestParms))

    modelFourBestParms = modelFour(X_train, y_train, X_test, y_test)
    print('modelFourBestParms = {}\n\n'.format(modelFourBestParms))

    modelFiveBestParms = modelFive(X_train, y_train, X_test, y_test)
    print('modelFiveBestParms = {}\n\n'.format(modelFiveBestParms))

    modelSixBestParms = modelSix(X_train, y_train, X_test, y_test)
    print('modelSixBestParms = {}\n\n'.format(modelSixBestParms))

    modelSevenBestParms = modelSeven(X_train, y_train, X_test, y_test)
    print('modelSevenBestParms = {}\n\n'.format(modelSevenBestParms))

    modelEightBestParms = modelEight(X_train, y_train, X_test, y_test)
    print('modelEightBestParms = {}\n\n'.format(modelEightBestParms))

    modelNineBestParms = modelNine(X_train_withEmoji, y_train, X_test_withEmoji, y_test)
    print('modelNineBestParms = {}\n\n'.format(modelNineBestParms))

    modelTenBestParms = modelTen(X_train_withEmoji, y_train, X_test_withEmoji, y_test)
    print('modelTenBestParms = {}\n\n'.format(modelTenBestParms))


### Use tuned hyperparameters (recorded from a previous tuning run) ###
modelOneBestParms = {'logisticregression__C': 0.1}
modelTwoBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 3, 
                     'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 
                     'countvectorizer__token_pattern': '\\b[^\\d\\W]+\\b', 'multinomialnb__alpha': 0.5}
modelThreeBestParms = {'multinomialnb__alpha': 0.2, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 5, 
                       'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__stop_words': 'english', 
                       'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b'}
modelFourBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 3, 'countvectorizer__ngram_range': (1, 2), 
                      'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                      'logisticregression__C': 0.1}
modelFiveBestParms = {'logisticregression__C': 0.9, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 3, 
                      'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__stop_words': 'english', 
                      'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b'}
modelSixBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 5, 'countvectorizer__ngram_range': (1, 2), 
                     'countvectorizer__stop_words': 'english',  'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                     'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 200}
modelSevenBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 4, 'countvectorizer__ngram_range': (1, 2), 
                       'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                       'kerasdensennwrapper__n_neuron': [40], 'kerasdensennwrapper__rate': 0.3}
modelEightBestParms = {'kerascnnwrapper__embeddingdim': 20, 'kerascnnwrapper__filtersizes': [2, 3, 4], 
                       'kerascnnwrapper__numfilters': 3, 'kerascnnwrapper__vocabsize': 1200}
modelNineBestParms = {'featureunion__pipeline__countvectorizer__ngram_range': (1, 2), 'featureunion__pipeline__countvectorizer__stop_words': 'english', 
                      'featureunion__pipeline__countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 'logisticregression__C': 0.1}
modelTenBestParms = {'featureunion__pipeline__countvectorizer__ngram_range': (1, 1), 'featureunion__pipeline__countvectorizer__stop_words': 'english', 
                     'featureunion__pipeline__countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                     'xgbclassifier__learning_rate': 0.05, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 50}


### Evaluate model performance with tuned parameters ###
opts = {'labelList': feedbackLabels}

# One entry per model, in model-number order:
# (model function, training inputs, test inputs, tuned parameters).
# Models 9 and 10 take the "*_withEmoji" inputs; all others take X_train / X_test.
evaluationRuns = [
    (modelOne, X_train, X_test, modelOneBestParms),
    (modelTwo, X_train, X_test, modelTwoBestParms),
    (modelThree, X_train, X_test, modelThreeBestParms),
    (modelFour, X_train, X_test, modelFourBestParms),
    (modelFive, X_train, X_test, modelFiveBestParms),
    (modelSix, X_train, X_test, modelSixBestParms),
    (modelSeven, X_train, X_test, modelSevenBestParms),
    (modelEight, X_train, X_test, modelEightBestParms),
    (modelNine, X_train_withEmoji, X_test_withEmoji, modelNineBestParms),
    (modelTen, X_train_withEmoji, X_test_withEmoji, modelTenBestParms),
]

# Collect plain records and build the frame once: DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0, and re-copies the whole frame on every call.
resultRows = []
for modelFun, trainInputs, testInputs, bestParms in evaluationRuns:
    acc, f1macro = modelFun(trainInputs, y_train, testInputs, y_test, tunedParms=bestParms, **opts)
    resultRows.append({'TestAcc': acc, 'TestF1Macro': f1macro})

finalResults = pd.DataFrame(resultRows, columns=['TestAcc', 'TestF1Macro']) # To store all results

display(finalResults)

===== 	Model 1	 =====
Test accuracy	0.543
F1 micro	0.543
F1 macro	0.513
Confusion matrix
 [[100  10  29  27]
 [  4  27   6   9]
 [ 13   2  53  16]
 [ 57  42  59 146]]
===== 	Model 2	 =====
Test accuracy	0.608
F1 micro	0.608
F1 macro	0.520
Confusion matrix
 [[102   5  23  36]
 [  3  13   2  28]
 [ 24   0  39  21]
 [ 62  14  17 211]]
===== 	Model 3	 =====
Test accuracy	0.628
F1 micro	0.628
F1 macro	0.454
Confusion matrix
 [[ 90   1  13  62]
 [  2   3   2  39]
 [ 21   0  26  37]
 [ 36   1   9 258]]
===== 	Model 4	 =====
Test accuracy	0.633
F1 micro	0.633
F1 macro	0.563
Confusion matrix
 [[ 89  14  29  34]
 [  5  29   2  10]
 [ 17   5  40  22]
 [ 33  31  18 222]]
===== 	Model 5	 =====
Test accuracy	0.610
F1 micro	0.610
F1 macro	0.550
Confusion matrix
 [[ 97  13  30  26]
 [  3  28   5  10]
 [ 18   5  44  17]
 [ 43  38  26 197]]
===== 	Model 6	 =====
Test accuracy	0.658
F1 micro	0.658
F1 macro	0.527
Confusion matrix
 [[ 90   3  14  59]
 [  6  15   2  23]
 [ 22   3  21  38]
 [ 24   5   6 269]

  'precision', 'predicted', average, warn_for)


Test accuracy	0.450
F1 micro	0.450
F1 macro	0.426
Confusion matrix
 [[ 93  22  30  21]
 [  7  27   9   3]
 [ 14  10  44  16]
 [ 43  83  72 106]]
===== 	Model 10	 =====
Test accuracy	0.597
F1 micro	0.597
F1 macro	0.360
Confusion matrix
 [[ 78   1   2  85]
 [  6   2   1  37]
 [ 12   1   5  66]
 [ 24   5   2 273]]


Unnamed: 0,TestAcc,TestF1Macro
0,0.543333,0.513495
1,0.608333,0.520206
2,0.628333,0.45357
3,0.633333,0.562655
4,0.61,0.55021
5,0.658333,0.527031
6,0.645,0.501854
7,0.551667,0.326008
8,0.45,0.425599
9,0.596667,0.359573


### Output the best model for live prediction

In [23]:
# Refit Model 4 (CountVectorizer + logistic regression, per its tuned parameter keys)
# on the feedback targets and hand it a pickle path via `saveModel`.
modelFour(
    X_train, y_train, X_test, y_test,
    tunedParms=modelFourBestParms,
    saveModel=os.path.join(outFolder, 'BestModel_Feedback.pkl'),
)

===== 	Model 4	 =====
Test accuracy	0.633
F1 micro	0.633
F1 macro	0.563
Confusion matrix
 [[ 89  14  29  34]
 [  5  29   2  10]
 [ 17   5  40  22]
 [ 33  31  18 222]]


(0.6333333333333333, 0.5626553235129592)

# <u>Problem 2</u>  --  Predicting topic

In [24]:
# Target labels for Problem 2 (topic); only the targets are set in this cell
y_train, y_test = labelled_tweets_Y_train.topic, labelled_tweets_Y_test.topic

In [25]:
### Hyperparameter tuning using 5-fold cross validation ###
# Set `tuning = True` to re-run the grid searches; each model prints its best parameters.
# NOTE: the hard-coded dictionaries further down are assigned unconditionally and
# therefore overwrite whatever is found here -- copy the printed values into them.
tuning = False
if tuning:
    modelOneBestParms = modelOne(X_train, y_train, X_test, y_test)
    print('modelOneBestParms = {}\n\n'.format(modelOneBestParms))

    modelTwoBestParms = modelTwo(X_train, y_train, X_test, y_test)
    print('modelTwoBestParms = {}\n\n'.format(modelTwoBestParms))

    modelThreeBestParms = modelThree(X_train, y_train, X_test, y_test)
    print('modelThreeBestParms = {}\n\n'.format(modelThreeBestParms))

    modelFourBestParms = modelFour(X_train, y_train, X_test, y_test)
    print('modelFourBestParms = {}\n\n'.format(modelFourBestParms))

    modelFiveBestParms = modelFive(X_train, y_train, X_test, y_test)
    print('modelFiveBestParms = {}\n\n'.format(modelFiveBestParms))

    modelSixBestParms = modelSix(X_train, y_train, X_test, y_test)
    print('modelSixBestParms = {}\n\n'.format(modelSixBestParms))

    modelNineBestParms = modelNine(X_train_withEmoji, y_train, X_test_withEmoji, y_test)
    print('modelNineBestParms = {}\n\n'.format(modelNineBestParms))

    modelTenBestParms = modelTen(X_train_withEmoji, y_train, X_test_withEmoji, y_test)
    print('modelTenBestParms = {}\n\n'.format(modelTenBestParms))

    modelSevenBestParms = modelSeven(X_train, y_train, X_test, y_test)
    print('modelSevenBestParms = {}\n\n'.format(modelSevenBestParms))

    modelEightBestParms = modelEight(X_train, y_train, X_test, y_test)
    print('modelEightBestParms = {}\n\n'.format(modelEightBestParms))


### Use tuned hyperparameters (recorded from a previous tuning run) ###
modelOneBestParms = {'logisticregression__C': 0.1}
modelTwoBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 5, 
                     'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 
                     'countvectorizer__token_pattern': '\\b[^\\d\\W]+\\b', 'multinomialnb__alpha': 1}
modelThreeBestParms = {'multinomialnb__alpha': 0.2, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 5, 
                       'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__stop_words': 'english', 
                       'tfidfvectorizer__token_pattern': '\\b[^\\d\\W]+\\b'}
modelFourBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 3, 'countvectorizer__ngram_range': (1, 2), 
                      'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '\\b[^\\d\\W]+\\b', 'logisticregression__C': 0.7}
modelFiveBestParms = {'logisticregression__C': 0.9, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 3, 'tfidfvectorizer__ngram_range': (1, 2), 
                      'tfidfvectorizer__stop_words': 'english', 'tfidfvectorizer__token_pattern': '\\b[^\\d\\W]+\\b'}
modelSixBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 3, 'countvectorizer__ngram_range': (1, 2), 
                     'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                     'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 200}
modelSevenBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 4, 'countvectorizer__ngram_range': (1, 2), 
                       'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                       'kerasdensennwrapper__n_neuron': [40], 'kerasdensennwrapper__rate': 0.3}
modelEightBestParms = {'kerascnnwrapper__embeddingdim': 20, 'kerascnnwrapper__filtersizes': [2, 3, 4], 
                       'kerascnnwrapper__numfilters': 3, 'kerascnnwrapper__vocabsize': 1500}
modelNineBestParms = {'featureunion__pipeline__countvectorizer__ngram_range': (1, 1), 
                      'featureunion__pipeline__countvectorizer__stop_words': 'english', 
                      'featureunion__pipeline__countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 'logisticregression__C': 0.1}
modelTenBestParms = {'featureunion__pipeline__countvectorizer__ngram_range': (1, 1), 
                     'featureunion__pipeline__countvectorizer__stop_words': 'english', 
                     'featureunion__pipeline__countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                     'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 50}


### Evaluate model performance with tuned parameters ###
opts = {'labelList': topicLabels}

# One entry per model, in model-number order:
# (model function, training inputs, test inputs, tuned parameters).
# Models 9 and 10 take the "*_withEmoji" inputs; all others take X_train / X_test.
evaluationRuns = [
    (modelOne, X_train, X_test, modelOneBestParms),
    (modelTwo, X_train, X_test, modelTwoBestParms),
    (modelThree, X_train, X_test, modelThreeBestParms),
    (modelFour, X_train, X_test, modelFourBestParms),
    (modelFive, X_train, X_test, modelFiveBestParms),
    (modelSix, X_train, X_test, modelSixBestParms),
    (modelSeven, X_train, X_test, modelSevenBestParms),
    (modelEight, X_train, X_test, modelEightBestParms),
    (modelNine, X_train_withEmoji, X_test_withEmoji, modelNineBestParms),
    (modelTen, X_train_withEmoji, X_test_withEmoji, modelTenBestParms),
]

# Collect plain records and build the frame once: DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0, and re-copies the whole frame on every call.
resultRows = []
for modelFun, trainInputs, testInputs, bestParms in evaluationRuns:
    acc, f1macro = modelFun(trainInputs, y_train, testInputs, y_test, tunedParms=bestParms, **opts)
    resultRows.append({'TestAcc': acc, 'TestF1Macro': f1macro})

finalResults = pd.DataFrame(resultRows, columns=['TestAcc', 'TestF1Macro']) # To store all results

display(finalResults)

===== 	Model 1	 =====
Test accuracy	0.473
F1 micro	0.473
F1 macro	0.431
Confusion matrix
 [[138  29  48  27  21  48   9  13]
 [  1   7   0   2   1   0   0   2]
 [ 12   1  29   4   6   2   4   1]
 [  7   4   5  28  10   4   5   0]
 [  4   0   2   5  40   6   1   0]
 [  7   2   3   3   5  11   3   1]
 [  0   0   3   0   2   0  23   0]
 [  0   0   1   1   0   1   0   8]]
===== 	Model 2	 =====
Test accuracy	0.668
F1 micro	0.668
F1 macro	0.476
Confusion matrix
 [[285   3   7  15  13   6   3   1]
 [  2   5   1   4   1   0   0   0]
 [ 24   0  21   4   4   3   3   0]
 [ 18   0   1  37   5   1   0   1]
 [ 16   0   1   6  34   0   1   0]
 [ 25   1   2   2   4   1   0   0]
 [  8   0   1   0   4   0  15   0]
 [  1   0   1   4   1   0   1   3]]
===== 	Model 3	 =====
Test accuracy	0.640
F1 micro	0.640
F1 macro	0.324
Confusion matrix
 [[317   1   2  10   1   2   0   0]
 [ 10   2   0   1   0   0   0   0]
 [ 44   0  11   2   2   0   0   0]
 [ 34   0   0  26   2   1   0   0]
 [ 31   0   0   3  23   0   

  'precision', 'predicted', average, warn_for)


Test accuracy	0.638
F1 micro	0.638
F1 macro	0.503
Confusion matrix
 [[250   3  16  27  10  18   4   5]
 [  3   6   1   2   1   0   0   0]
 [ 19   0  24   4   5   3   4   0]
 [ 12   0   1  38   4   2   4   2]
 [  8   0   1  11  33   4   1   0]
 [ 15   1   3   4   2   8   1   1]
 [  2   0   3   1   2   0  20   0]
 [  2   0   1   3   1   0   0   4]]
===== 	Model 5	 =====
Test accuracy	0.582
F1 micro	0.582
F1 macro	0.520
Confusion matrix
 [[202   6  21  29  15  44   9   7]
 [  1  10   0   2   0   0   0   0]
 [ 15   1  24   3   8   4   4   0]
 [ 10   0   4  34   5   4   4   2]
 [  3   0   1  10  38   5   1   0]
 [ 11   1   1   3   2  13   2   2]
 [  0   0   3   1   3   0  21   0]
 [  2   0   1   1   0   0   0   7]]
===== 	Model 6	 =====
Test accuracy	0.683
F1 micro	0.683
F1 macro	0.494
Confusion matrix
 [[315   2   3   7   1   3   1   1]
 [  7   5   0   1   0   0   0   0]
 [ 34   0  16   3   3   0   3   0]
 [ 33   0   3  19   4   1   2   1]
 [ 23   0   0   5  28   1   1   0]
 [ 22   1   1  

  'precision', 'predicted', average, warn_for)


Test accuracy	0.382
F1 micro	0.382
F1 macro	0.181
Confusion matrix
 [[194  36   8  15   7  27  14  32]
 [  2   0   2   3   0   1   1   4]
 [ 10   6   7   2  11   3   7  13]
 [ 13  11   0   5   8   5   8  13]
 [ 11   6   4   1   8   4  14  10]
 [  9   2   2   2   3   8   4   5]
 [  2   5   2   5   2   5   5   2]
 [  2   0   2   1   1   2   1   2]]
===== 	Model 10	 =====
Test accuracy	0.558
F1 micro	0.558
F1 macro	0.121
Confusion matrix
 [[326   1   2   1   3   0   0   0]
 [ 12   0   0   1   0   0   0   0]
 [ 52   0   1   1   5   0   0   0]
 [ 52   0   3   2   4   0   2   0]
 [ 47   0   3   2   6   0   0   0]
 [ 35   0   0   0   0   0   0   0]
 [ 25   0   1   0   2   0   0   0]
 [  9   0   1   1   0   0   0   0]]


  'precision', 'predicted', average, warn_for)


Unnamed: 0,TestAcc,TestF1Macro
0,0.473333,0.430894
1,0.668333,0.475646
2,0.64,0.323985
3,0.638333,0.50322
4,0.581667,0.520045
5,0.683333,0.493653
6,0.656667,0.384274
7,0.583333,0.183613
8,0.381667,0.180746
9,0.558333,0.121315


### Output the best model for live prediction

In [26]:
# Refit Model 5 (TfidfVectorizer + logistic regression, per its tuned parameter keys)
# on the topic targets and hand it a pickle path via `saveModel`.
modelFive(
    X_train, y_train, X_test, y_test,
    tunedParms=modelFiveBestParms,
    saveModel=os.path.join(outFolder, 'BestModel_Topics.pkl'),
)

===== 	Model 5	 =====
Test accuracy	0.582
F1 micro	0.582
F1 macro	0.520
Confusion matrix
 [[202   6  21  29  15  44   9   7]
 [  1  10   0   2   0   0   0   0]
 [ 15   1  24   3   8   4   4   0]
 [ 10   0   4  34   5   4   4   2]
 [  3   0   1  10  38   5   1   0]
 [ 11   1   1   3   2  13   2   2]
 [  0   0   3   1   3   0  21   0]
 [  2   0   1   1   0   0   0   7]]


(0.5816666666666667, 0.5200451580110859)

# <u>Problem 3</u>  --  Predicting sentiment

In [27]:
# Target labels for Problem 3 (sentiment); only the targets are set in this cell
y_train, y_test = labelled_tweets_Y_train.sentiment, labelled_tweets_Y_test.sentiment

In [28]:
### Hyperparameter tuning using 5-fold cross validation ###
# Set `tuning = True` to re-run the grid searches; each model prints its best parameters.
# NOTE: the hard-coded dictionaries further down are assigned unconditionally and
# therefore overwrite whatever is found here -- copy the printed values into them.
tuning = False
if tuning:
    modelOneBestParms = modelOne(X_train, y_train, X_test, y_test)
    print('modelOneBestParms = {}\n\n'.format(modelOneBestParms))

    modelTwoBestParms = modelTwo(X_train, y_train, X_test, y_test)
    print('modelTwoBestParms = {}\n\n'.format(modelTwoBestParms))

    modelThreeBestParms = modelThree(X_train, y_train, X_test, y_test)
    print('modelThreeBestParms = {}\n\n'.format(modelThreeBestParms))

    modelFourBestParms = modelFour(X_train, y_train, X_test, y_test)
    print('modelFourBestParms = {}\n\n'.format(modelFourBestParms))

    modelFiveBestParms = modelFive(X_train, y_train, X_test, y_test)
    print('modelFiveBestParms = {}\n\n'.format(modelFiveBestParms))

    modelSixBestParms = modelSix(X_train, y_train, X_test, y_test)
    print('modelSixBestParms = {}\n\n'.format(modelSixBestParms))

    modelSevenBestParms = modelSeven(X_train, y_train, X_test, y_test)
    print('modelSevenBestParms = {}\n\n'.format(modelSevenBestParms))

    modelEightBestParms = modelEight(X_train, y_train, X_test, y_test)
    print('modelEightBestParms = {}\n\n'.format(modelEightBestParms))

    modelNineBestParms = modelNine(X_train_withEmoji, y_train, X_test_withEmoji, y_test)
    print('modelNineBestParms = {}\n\n'.format(modelNineBestParms))

    modelTenBestParms = modelTen(X_train_withEmoji, y_train, X_test_withEmoji, y_test)
    print('modelTenBestParms = {}\n\n'.format(modelTenBestParms))


### Use tuned hyperparameters (recorded from a previous tuning run) ###
modelOneBestParms = {'logisticregression__C': 0.1}
modelTwoBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 5, 'countvectorizer__ngram_range': (1, 1), 
                     'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '\\b[^\\d\\W]+\\b', 
                     'multinomialnb__alpha': 1}
modelThreeBestParms = {'multinomialnb__alpha': 0.2, 'tfidfvectorizer__max_df': 0.5, 
                       'tfidfvectorizer__min_df': 5, 'tfidfvectorizer__ngram_range': (1, 1), 
                       'tfidfvectorizer__stop_words': 'english', 'tfidfvectorizer__token_pattern': '\\b[^\\d\\W]+\\b'}
modelFourBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 3, 'countvectorizer__ngram_range': (1, 2), 
                      'countvectorizer__stop_words': 'english', 
                      'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 'logisticregression__C': 0.3}
modelFiveBestParms = {'logisticregression__C': 0.9, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 4, 
                      'tfidfvectorizer__ngram_range': (1, 2), 
                      'tfidfvectorizer__stop_words': 'english', 'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b'}
modelSixBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 3, 'countvectorizer__ngram_range': (1, 2), 
                     'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                     'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 200}
modelSevenBestParms = {'countvectorizer__max_df': 0.5, 'countvectorizer__min_df': 4, 'countvectorizer__ngram_range': (1, 2), 
                       'countvectorizer__stop_words': 'english', 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 
                       'kerasdensennwrapper__n_neuron': [40], 'kerasdensennwrapper__rate': 0.3}
modelEightBestParms = {'kerascnnwrapper__embeddingdim': 20, 'kerascnnwrapper__filtersizes': [2, 3, 4], 
                       'kerascnnwrapper__numfilters': 3, 'kerascnnwrapper__vocabsize': 1500}
modelNineBestParms = {'featureunion__pipeline__countvectorizer__ngram_range': (1, 2), 'featureunion__pipeline__countvectorizer__stop_words': 'english', 
                      'featureunion__pipeline__countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 'logisticregression__C': 0.7}
modelTenBestParms = {'featureunion__pipeline__countvectorizer__ngram_range': (1, 1), 'featureunion__pipeline__countvectorizer__stop_words': 'english', 
                     'featureunion__pipeline__countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b', 'xgbclassifier__learning_rate': 0.05, 
                     'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 50}


### Evaluate model performance with tuned parameters ###
opts = {'labelList': sentimentLabels}

# One entry per model, in model-number order:
# (model function, training inputs, test inputs, tuned parameters).
# Models 9 and 10 take the "*_withEmoji" inputs; all others take X_train / X_test.
evaluationRuns = [
    (modelOne, X_train, X_test, modelOneBestParms),
    (modelTwo, X_train, X_test, modelTwoBestParms),
    (modelThree, X_train, X_test, modelThreeBestParms),
    (modelFour, X_train, X_test, modelFourBestParms),
    (modelFive, X_train, X_test, modelFiveBestParms),
    (modelSix, X_train, X_test, modelSixBestParms),
    (modelSeven, X_train, X_test, modelSevenBestParms),
    (modelEight, X_train, X_test, modelEightBestParms),
    (modelNine, X_train_withEmoji, X_test_withEmoji, modelNineBestParms),
    (modelTen, X_train_withEmoji, X_test_withEmoji, modelTenBestParms),
]

# Collect plain records and build the frame once: DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0, and re-copies the whole frame on every call.
resultRows = []
for modelFun, trainInputs, testInputs, bestParms in evaluationRuns:
    acc, f1macro = modelFun(trainInputs, y_train, testInputs, y_test, tunedParms=bestParms, **opts)
    resultRows.append({'TestAcc': acc, 'TestF1Macro': f1macro})

finalResults = pd.DataFrame(resultRows, columns=['TestAcc', 'TestF1Macro']) # To store all results

display(finalResults)

===== 	Model 1	 =====
Test accuracy	0.593
F1 micro	0.593
F1 macro	0.558
Confusion matrix
 [[128  55  15]
 [ 90 189  57]
 [ 10  17  39]]
===== 	Model 2	 =====
Test accuracy	0.632
F1 micro	0.632
F1 macro	0.576
Confusion matrix
 [[114  79   5]
 [ 78 238  20]
 [  5  34  27]]
===== 	Model 3	 =====
Test accuracy	0.632
F1 micro	0.632
F1 macro	0.465
Confusion matrix
 [[ 83 115   0]
 [ 44 290   2]
 [  2  58   6]]
===== 	Model 4	 =====
Test accuracy	0.657
F1 micro	0.657
F1 macro	0.610
Confusion matrix
 [[127  57  14]
 [ 63 228  45]
 [  7  20  39]]
===== 	Model 5	 =====
Test accuracy	0.615
F1 micro	0.615
F1 macro	0.583
Confusion matrix
 [[127  55  16]
 [ 82 199  55]
 [  6  17  43]]
===== 	Model 6	 =====
Test accuracy	0.678
F1 micro	0.678
F1 macro	0.559
Confusion matrix
 [[ 86 109   3]
 [ 29 305   2]
 [  5  45  16]]
===== 	Model 7	 =====
Test accuracy	0.675
F1 micro	0.675
F1 macro	0.574
Confusion matrix
 [[ 99  95   4]
 [ 40 287   9]
 [  7  40  19]]
===== 	Model 8	 =====
Test accuracy	0.615
F1 mic

Unnamed: 0,TestAcc,TestF1Macro
0,0.593333,0.558261
1,0.631667,0.575903
2,0.631667,0.465238
3,0.656667,0.610012
4,0.615,0.582825
5,0.678333,0.558664
6,0.675,0.573531
7,0.615,0.494227
8,0.556667,0.52109
9,0.64,0.46037


### Output the best model for live prediction

In [29]:
# Refit Model 4 (CountVectorizer + logistic regression, per its tuned parameter keys)
# on the sentiment targets and hand it a pickle path via `saveModel`.
modelFour(
    X_train, y_train, X_test, y_test,
    tunedParms=modelFourBestParms,
    saveModel=os.path.join(outFolder, 'BestModel_Sentiment.pkl'),
)

===== 	Model 4	 =====
Test accuracy	0.657
F1 micro	0.657
F1 macro	0.610
Confusion matrix
 [[127  57  14]
 [ 63 228  45]
 [  7  20  39]]


(0.6566666666666666, 0.6100120621064423)