In [1]:
import numpy as np
import pandas as pd
import nltk
from glob import glob

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from tqdm import tqdm


from tensorflow.keras.preprocessing import sequence, text



# from emotaglish.plot_matrix import visualize_results

### Prep

In [2]:
files = glob("../data/project_data/data_*_oversampled.csv")
print(files)
test_df = pd.read_csv(files[0])
train_df = pd.read_csv(files[1])
val_df = pd.read_csv(files[2])

r_train_x, r_train_y = train_df['text'], train_df['sentiment']
r_val_x, r_val_y = val_df['text'], val_df['sentiment']
r_test_x, r_test_y = test_df['text'], test_df['sentiment']
len(r_train_x)

['../data/project_data\\data_test_oversampled.csv', '../data/project_data\\data_train_oversampled.csv', '../data/project_data\\data_val_oversampled.csv']


33500

In [3]:
x_train, y_train = train_df['text'], train_df['sentiment']
x_val, y_val =  val_df['text'], val_df['sentiment']
x_test, y_test = test_df['text'], test_df['sentiment']
len(r_train_x)

33500

## Tokenizing

In [4]:
# count words
def count_words(sentences):
    counts = {}
    for s in sentences:
        for word in s.split():
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
    return counts

max_len = 240
max_features = len(count_words(r_train_x))
print("max_len",max_len,"max_features", max_features)

max_len 240 max_features 37187


In [5]:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

data_tok = [tokenizer.tokenize(d.lower()) for d in x_train]
data_tok

[['inangyan', 'please', 'do', 'not', 'wad', 'study'],
 ['atm',
  'looking',
  'forward',
  'to',
  'amazing',
  'lesson',
  'to',
  'be',
  'heard',
  'later',
  'with',
  'fam',
  'is',
  'one',
  'of',
  'the',
  'reasons',
  'why',
  'i',
  'am',
  'thankful',
  'to',
  'the',
  'endlessgratitudemcgi',
  'revolving',
  'heashundred',
  'points',
  'proudobemcgi'],
 ['bro', 'i', 'swear', 'i', 'am', 'trying', 'to', 'really', 'study'],
 ['if',
  'grad',
  'school',
  'does',
  'not',
  'work',
  'out',
  'i',
  'will',
  'be',
  'a',
  'hot',
  'barista',
  'in',
  'my',
  'indie',
  'coffee',
  'shop',
  'chapter',
  '1',
  'of',
  'my',
  'wattpad',
  'story',
  'begins',
  'with',
  'a',
  'mysterious',
  'customer',
  'asking',
  'if',
  'my',
  'beans',
  'are',
  'ethically',
  'sourced',
  'jowk',
  'only',
  'ok',
  'back',
  '2',
  'lesson'],
 ['do', 'not', 'really', 'study', 'promise'],
 ['ayoko', 'to', 'study', 'pensive', 'face'],
 ['i',
  'want',
  'to',
  'study',
  'but',

### Count Words and Set word length and features
- change max len depending on the length of the sentences

## Word embedding
the n-grams, glove is just P(k|word) = word
unigram just one word


In [6]:
import gensim.downloader as api
# 27 Billion twitter corpus 100 Dimension 
model = api.load('glove-twitter-100')


In [7]:
import numpy as np
def get_phrase_embedding(phrase):   
    vector = np.zeros([model.vector_size], dtype='float32')
    phrase = phrase.lower()
    phrase_tokens = tokenizer.tokenize(phrase)
    
    # average word vectors for all words in tokenized phrase, skip words that are not in model's vocabulary
    divisor = 0
    for word in phrase_tokens:
        if word in model.vocab:
            divisor += 1
            vector = vector + model.get_vector(word)
    
    if divisor != 0: vector /= divisor
    
    return vector

In [8]:

vector_matrix_x_train = list(map(get_phrase_embedding, x_train))
vector_matrix_x_train

[array([-1.32891402e-01,  1.59600675e-01,  3.91086012e-01, -5.73319979e-02,
        -1.20607994e-01, -3.41680041e-03,  5.02066612e-01, -2.66426019e-02,
         3.69071573e-01,  5.38885951e-01, -1.55959994e-01, -6.52190030e-01,
        -4.21044064e+00,  2.22299412e-01, -2.14974970e-01, -1.29793599e-01,
        -6.34811968e-02, -4.13278013e-01, -1.13036655e-01, -3.14181983e-01,
        -1.33289009e-01, -1.28740400e-01, -1.53611809e-01,  4.35914218e-01,
        -6.39930069e-02, -5.51510274e-01,  1.96646005e-01,  7.13657886e-02,
         1.02017999e-01, -2.24410035e-02,  3.54142010e-01, -2.01819986e-02,
        -1.22811399e-01,  2.86620017e-02, -2.73492008e-01, -1.03194444e-02,
         2.44483992e-01,  5.98502047e-02,  1.29280403e-01,  8.40500072e-02,
        -1.01757789e+00,  1.36290014e-01,  2.51938045e-01, -1.51990801e-01,
         2.50469983e-01,  3.77674066e-02, -2.44843394e-01, -2.91305989e-01,
        -1.87498003e-01, -1.90432012e-01,  2.05228016e-01, -6.10660017e-02,
        -6.3

In [9]:
vector_matrix_x_val = list(map(get_phrase_embedding, x_val))
vector_matrix_x_val

[array([-7.85974413e-02,  1.23440616e-01,  8.80182013e-02,  1.44353569e-01,
        -5.84563166e-02,  1.03447869e-01,  4.62054349e-02,  2.96204060e-01,
        -9.44139287e-02, -2.16062013e-02,  9.65871438e-02, -2.82850832e-01,
        -4.14569187e+00, -1.61389872e-01,  5.78422397e-02, -8.93152431e-02,
         6.55036867e-02, -2.85183787e-01, -4.79651630e-01,  1.01758525e-01,
        -3.24972242e-01,  3.92607693e-03,  1.31198421e-01,  2.05281317e-01,
        -2.55518332e-02, -2.64281631e-01,  1.65082246e-01,  6.97986931e-02,
        -8.73848423e-03,  4.58901115e-02, -6.30586892e-02,  2.91440766e-02,
        -3.77323657e-01, -1.62824079e-01,  1.93644404e-01,  5.64495139e-02,
        -2.08617561e-02,  1.55459568e-01, -5.93285598e-02, -1.71168521e-01,
        -6.51905239e-01, -1.22596219e-01,  8.15743953e-02,  1.38531089e-01,
         2.54201382e-01, -1.47683918e-01,  1.28902346e-01,  2.90333033e-01,
         8.88271704e-02,  1.84445679e-01,  1.03487492e-01, -1.31201208e-01,
         2.3

In [10]:
vector_matrix_x_test = list(map(get_phrase_embedding, x_test))
vector_matrix_x_test

[array([ 2.08203524e-01,  1.29121765e-01,  1.54559733e-02,  2.78440416e-01,
        -6.49257824e-02,  4.39317934e-02,  5.44303320e-02, -3.35007131e-01,
         9.24618691e-02, -6.39668554e-02,  7.53955245e-02, -1.65793762e-01,
        -4.29886198e+00, -6.07661046e-02, -7.07801953e-02, -1.68177292e-01,
         2.90579386e-02,  1.18672766e-03, -1.59883767e-01,  1.40486900e-02,
        -1.65798217e-01,  6.74222559e-02, -9.79262665e-02, -1.89602412e-02,
        -6.06207624e-02, -6.74506545e-01,  1.50461584e-01,  2.65598577e-02,
         1.30263641e-01,  1.65795624e-01, -1.40513957e-01, -3.84823456e-02,
        -1.78278223e-01, -5.43226935e-02,  1.25616327e-01,  7.25105703e-02,
         1.25958905e-01,  1.70306697e-01,  9.43208858e-02, -1.53390691e-01,
        -7.26490736e-01,  6.78993464e-02, -5.28204367e-02, -2.54983082e-02,
         5.02891064e-01,  1.47729302e-02, -5.25929928e-02,  6.20252229e-02,
         8.13897140e-03, -1.22262381e-01,  1.13458082e-01,  7.12280199e-02,
        -1.6

## MODEL AND TRAINING WITH HYPERPARAMETER TUNING

In [11]:

def get_pred_score(model, x_train, y_train, x_val, y_val, x_test, y_test):
    # input the scores here for comparison
    model.fit(x_train,y_train)
    
    # evaluation
#     print("Evalusation")
#     eval_train = model.evaluate(x_train, y_train)
#     eval_val = model.evaluate(x_val, y_val)
#     eval_test = model.evaluate(x_test, y_test)
    
    y_pred_train = model.predict(x_train)
    y_pred_val = model.predict(x_val)
    y_pred_test = model.predict(x_test)
    
    train_acc = accuracy_score(y_train, y_pred_train)
    val_acc = accuracy_score(y_val, y_pred_val)
    test_acc = accuracy_score(y_test, y_pred_test)
    
    train_f1 = f1_score(y_train, y_pred_train, average='macro')
    val_f1 = f1_score(y_val, y_pred_val, average='macro')
    test_f1 = f1_score(y_test, y_pred_test, average='macro')

    train_cl_rep = classification_report(y_train, y_pred_train)
    val_cl_rep = classification_report(y_val, y_pred_val)
    test_cl_rep = classification_report(y_test, y_pred_test)
    
    
    print(f"train acc: {train_acc}, val acc: {val_acc}, test acc: {test_acc}")
    print(f"train F1: {train_f1}, val F1: {val_f1}, test F1: {test_f1}")
#     print(f"EVALUATION :: train: {eval_train}, val : {eval_val}, test : {eval_test}" )

    # (accuracy, F1, classification report)
    train_score = (train_acc, train_f1, train_cl_rep)
    val_score = (val_acc, val_f1, val_cl_rep)
    test_score = (test_acc, test_f1, test_cl_rep)
    

    return train_score, val_score, test_score

def get_best_param_and_score(model_name, current_param, train_score, val_score, test_score, best_score, best_perf, best_param):
    test_acc = test_score[0] # acc = [0],  f1 = [1], class report = [2]
    
    if test_acc >= best_score: 
        best_score = test_acc # macro?
        best_perf.update({"train_score":train_score,"val_score":val_score,"test_score":test_score})
        
        # save the current parameters
        if model_name == "SVC":
            best_param.update({'C': current_param[0], 'gamma':current_param[1],'kernel': current_param[2]})
        if model_name == "BernoulliNB":
            best_param.update({'alpha': current_param})
        if model_name == "RandomForestClassifier":
            best_param.update({'max_depth': current_param[0], 'min_samples_leaf':current_param[1],
                               'min_samples_split': current_param[2], 'n_estimators':current_param[3]})
    
    return best_param, best_perf, best_score

def print_performance(best_BNB):
    print("\n===========================================")
    perf_list = ['train_score', 'val_score','test_score']
    for pname in perf_list:
        print(pname)
        perf_scores = best_BNB["Score"][pname]
        print(f"Accuracy: {perf_scores[0]}, Macro F1-Score: {perf_scores[1]}")
        print(perf_scores[2])# classification report
    print(f"Best Parameter: {best_BNB['Parameter']}")
    print("\n===========================================")
    
def model_BernoulliNB(model_name, hyperparam):
    best_score = 0
    best_perf = {}
    best_param = {}
    for alpha in hyperparam['alpha']:
        print("____________________________________________________________________")
        print(f"PARAM :: alpha:{alpha}")
        model = BernoulliNB(alpha = alpha) # model
        train_score, val_score, test_score = get_pred_score(model,
                                             vector_matrix_x_train, y_train,
                                             vector_matrix_x_val, y_val, 
                                             vector_matrix_x_test, y_test)
        # (Accuracy, F1, cls rep)
        best_param, best_perf, best_score = get_best_param_and_score(
                                                model_name, alpha, 
                                                train_score, val_score,
                                                test_score, best_score, 
                                                best_perf, best_param)
    best_BNB = {'Parameter': best_param, 'Score':best_perf}
    return best_BNB

def model_SVC(model_name, hyperparam):
    best_score = 0
    best_perf = {}
    best_param = {}
    for c in hyperparam['C']:
        for gamma in hyperparam['gamma']:
            for kernel in hyperparam['kernel']:
                model = SVC(C = c, gamma = gamma, kernel = kernel)
                print("____________________________________________________________________")
                print(f"PARAM :: C:{c}, gamma:{gamma}, kernel:{kernel}")
                train_score, val_score, test_score = get_pred_score(model,
                                                     vector_matrix_x_train, y_train,
                                                     vector_matrix_x_val, y_val, 
                                                     vector_matrix_x_test, y_test)
                # (Accuracy, F1, cls rep)
                current_param = (c, gamma, kernel)
                best_param, best_perf, best_score = get_best_param_and_score(
                                                        model_name, current_param, 
                                                        train_score, val_score,
                                                        test_score, best_score, 
                                                        best_perf, best_param)
    best_SVC = {'Parameter': best_param, 'Score':best_perf}
    return best_SVC

def model_RFC(model_name, hyperparam):
    best_score = 0
    best_perf = {}
    best_param = {}
    for max_depth in hyperparam['max_depth']:
        for min_samples_leaf in hyperparam['min_samples_leaf']:
            for min_samples_split in hyperparam['min_samples_split']:
                for n_estimators in hyperparam['n_estimators']:
                    print("____________________________________________________________________")
                    print(f"PARAM :: max_depth:{max_depth}, min_samples_leaf:{min_samples_leaf}, min_samples_split:{min_samples_split}, n_estimators:{n_estimators} ")
                    model = RandomForestClassifier(max_depth = max_depth,
                        min_samples_leaf = min_samples_leaf,
                        min_samples_split = min_samples_split,
                        n_estimators = n_estimators)
                    train_score, val_score, test_score = get_pred_score(model,
                                                         vector_matrix_x_train, y_train,
                                                         vector_matrix_x_val, y_val, 
                                                         vector_matrix_x_test, y_test)
                    # (Accuracy, F1, cls rep)
                    current_param = (max_depth, min_samples_leaf, min_samples_split, n_estimators)
                    best_param, best_perf, best_score = get_best_param_and_score(
                                                            model_name, current_param, 
                                                            train_score, val_score,
                                                            test_score, best_score, 
                                                            best_perf, best_param)  
    best_RFC = {'Parameter': best_param, 'Score':best_perf}
    return best_RFC
        
"""
Insert the brute force hyperparameter loops here (Models)

"""

    
    
def hyper_param_tuning(model_name, hyperparam):
    best_param = {}
    best_perf = {}
    best_acc_score = 0
    # study different types of classifier
    if model_name == "SVC": 
        best_SVC = model_SVC(model_name, hyperparam)
        print_performance(best_SVC)
#         for c in hyperparam['C']:
#             for gamma in hyperparam['gamma']:
#                 for kernel in hyperparam['kernel']:
#                     model = SVC(C = c, gamma = gamma, kernel = kernel)
#                     clf_res, best_param = get_best_param(model_name, model)
                    
    if model_name == "BernoulliNB":
        best_BNB = model_BernoulliNB(model_name, hyperparam)
        print_performance(best_BNB)
#         for alpha in hyperparam['alpha']:
#             model = BernoulliNB(alpha = alpha)
#             train_score, val_score, test_score = get_pred_score(model_name, model,
#                                                  vector_matrix_x_train, y_train,
#                                                  vector_matrix_x_val, y_val, 
#                                                  vector_matrix_x_test, y_test)
#             # get best parameter based from test_acc
#             if test_acc >= best_acc_score: 
#                 best_acc_score = test_acc
#                 best_scores.update({"train_score":train_score,"val_score":val_score,"test_score":test_score})
#                 best_param.update({""})
        
    if  model_name == "RandomForestClassifier":
        best_RFC = model_RFC(model_name, hyperparam)
        print_performance(best_RFC)        
#         for max_depth in hyperparam['max_depth']:
#             for min_samples_leaf in hyperparam['min_samples_leaf']:
#                 for min_samples_split in hyperparam['min_samples_split']:
#                     for n_estimators in hyperparam['n_estimators']:
#                         model = RandomForestClassifier(max_depth = max_depth,
#                             min_samples_leaf = min_samples_leaf,
#                             min_samples_split = min_samples_split,
#                             n_estimators = n_estimators)
#                         clf_res, best_param = get_best_param(model_name, model)
    

In [12]:
# hyper parameter tuning
hyper_param = {
    "BNB": {
        'alpha' : [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    },
    "SVC": {
        'C': [1],#,10,100],#[0.1, 1, 10, 100, 1000], # regularization # 'C':[100],#
        'gamma': [0.1,0.01,0.001],#[1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient # 'gamma':[0.0001],#
        'kernel': ['rbf'] # radial basis function # 'kernel':['rbf']#
    },
    "RFC": {
    'bootstrap': [True], #[True, False],
     'max_depth': [10, 20, 30, 40, 50, 60], #70, 80, 90, 100, None],
     'max_features': ['auto'],#['auto', 'sqrt'],
     'min_samples_leaf': [1, 2],#, 4],
     'min_samples_split': [2, 5],#, 10],
     'n_estimators': [200, 400, 600, 800, 1000]#, 1200, 1400, 1600, 1800, 2000]
    }
}


In [13]:
# hyper parameter tuning
hyper_param = {
    "BNB": {
        'alpha' : [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    },
    "SVC": {
        'C': [1],#,10,100],#[0.1, 1, 10, 100, 1000], # regularization # 'C':[100],#
        'gamma': [0.1,0.01,0.001],#[1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient # 'gamma':[0.0001],#
        'kernel': ['rbf'] # radial basis function # 'kernel':['rbf']#
    },
    "RFC": {
    'bootstrap': [True], #[True, False],
     'max_depth': [None],#, 80, 90, 100, None],
     'max_features': ['auto'],#['auto', 'sqrt'],
     'min_samples_leaf': [1],#4],
     'min_samples_split': [2],#, 10],
     'n_estimators': [500, 1000, 2000]#[3000, 6000],#200, 400, 600, 800, 1000]#, 1200, 1400, 1600, 1800, 2000]
    }
}
hyper_param_tuning('RandomForestClassifier', hyper_param['RFC'])

____________________________________________________________________
PARAM :: max_depth:None, min_samples_leaf:1, min_samples_split:2, n_estimators:500 
train acc: 0.9993134328358209, val acc: 0.48435634105564845, test acc: 0.49008836876044903
train F1: 0.9993134238948942, val F1: 0.4780478966652521, test F1: 0.4843277566943188
____________________________________________________________________
PARAM :: max_depth:None, min_samples_leaf:1, min_samples_split:2, n_estimators:1000 
train acc: 0.9993134328358209, val acc: 0.4810126582278481, test acc: 0.49964174826845
train F1: 0.9993134083065407, val F1: 0.47473129412761644, test F1: 0.4913340271582071
____________________________________________________________________
PARAM :: max_depth:None, min_samples_leaf:1, min_samples_split:2, n_estimators:2000 
train acc: 0.9993134328358209, val acc: 0.480057320277048, test acc: 0.5034631000716503
train F1: 0.9993134082942301, val F1: 0.47520937269430263, test F1: 0.49610665873024234

train_score

In [None]:
# hyper parameter tuning
hyper_param = {
    "BNB": {
        'alpha' : [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    },
    "SVC": {
        'C': [1],#,10,100],#[0.1, 1, 10, 100, 1000], # regularization # 'C':[100],#
        'gamma': [0.1,0.01,0.001],#[1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient # 'gamma':[0.0001],#
        'kernel': ['rbf'] # radial basis function # 'kernel':['rbf']#
    },
    "RFC": {
    'bootstrap': [True], #[True, False],
     'max_depth': [None],#, 80, 90, 100, None],
     'max_features': ['auto'],#['auto', 'sqrt'],
     'min_samples_leaf': [4],#4],
     'min_samples_split': [8],#, 10],
     'n_estimators': [1000, 2000]#[3000, 6000],#200, 400, 600, 800, 1000]#, 1200, 1400, 1600, 1800, 2000]
    }
}
hyper_param_tuning('RandomForestClassifier', hyper_param['RFC'])

____________________________________________________________________
PARAM :: max_depth:None, min_samples_leaf:4, min_samples_split:8, n_estimators:1000 
train acc: 0.9946567164179104, val acc: 0.470026271793647, test acc: 0.48316216861714834
train F1: 0.9946617101943328, val F1: 0.464721151944621, test F1: 0.47563875053577975
____________________________________________________________________
PARAM :: max_depth:None, min_samples_leaf:4, min_samples_split:8, n_estimators:2000 


In [None]:
#     for max_depth in hyperparam['max_depth']:
#         for min_samples_leaf in hyperparam['min_samples_leaf']:
#             for min_samples_split in hyperparam['min_samples_split']:
#                 for n_estimators in hyperparam['n_estimators']:

hyper_param = {
    "BNB": {
        'alpha' : [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    },
    "SVC": {
        'C': [1],#,10,100],#[0.1, 1, 10, 100, 1000], # regularization # 'C':[100],#
        'gamma': [0.1,0.01,0.001],#[1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient # 'gamma':[0.0001],#
        'kernel': ['rbf'] # radial basis function # 'kernel':['rbf']#
    },
    "RFC": {
    'bootstrap': [True], #[True, False],
     'max_depth': [10, 20, 30, 40, 50, 60], #70, 80, 90, 100, None],
     'max_features': ['auto'],#['auto', 'sqrt'],
     'min_samples_leaf': [1, 2],#, 4],
     'min_samples_split': [2, 5],#, 10],
     'n_estimators': [200, 400, 600, 800, 1000]#, 1200, 1400, 1600, 1800, 2000]
    }
}


hyper_param_tuning('RandomForestClassifier', hyper_param['RFC'])

In [24]:
hyper_param_tuning('BernoulliNB', hyper_param['BNB']) # 'alpha': 10.0 = BEST

____________________________________________________________________
PARAM :: alpha:0.0001
train acc: 0.3676716417910448, val acc: 0.3377119656078338, test acc: 0.34869835204203486
train F1: 0.3603125430726723, val F1: 0.33310983847357845, test F1: 0.34250919965501664
____________________________________________________________________
PARAM :: alpha:0.001
train acc: 0.3676716417910448, val acc: 0.3377119656078338, test acc: 0.34869835204203486
train F1: 0.3603125430726723, val F1: 0.33310983847357845, test F1: 0.34250919965501664
____________________________________________________________________
PARAM :: alpha:0.01
train acc: 0.3676716417910448, val acc: 0.3377119656078338, test acc: 0.34869835204203486
train F1: 0.3603125430726723, val F1: 0.33310983847357845, test F1: 0.34250919965501664
____________________________________________________________________
PARAM :: alpha:0.1
train acc: 0.3676716417910448, val acc: 0.3377119656078338, test acc: 0.34869835204203486
train F1: 0.360312

In [None]:
print("hello")

In [None]:
"SVC": {
    'C': [1,10,100],#[0.1, 1, 10, 100, 1000], # regularization # 'C':[100],#
    'gamma': [1,0.1,0.01,0.001],#[1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient # 'gamma':[0.0001],#
    'kernel': ['rbf'] # radial basis function # 'kernel':['rbf']#
},
model = SVC()
model.fit(vector_matrix_x_train, y_train)
y_pred = model.predict(vector_matrix_x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
hyper_param = {
    "BNB": {
        'alpha' : [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    },
    "SVC": {
        'C': [10],#,10,100],#[0.1, 1, 10, 100, 1000], # regularization # 'C':[100],#
        'gamma': [0.5],#0.01,0.001],#[1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient # 'gamma':[0.0001],#
        'kernel': ['rbf'] # radial basis function # 'kernel':['rbf']#
    },
    "RFC": {
    'bootstrap': [True], #[True, False],
     'max_depth': [10, 20, 30, 40, 50, 60], #70, 80, 90, 100, None],
     'max_features': ['auto'],#['auto', 'sqrt'],
     'min_samples_leaf': [1, 2],#, 4],
     'min_samples_split': [2, 5],#, 10],
     'n_estimators': [200, 400, 600, 800, 1000]#, 1200, 1400, 1600, 1800, 2000]
    }
}
hyper_param_tuning('SVC', hyper_param['SVC']) # {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
# C = 10

____________________________________________________________________
PARAM :: C:10, gamma:0.5, kernel:rbf


In [29]:
hyper_param = {
    "BNB": {
        'alpha' : [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    },
    "SVC": {
        'C': [10],#,10,100],#[0.1, 1, 10, 100, 1000], # regularization # 'C':[100],#
        'gamma': [1, 0.1,0.01,0.001],#[1, 0.1, 0.01, 0.001, 0.0001], # kernel coefficient # 'gamma':[0.0001],#
        'kernel': ['rbf'] # radial basis function # 'kernel':['rbf']#
    },
    "RFC": {
    'bootstrap': [True], #[True, False],
     'max_depth': [10, 20, 30, 40, 50, 60], #70, 80, 90, 100, None],
     'max_features': ['auto'],#['auto', 'sqrt'],
     'min_samples_leaf': [1, 2],#, 4],
     'min_samples_split': [2, 5],#, 10],
     'n_estimators': [200, 400, 600, 800, 1000]#, 1200, 1400, 1600, 1800, 2000]
    }
}
hyper_param_tuning('SVC', hyper_param['SVC']) # {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
# C = 10

____________________________________________________________________
PARAM :: C:10, gamma:0.1, kernel:rbf
train acc: 0.6798805970149254, val acc: 0.5447814664437545, test acc: 0.5550513494148555
train F1: 0.6798314403014106, val F1: 0.5384731506084557, test F1: 0.5519120196635996
____________________________________________________________________
PARAM :: C:10, gamma:0.01, kernel:rbf
train acc: 0.49916417910447763, val acc: 0.4731311201337473, test acc: 0.4910437067112491
train F1: 0.49864161343490193, val F1: 0.46628020038567064, test F1: 0.48720942816992396
____________________________________________________________________
PARAM :: C:10, gamma:0.001, kernel:rbf
train acc: 0.46325373134328357, val acc: 0.45044184380224506, test acc: 0.46286123716264627
train F1: 0.46250869701398994, val F1: 0.4463078194343895, test F1: 0.46072944286512685

train_score
Accuracy: 0.6798805970149254, Macro F1-Score: 0.6798314403014106
              precision    recall  f1-score   support

       Angry

In [27]:

hyper_param_tuning('SVC', hyper_param['SVC']) # {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
# C = 1

____________________________________________________________________
PARAM :: C:1, gamma:0.1, kernel:rbf
train acc: 0.5493432835820895, val acc: 0.5065679484117507, test acc: 0.5170766658705517
train F1: 0.549140290083326, val F1: 0.5008412742546356, test F1: 0.5115574587499097
____________________________________________________________________
PARAM :: C:1, gamma:0.01, kernel:rbf
train acc: 0.4655223880597015, val acc: 0.4509195127776451, test acc: 0.4669214234535467
train F1: 0.4648768119523565, val F1: 0.4468150866712217, test F1: 0.46479840706038367
____________________________________________________________________
PARAM :: C:1, gamma:0.001, kernel:rbf
train acc: 0.41008955223880594, val acc: 0.389777883926439, test acc: 0.4048244566515405
train F1: 0.4071415937592612, val F1: 0.38641851510613584, test F1: 0.4005622668517536

train_score
Accuracy: 0.5493432835820895, Macro F1-Score: 0.549140290083326
              precision    recall  f1-score   support

       Angry       0.60 

In [15]:
hyper_param_tuning('RandomForestClassifier', hyper_param['RFC'])

____________________________________________________________________
train acc: 0.733044776119403, val acc: 0.4389777883926439, test acc: 0.4511583472653451
train F1: 0.7350573118262183, val F1: 0.43269871771654866, test F1: 0.4429612137732025
____________________________________________________________________
train acc: 0.7339701492537314, val acc: 0.4380224504418438, test acc: 0.4528301886792453
train F1: 0.7359959040182151, val F1: 0.43303408731138465, test F1: 0.4460551125187271
____________________________________________________________________
train acc: 0.7379701492537314, val acc: 0.4451874850728445, test acc: 0.4592787198471459
train F1: 0.7401164075980213, val F1: 0.4392912709274732, test F1: 0.450904271596359
____________________________________________________________________
train acc: 0.7387164179104477, val acc: 0.43754478146644377, test acc: 0.459995223310246
train F1: 0.7408803783948578, val F1: 0.43205943715781514, test F1: 0.45266291822739985


KeyboardInterrupt: 

## NO HYPER PARAMETERS

In [None]:
model = SVC()
model.fit(vector_matrix_x_train, y_train)
y_pred = model.predict(vector_matrix_x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.4618729616400062
              precision    recall  f1-score   support

       Angry       0.21      0.34      0.26       436
        Fear       0.49      0.47      0.48      1836
       Happy       0.46      0.56      0.50       862
         Sad       0.47      0.40      0.43      1555
    Surprise       0.54      0.48      0.51      1750

    accuracy                           0.46      6439
   macro avg       0.43      0.45      0.44      6439
weighted avg       0.48      0.46      0.47      6439



In [None]:
#SVC with Hyperparamter Tuning
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001, 0.0001],'kernel': ['rbf']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(vector_matrix_x_train, y_train)



Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=11.9min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=12.0min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=12.7min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=12.7min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=13.4min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=12.1min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=12.4min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=12.3min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=12.1min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=11.8min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=13.9min
[CV] END ......................C=0.1, gamma=0.0

In [None]:
print(grid.best_estimator_)

grid_predictions = grid.predict(x_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))#Output

NameError: ignored

In [None]:
model = BernoulliNB()
model.fit(vector_matrix_x_train, y_train)
y_pred = model.predict(vector_matrix_x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.3228762230159963
              precision    recall  f1-score   support

       Angry       0.12      0.42      0.19       436
        Fear       0.43      0.34      0.38      1836
       Happy       0.34      0.47      0.39       862
         Sad       0.31      0.25      0.28      1555
    Surprise       0.44      0.27      0.33      1750

    accuracy                           0.32      6439
   macro avg       0.33      0.35      0.32      6439
weighted avg       0.37      0.32      0.33      6439



In [None]:
model = RandomForestClassifier()
model.fit(vector_matrix_x_train, y_train)
y_pred = model.predict(vector_matrix_x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.43003571983227207
              precision    recall  f1-score   support

       Angry       0.21      0.40      0.28       436
        Fear       0.47      0.45      0.46      1836
       Happy       0.41      0.49      0.45       862
         Sad       0.43      0.38      0.41      1555
    Surprise       0.52      0.43      0.47      1750

    accuracy                           0.43      6439
   macro avg       0.41      0.43      0.41      6439
weighted avg       0.45      0.43      0.44      6439



In [None]:
#Random Forest with SVC
param_grid = {
    'bootstrap': [True, False],
     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
     'max_features': ['auto', 'sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10],
     'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    }
grid_predictions = grid.predict(x_test)
grid_search_forest = GridSearchCV(forest, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search_forest.fit(vector_matrix_x_train, grid_predictions)

In [None]:
grid_predictions = grid.predict(x_test)
print(classification_report(y_test,grid_predictions))

RandomForestClassifier()