Main reference:

https://medium.com/swlh/natural-language-processing-nlp-analysis-with-amazon-review-data-part-i-data-engineering-6573b782e4dc
https://melaniesoek0120.medium.com/natural-language-processing-nlp-amazon-review-data-part-ii-eda-data-preprocessing-and-model-3866dcbdbb77 


## Read and DropNan

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer as ps
from nltk.stem.wordnet import WordNetLemmatizer


df = pd.read_csv(r"C:\Users\Lim Jia Hui\Desktop\songsdata.csv")
print(len(df.reviewText))
print(df.head())
nanlist = []
for text in range(len(df.reviewText)):
    if type(df.reviewText[text])!= str:
        nanlist.append(int(text))
print(nanlist)
df.drop(index=nanlist, axis=0,inplace=True)
df.reset_index(inplace=True)


93907
   overall                                         reviewText
0        5  This is awesome to listen to, A must-have for ...
1        5                                               bien
2        5  It was great to hear the old stuff again and I...
3        4  well best of's are a bit poison normally but t...
4        5  What can I say? This is Casting Crowns!!!This ...
[2165, 54714]


## Change Labels to Binary

In [2]:
for idx in range(df.shape[0]):
    if df.loc[idx,'overall'] <= 3:
        df.loc[idx,'overall'] = 0
    if df.loc[idx,'overall'] > 3:
        df.loc[idx,'overall'] = 1
print(df['overall'].head())

0    1
1    1
2    1
3    1
4    1
Name: overall, dtype: int64


# Feature Extraction

## Stopwords

In [3]:
sw_list = stopwords.words('english')
sw_list += list(string.punctuation)
sw_list += ["''", '""', '...', '``', '’', '“', '’', '”', '‘', '‘',"'", '©','said',"'s", "also",'one',"n't",'com', '-', '–', '—', '_',"/"]

## Tokenizer, Stemmer, and Lemmatizer

In [4]:
def Tokenizer(data):
    tokens = nltk.word_tokenize(data)
    tokenized_data= []
    for i in tokens:
        if i.lower() not in sw_list:
            tokenized_data.append(i.lower())
    return tokenized_data

def Stemmer(data2):
    stemmed_data =[]
    for j in data2:
        stemmed_data.append(ps().stem(j))
    return stemmed_data

def Lemmatizer(data3):
    lemmatized_data = []
    for k in data3:
        lemmatized_data.append(WordNetLemmatizer().lemmatize(k, pos='v'))
    return lemmatized_data



lemmatized_reviews = list(map(Lemmatizer,(map(Stemmer, (map(Tokenizer, df['reviewText']))))))

print(lemmatized_reviews[:5])



[['awesom', 'listen', 'must-hav', 'slayer', 'fan', '..', 'sadli', 'need', 'tripl', 'disc', 'set', '..', 'mani', 'hit'], ['bien'], ['great', 'hear', 'old', 'stuff', 'like', 'new', 'stuff', 'recommend', 'slayer', 'fan'], ['well', 'best', 'bite', 'poison', 'normal', 'bad', 'pretti', 'good', "'d", 'put', '90', 'hell', 'await', 'reign', 'blood', 'south', 'season', 'divin', 'coupl', 'musica', 'track', 'everyth', 'god', 'hate', '-at', 'point', 'best', 'mean', 'everi', 'cd', 'mainli', 'bad', 'dose', 'put', 'great', 'track', 'live', 'show', 'play', 'much', 'like,213', 'skeleton', 'societi', 'sex', 'murder', 'art', 'gemini', 'rare', 'track', 'final', 'six', 'bonu', 'track', 'christ', 'illus', 'mysteri', 'cover', 'song', 'unditstput', 'attitud', 'cd', 'would', 'greatest', 'hit', 'collect', 'know', 'put', 'coupl', 'live', 'track', 'too.al', 'could', 'much', 'wors', 'great', 'car'], ['say', 'cast', 'crown', 'good', 'bless', 'fill', 'cd']]


## Post Lemmatized Nan Drops

In [5]:
nanlist2 = []
for word in range(len(lemmatized_reviews)):
    if len(lemmatized_reviews[word]) == 0:
        nanlist2.append(word)
print(nanlist2)
print(len(nanlist2))
print(len(lemmatized_reviews))
for nan in nanlist2:
    lemmatized_reviews.remove([])
print(len(lemmatized_reviews))



[293, 1792, 1942, 2652, 3560, 3857, 5025, 5122, 5568, 6552, 7846, 8136, 9464, 10069, 11448, 11803, 12422, 12825, 13354, 14932, 18283, 28588, 30450, 32356, 32601, 33455, 35223, 35698, 35703, 37831, 37910, 39228, 40752, 42037, 43383, 44559, 45716, 48627, 50097, 52295, 52380, 52690, 56502, 59520, 60623, 61150, 61165, 62528, 65639, 66416, 69224, 70043, 70892, 71015, 71677, 72711, 73298, 73509, 73897, 75327, 75406, 76099, 77468, 77836, 77867, 80751, 81848, 82071, 82843, 85562, 86105, 86398, 88067, 88161, 88665, 89174, 89212, 90531, 91580, 92817]
80
93905
93825


### Removal from original df

In [6]:
df.drop(index=nanlist2, axis=0,inplace=True)
df.reset_index(inplace=True)


## Train-Test and Train-Val-Test Split

In [7]:
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(pd.DataFrame({'processed_reviews':lemmatized_reviews}), df['overall'].to_frame(name='overall'), test_size=0.2, random_state=0)

x_train_train, x_val, y_train_train, y_val = train_test_split(x_train,y_train,test_size=0.2,stratify=y_train, random_state=0)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

print(x_train_train.shape)
print(x_val.shape)
print(y_train_train.shape)
print(y_val.shape)



(75060, 1)
(18765, 1)
(75060, 1)
(18765, 1)
(60048, 1)
(15012, 1)
(60048, 1)
(15012, 1)


## Resampling


In [8]:
# optional "manual" resampler with sklearn.utils.resample

# from sklearn.utils import resample

# xy_train = pd.concat([x_train,y_train], axis=1)
# xy_train_train = pd.concat([x_train_train,y_train_train], axis=1)

# def resampling(minority):
#    resampled_minority = resample(minority, replace=True, n_samples= len(xy_train[xy_train.overall==5]), random_state=0)
#    return resampled_minority

# resampled_xy_train = xy_train[xy_train.overall==5]
# resampled_xy_train_train = xy_train_train[xy_train_train.overall==5]

# for i in range (1,5):
#    resampled_xy_train = pd.concat([resampled_xy_train,resampling(xy_train[xy_train.overall==i])])

# for i in range (1,5):
#    resampled_xy_train_train = pd.concat([resampled_xy_train_train,resampling(xy_train_train[xy_train_train.overall==i])])

# resampled_shuffled_xy_train = resampled_xy_train.sample(frac=1)
# x_train = resampled_shuffled_xy_train['processed_reviews'].to_frame()
# y_train = resampled_shuffled_xy_train['overall'].to_frame()

# resampled_shuffled_xy_train_train = resampled_xy_train_train.sample(frac=1)
# x_train_train = resampled_shuffled_xy_train_train['processed_reviews'].to_frame()
# y_train_train = resampled_shuffled_xy_train_train['overall'].to_frame()

In [9]:
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler

x_train, y_train = RandomOverSampler(random_state=0).fit_resample(x_train,y_train)

x_train_train, y_train_train = RandomOverSampler(random_state=0).fit_resample(x_train_train,y_train_train)



## Tf-Idf Vectorization

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

def dummy_token_and_pre(doc):
    return doc

tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy_token_and_pre, preprocessor=dummy_token_and_pre, token_pattern=None)

tfidf_val = TfidfVectorizer(analyzer='word', tokenizer=dummy_token_and_pre, preprocessor=dummy_token_and_pre, token_pattern=None)

x_trainvec = tfidf.fit_transform(x_train.processed_reviews).sorted_indices()
x_testvec = tfidf.transform(x_test.processed_reviews).sorted_indices()

x_train_trainvec = tfidf_val.fit_transform(x_train_train.processed_reviews).sorted_indices()
x_valvec = tfidf_val.transform(x_val.processed_reviews).sorted_indices()

## Stacking

### TF MLP

In [11]:
#pseudocode for mlp hyperparam tuning

#obj()
    #param = suggest()

    #model = model()

    #for epochs in range()
    #            agg_f1 = []

    #    for train_index, val_index in StratifiedKFold(n_splits=5).split(x_trainvec,y_train): 

    #        x_trainvec_resd, y_train_resd = RandomOverSampler(random_state=0).fit_resample(x_trainvec.iloc[train_index], y_train.iloc[train_index])    

    #        model.fit(x=x_trainvec_resd, y=y_train_resd, epochs =(5^(rung)), verbose=0, batch_size= 100, random_state=0, callbacks=es)

    #        agg_f1.append(f1_score(y_pred=model.predict(pd.DataFrame.sparse.from_spmatrix(x_trainvec).iloc[val_index]), y_true=y_train.iloc[val_index].to_numpy().ravel(), average = 'macro'))

    #    intermediate_value = statistics.mean(agg_f1)



#rant: The challenge in implementing cross-validation for a tf model is the use of a tf model with (a lack of)sklearn functionalities
#      In this case, tf models have some hyperparams that need do be defined in .fit(), which clashes with sklearn functionalities 
#      as some sklearn functions have .fit() implicit in them. The important one here is cross_val_score(). Without this, one has to
#      resort to manually implementing the (stratified)cross_val function. This is a further issue when the data is imbalanced.
#      From googling it seems that the way to treat imbalanced data in cross-validation is to oversample the training folds
#      for each cross-validation split. This can normally be combined into a sklearn/imblearn pipeline and then passed to the
#      cross-validation function as seen in the random forest model. However, as mentioned this is a tf model with sklearn functionalities,
#      hence one has to implement this part manually as well. This is a bigger conundrum that one might initially expect.
#      As the method used is usually to slice out the training part of each cross-validation fold and then apply an oversampling function.
#      However, slicing a sparse matrix with a 1d np array as input is something that googling doesnt show how to do.
#      Now, you might think of transforming the sparse matrix into a pd df so that you could slice it with say iloc, but  
#      because of an unknown reason while the oversampling function as mentioned previously is supposed to be able to take 
#      both pandas df and sparse matrix as inputs, the pandas df version of a sparse matrix is not a valid input. This is vile shenanigans 
#      and I am utterly repulsed by it. Yet, given all this there is still the option of moving the tf-idf vectorization into the 
#      cross-validation part so that the sparse matrix is only made post oversampling, or making your own cross-validation
#      function with libraries using an updated version of pandas/scipy(both of which I will NOT do).
#  

#      Now one might be wondering why one should use a tf model anyways given its imperfect compatabiities(at least in cross-validation).  
#      The answer is GPU. Sklearn does not offer GPU support for training its models. The increase in training speed given GPU support is  
#      highly considerable, especially if you can parellelize your processes. However, given that cross-validation is such a pain to implement, 
#      one might ask whether its still worth using even  if you have to redo the tf-idf vectorization every split for every epoch 
#      for every set of hyperparameters. The answer is something I do not know and will not attempt to know. Hence, the question is then
#      rephrased into whether the extra speed from using the GPU is worth the decrease in samples used for the training set as some samples need
#      to be separated into a validation set. Admittedly, I personally  answered this question rather arbitrarily and perhaps personally biased to the
#      supposed superiority of Optuna in flexibility and (maybe)speed over sklearn hyperparameter tuning.
#      Hence, I can't exactly justify my choice here, and what I do here is best treated just as a proof of concept from a tired individual.
            


In [12]:
import tensorflow as tf
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.optimizer_v2.gradient_descent import SGD
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
import statistics
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from optuna.integration import TFKerasPruningCallback
from optuna.trial import TrialState
from tensorflow_addons.metrics import F1Score
from keras.utils import to_categorical
import scipy



def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)

tf_x_train_trainvec = convert_sparse_matrix_to_sparse_tensor(x_train_trainvec)

tf_x_valvec = convert_sparse_matrix_to_sparse_tensor(x_valvec)

def create_model(trial, X):
    lr = trial.suggest_uniform('lr', 0.00001, 0.1)
    units = trial.suggest_int('units', 10, 50)
    input_dim = X

    model = Sequential()
    model.add(Dense(units, input_dim=input_dim, activation = 'relu', kernel_initializer='he_uniform'))
    model.add(Dense(1, activation='sigmoid'))

    optimizer = SGD(learning_rate=lr, momentum=0.9)

    

    model.compile(loss='binary_crossentropy', 
                    optimizer=optimizer, 
                    metrics=['accuracy', F1Score(num_classes=1, average= 'micro',threshold=0.5, name='tf_f1')])
    return model
            

def objective_mlp(trial):

    model = create_model(trial, x_train_trainvec.shape[1])

    callbacks = [EarlyStopping(monitor='val_tf_f1', mode='max', min_delta=0.01, patience=3, verbose=1), TFKerasPruningCallback(trial,'val_tf_f1')]

    fit_model = model.fit(x=tf_x_train_trainvec, y=tf.convert_to_tensor(y_train_train), epochs =50, verbose=1, batch_size= 50, callbacks=callbacks, validation_data=(tf_x_valvec,tf.convert_to_tensor(y_val)))

    return fit_model.history['val_tf_f1'][-1]

def show_result(study):

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    trial_idx = 0
    for frozen_trial in study.get_trials(deepcopy=False):

        print("Trial {} completed rungs: {}".format(trial_idx, frozen_trial.system_attrs))
        trial_idx +=1

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

study = optuna.create_study(
    direction="maximize",sampler= TPESampler(),  pruner=SuccessiveHalvingPruner()
)
study.optimize(objective_mlp, n_trials=100)

show_result(study)



[32m[I 2022-08-26 01:02:59,888][0m A new study created in memory with name: no-name-fc10492e-9d28-4441-be9a-7601f179de4f[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


[32m[I 2022-08-26 01:05:46,857][0m Trial 0 finished with value: 0.9439120292663574 and parameters: {'lr': 0.06300422227860877, 'units': 14}. Best is trial 0 with value: 0.9439120292663574.[0m


Epoch 00015: early stopping
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


[32m[I 2022-08-26 01:09:21,382][0m Trial 1 finished with value: 0.8737736940383911 and parameters: {'lr': 0.004417718564699547, 'units': 43}. Best is trial 0 with value: 0.9439120292663574.[0m


Epoch 00019: early stopping
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


[32m[I 2022-08-26 01:12:29,441][0m Trial 2 finished with value: 0.952640175819397 and parameters: {'lr': 0.06337234363388337, 'units': 50}. Best is trial 2 with value: 0.952640175819397.[0m


Epoch 00016: early stopping
Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:12:53,836][0m Trial 3 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:13:18,351][0m Trial 4 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:14:18,028][0m Trial 5 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:14:42,471][0m Trial 6 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:15:06,996][0m Trial 7 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


[32m[I 2022-08-26 01:17:53,641][0m Trial 8 finished with value: 0.9514711499214172 and parameters: {'lr': 0.08499532870913261, 'units': 47}. Best is trial 2 with value: 0.952640175819397.[0m


Epoch 00014: early stopping
Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:18:18,223][0m Trial 9 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:18:42,803][0m Trial 10 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:19:42,860][0m Trial 11 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


[32m[I 2022-08-26 01:22:41,038][0m Trial 12 finished with value: 0.9550701975822449 and parameters: {'lr': 0.09787840285677056, 'units': 50}. Best is trial 12 with value: 0.9550701975822449.[0m


Epoch 00015: early stopping
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:23:41,483][0m Trial 13 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:24:06,261][0m Trial 14 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:24:30,802][0m Trial 15 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:24:55,351][0m Trial 16 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:25:56,525][0m Trial 17 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:26:21,177][0m Trial 18 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:27:21,566][0m Trial 19 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:27:46,079][0m Trial 20 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


[32m[I 2022-08-26 01:30:45,472][0m Trial 21 finished with value: 0.955277681350708 and parameters: {'lr': 0.08625143054872278, 'units': 47}. Best is trial 21 with value: 0.955277681350708.[0m


Epoch 00015: early stopping
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:31:45,952][0m Trial 22 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:32:10,720][0m Trial 23 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


[32m[I 2022-08-26 01:34:34,843][0m Trial 24 finished with value: 0.948631227016449 and parameters: {'lr': 0.0813893200643732, 'units': 50}. Best is trial 21 with value: 0.955277681350708.[0m


Epoch 00012: early stopping
Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:34:59,495][0m Trial 25 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:35:24,117][0m Trial 26 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:36:25,862][0m Trial 27 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:37:26,214][0m Trial 28 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:37:50,946][0m Trial 29 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:38:16,915][0m Trial 30 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


[32m[I 2022-08-26 01:40:40,693][0m Trial 31 finished with value: 0.948452889919281 and parameters: {'lr': 0.0847470464260091, 'units': 47}. Best is trial 21 with value: 0.955277681350708.[0m


Epoch 00012: early stopping
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:41:41,251][0m Trial 32 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:42:06,004][0m Trial 33 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:43:06,417][0m Trial 34 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:43:31,184][0m Trial 35 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


[32m[I 2022-08-26 01:46:30,193][0m Trial 36 finished with value: 0.9560930132865906 and parameters: {'lr': 0.08679299688055687, 'units': 45}. Best is trial 36 with value: 0.9560930132865906.[0m


Epoch 00015: early stopping
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:47:30,638][0m Trial 37 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:47:55,334][0m Trial 38 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:48:20,014][0m Trial 39 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:48:45,569][0m Trial 40 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:49:10,321][0m Trial 41 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:49:34,921][0m Trial 42 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:49:59,547][0m Trial 43 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:50:24,248][0m Trial 44 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:50:48,974][0m Trial 45 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:51:13,587][0m Trial 46 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:51:38,264][0m Trial 47 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:52:02,936][0m Trial 48 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:52:27,709][0m Trial 49 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:52:52,345][0m Trial 50 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:53:17,062][0m Trial 51 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:53:41,596][0m Trial 52 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:54:41,795][0m Trial 53 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:55:06,396][0m Trial 54 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:55:31,143][0m Trial 55 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:55:55,799][0m Trial 56 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:56:20,454][0m Trial 57 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:56:44,975][0m Trial 58 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:57:45,581][0m Trial 59 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:58:10,235][0m Trial 60 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 01:59:10,448][0m Trial 61 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 01:59:35,776][0m Trial 62 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 02:00:36,143][0m Trial 63 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 02:01:36,974][0m Trial 64 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:02:02,405][0m Trial 65 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:02:27,345][0m Trial 66 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:02:52,404][0m Trial 67 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 02:03:53,061][0m Trial 68 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:04:17,943][0m Trial 69 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:04:42,714][0m Trial 70 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:05:07,625][0m Trial 71 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:05:32,441][0m Trial 72 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:05:57,372][0m Trial 73 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:06:22,293][0m Trial 74 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:06:47,248][0m Trial 75 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:07:11,959][0m Trial 76 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


[32m[I 2022-08-26 02:10:12,376][0m Trial 77 finished with value: 0.9553421139717102 and parameters: {'lr': 0.07729220041249463, 'units': 48}. Best is trial 36 with value: 0.9560930132865906.[0m


Epoch 00015: early stopping
Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:10:37,250][0m Trial 78 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:11:02,088][0m Trial 79 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:11:26,746][0m Trial 80 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:11:51,595][0m Trial 81 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:12:16,359][0m Trial 82 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:12:41,162][0m Trial 83 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:13:06,517][0m Trial 84 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:13:31,447][0m Trial 85 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:13:56,196][0m Trial 86 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:14:20,907][0m Trial 87 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:14:45,548][0m Trial 88 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 02:15:45,818][0m Trial 89 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:16:10,485][0m Trial 90 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:16:35,306][0m Trial 91 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


[32m[I 2022-08-26 02:17:35,731][0m Trial 92 pruned. Trial was pruned at epoch 4.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:18:00,419][0m Trial 93 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:18:25,306][0m Trial 94 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:18:50,147][0m Trial 95 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:19:14,912][0m Trial 96 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


[32m[I 2022-08-26 02:22:15,109][0m Trial 97 finished with value: 0.9559414386749268 and parameters: {'lr': 0.08337494780099554, 'units': 49}. Best is trial 36 with value: 0.9560930132865906.[0m


Epoch 00015: early stopping
Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:22:39,872][0m Trial 98 pruned. Trial was pruned at epoch 1.[0m


Epoch 1/50




Epoch 2/50


[32m[I 2022-08-26 02:23:04,810][0m Trial 99 pruned. Trial was pruned at epoch 1.[0m


Trial 0 completed rungs: {}
Trial 1 completed rungs: {'completed_rung_0': 0.7558432817459106, 'completed_rung_1': 0.8062049150466919, 'completed_rung_2': 0.8683011531829834}
Trial 2 completed rungs: {'completed_rung_0': 0.8448481559753418, 'completed_rung_1': 0.8989165425300598}
Trial 3 completed rungs: {'completed_rung_0': 0.8091752529144287}
Trial 4 completed rungs: {'completed_rung_0': 0.7012574076652527}
Trial 5 completed rungs: {'completed_rung_0': 0.85057133436203, 'completed_rung_1': 0.8944485187530518}
Trial 6 completed rungs: {'completed_rung_0': 0.8212453722953796}
Trial 7 completed rungs: {'completed_rung_0': 0.823862612247467}
Trial 8 completed rungs: {'completed_rung_0': 0.8477990627288818, 'completed_rung_1': 0.9071922898292542}
Trial 9 completed rungs: {'completed_rung_0': 0.8419150710105896}
Trial 10 completed rungs: {'completed_rung_0': 0.8410026431083679}
Trial 11 completed rungs: {'completed_rung_0': 0.8522742390632629, 'completed_rung_1': 0.9031714797019958}
Trial 1

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

best_model = create_model(study.best_trial, x_trainvec.shape[1])

tf_x_trainvec = convert_sparse_matrix_to_sparse_tensor(x_trainvec)
tf_x_testvec = convert_sparse_matrix_to_sparse_tensor(x_testvec)

best_model.fit(x=tf_x_trainvec, y=tf.convert_to_tensor(y_train), epochs =50, verbose=1, batch_size= 50, callbacks=EarlyStopping(monitor='tf_f1', mode='max', min_delta=0.01, patience=3, verbose=1))

mlp_train_predictions = best_model.predict(x_trainvec)
mlp_test_predictions = best_model.predict(x_testvec)

mlp_train_predictions[mlp_train_predictions <= 0.5] = 0
mlp_train_predictions[mlp_train_predictions > 0.5] = 1

mlp_test_predictions[mlp_test_predictions <= 0.5] = 0
mlp_test_predictions[mlp_test_predictions > 0.5] = 1

Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: early stopping


### MLP Evaluation

In [14]:
print("MLP Train Accuracy Score :",accuracy_score(y_pred=mlp_train_predictions, y_true=y_train.to_numpy().ravel()))
print("MLP Test Accuracy Score :",accuracy_score(y_pred=mlp_test_predictions, y_true= y_test.to_numpy().ravel()))

print("MLP Train F1 Score :",f1_score(y_pred=mlp_train_predictions, y_true=y_train.to_numpy().ravel(), average = 'binary'))
print("MLP Test F1 Score :",f1_score(y_pred=mlp_test_predictions, y_true=y_test.to_numpy().ravel(), average = 'binary'))

print("MLP Confusion matrix:\n{}".format(confusion_matrix(y_pred=y_test.to_numpy().ravel(),y_true= mlp_test_predictions)))
print("MLP Classification report:\n",classification_report(y_pred=mlp_test_predictions,  y_true= y_test.to_numpy().ravel()))

MLP Train Accuracy Score : 0.9879339512152803
MLP Test Accuracy Score : 0.9096722621902478
MLP Train F1 Score : 0.9879072867835121
MLP Test F1 Score : 0.9510158078779297
MLP Confusion matrix:
[[  616   875]
 [  820 16454]]
MLP Classification report:
               precision    recall  f1-score   support

           0       0.41      0.43      0.42      1436
           1       0.95      0.95      0.95     17329

    accuracy                           0.91     18765
   macro avg       0.68      0.69      0.69     18765
weighted avg       0.91      0.91      0.91     18765



### Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline


rfc = RandomForestClassifier(random_state=0, n_jobs=3)

rfcgrid = {
    'rfc__max_depth' : list(range(1,50,5)),
    'rfc__n_estimators' : list(range(1,500,50)),
}

f1 = make_scorer(f1_score, average='binary', pos_label = 1) 

imba_pipe_rfc = Pipeline([
    ('sampling', RandomOverSampler(random_state=0)), 
    ('rfc', rfc)
    ])


rfcrs = HalvingGridSearchCV(estimator=imba_pipe_rfc, param_grid=rfcgrid, factor=5, cv=5, scoring=f1, verbose=10)

rfcrs.fit(x_trainvec, y_train.to_numpy().ravel())

print(rfcrs.best_score_)
print(rfcrs.best_params_)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 5532
max_resources_: 138322
aggressive_elimination: False
factor: 5
----------
iter: 0
n_candidates: 100
n_resources: 5532
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5; 1/100] START rfc__max_depth=1, rfc__n_estimators=1.....................
[CV 1/5; 1/100] END rfc__max_depth=1, rfc__n_estimators=1;, score=(train=0.072, test=0.056) total time=   0.9s
[CV 2/5; 1/100] START rfc__max_depth=1, rfc__n_estimators=1.....................
[CV 2/5; 1/100] END rfc__max_depth=1, rfc__n_estimators=1;, score=(train=0.666, test=0.689) total time=   0.4s
[CV 3/5; 1/100] START rfc__max_depth=1, rfc__n_estimators=1.....................
[CV 3/5; 1/100] END rfc__max_depth=1, rfc__n_estimators=1;, score=(train=0.664, test=0.645) total time=   0.4s
[CV 4/5; 1/100] START rfc__max_depth=1, rfc__n_estimators=1.....................
[CV 4/5; 1/100] END rfc__max_depth=1, rfc__n_estimators=1;, score=(train=0.

### RFC Evaluation

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

rfc_train_predictions = rfcrs.best_estimator_.predict(x_trainvec)
rfc_test_predictions = rfcrs.best_estimator_.predict(x_testvec)

print("RFC Train Accuracy Score :",accuracy_score(y_pred=rfc_train_predictions, y_true=y_train.to_numpy().ravel()))
print("RFC Test Accuracy Score :",accuracy_score(y_pred=rfc_test_predictions, y_true= y_test.to_numpy().ravel()))

print("RFC Train F1 Score :",f1_score(y_pred=rfc_train_predictions, y_true=y_train.to_numpy().ravel(), average = 'binary'))
print("RFC Test F1 Score :",f1_score(y_pred=rfc_test_predictions, y_true=y_test.to_numpy().ravel(), average = 'binary'))

print("RFC Confusion matrix:\n{}".format(confusion_matrix(y_pred=y_test.to_numpy().ravel(),y_true= rfc_test_predictions)))
print("RFC Classification report:\n",classification_report(y_pred=rfc_test_predictions,  y_true= y_test.to_numpy().ravel()))

RFC Train Accuracy Score : 0.8865328725726928
RFC Test Accuracy Score : 0.9232613908872902
RFC Train F1 Score : 0.896847252436692
RFC Test F1 Score : 0.9587652482675677
RFC Confusion matrix:
[[  584   588]
 [  852 16741]]
RFC Classification report:
               precision    recall  f1-score   support

           0       0.50      0.41      0.45      1436
           1       0.95      0.97      0.96     17329

    accuracy                           0.92     18765
   macro avg       0.72      0.69      0.70     18765
weighted avg       0.92      0.92      0.92     18765



### Multinomial Naive Bayes Classifier

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

mnbc = MultinomialNB()

mnbc.get_params()

mnbc_grid = {
    'mnbc__alpha' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
    'mnbc__fit_prior' : [True, False]
}

imba_pipe_mnbc = Pipeline([
    ('sampling', RandomOverSampler(random_state=0)), 
    ('mnbc', mnbc)
    ])

f1 = make_scorer(f1_score, average='binary', pos_label = 1)

mnbcgs = GridSearchCV(estimator=imba_pipe_mnbc, param_grid=mnbc_grid, cv=5, scoring=f1, verbose=10)

mnbcgs.fit(x_trainvec, y_train.to_numpy().ravel())

print(mnbcgs.best_score_)
print(mnbcgs.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5; 1/20] START mnbc__alpha=0.1, mnbc__fit_prior=True......................
[CV 1/5; 1/20] END mnbc__alpha=0.1, mnbc__fit_prior=True;, score=0.857 total time=   0.0s
[CV 2/5; 1/20] START mnbc__alpha=0.1, mnbc__fit_prior=True......................
[CV 2/5; 1/20] END mnbc__alpha=0.1, mnbc__fit_prior=True;, score=0.857 total time=   0.0s
[CV 3/5; 1/20] START mnbc__alpha=0.1, mnbc__fit_prior=True......................
[CV 3/5; 1/20] END mnbc__alpha=0.1, mnbc__fit_prior=True;, score=0.859 total time=   0.0s
[CV 4/5; 1/20] START mnbc__alpha=0.1, mnbc__fit_prior=True......................
[CV 4/5; 1/20] END mnbc__alpha=0.1, mnbc__fit_prior=True;, score=0.862 total time=   0.0s
[CV 5/5; 1/20] START mnbc__alpha=0.1, mnbc__fit_prior=True......................
[CV 5/5; 1/20] END mnbc__alpha=0.1, mnbc__fit_prior=True;, score=0.857 total time=   0.0s
[CV 1/5; 2/20] START mnbc__alpha=0.1, mnbc__fit_prior=False.....................
[C

### MNBC Evaluation

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

mnb_train_predictions = mnbcgs.best_estimator_.predict(x_trainvec)
mnb_test_predictions = mnbcgs.best_estimator_.predict(x_testvec)

print("MNBC Model Train Accuracy Score :",accuracy_score(mnb_train_predictions, y_train.to_numpy().ravel()))
print("MNBC Model Test Accuracy Score :",accuracy_score(mnb_test_predictions, y_test.to_numpy().ravel()))

print("MNBC Model Train F1 Score :",f1_score(mnb_train_predictions, y_train.to_numpy().ravel(), average = 'binary'))
print("MNBC Model Test F1 Score :",f1_score(mnb_test_predictions, y_test.to_numpy().ravel(), average = 'binary'))

print("MNBC Confusion matrix:\n{}".format(confusion_matrix(y_pred=y_test.to_numpy().ravel(),y_true= mnb_test_predictions)))
print("MNBC Classification report:\n",classification_report(y_pred=mnb_test_predictions,  y_true= y_test.to_numpy().ravel()))

MNBC Model Train Accuracy Score : 0.8769682335420251
MNBC Model Test Accuracy Score : 0.8349054090061284
MNBC Model Train F1 Score : 0.8774449085409766
MNBC Model Test F1 Score : 0.9046240995012622
MNBC Confusion matrix:
[[  975  2637]
 [  461 14692]]
MNBC Classification report:
               precision    recall  f1-score   support

           0       0.27      0.68      0.39      1436
           1       0.97      0.85      0.90     17329

    accuracy                           0.83     18765
   macro avg       0.62      0.76      0.65     18765
weighted avg       0.92      0.83      0.86     18765



### Dataset for Stacking Classifier

In [22]:
print(rfcrs.best_estimator_.predict_proba(x_trainvec))

[[0.45430597 0.54569403]
 [0.35307651 0.64692349]
 [0.4106744  0.5893256 ]
 ...
 [0.6098439  0.3901561 ]
 [0.58683237 0.41316763]
 [0.96207004 0.03792996]]


In [23]:
def get_predict_probas(data):
    mlp_proba = best_model.predict(data)
    rfc_proba = np.delete(rfcrs.best_estimator_.predict_proba(data),0,1)
    mnbc_proba = np.delete(mnbcgs.best_estimator_.predict_proba(data),0,1)
    mlp_proba_df = pd.DataFrame(mlp_proba,columns=['mlp'])
    rfc_proba_df = pd.DataFrame(rfc_proba,columns=['rfc'])
    mnbc_proba_df = pd.DataFrame(mnbc_proba,columns=['mnbc'])
    return pd.concat([mlp_proba_df,rfc_proba_df,mnbc_proba_df],axis=1)

x_trainvec_stack = get_predict_probas(x_trainvec)
x_testvec_stack = get_predict_probas(x_testvec)

print(x_trainvec_stack.head())

    

    

        mlp       rfc      mnbc
0  0.499469  0.545694  0.466269
1  0.999986  0.646923  0.811235
2  0.999627  0.589326  0.762265
3  0.999293  0.612861  0.700163
4  0.998928  0.579946  0.792787


### Stacking Classifier(Logistic Regression)

In [24]:
from sklearn.linear_model import LogisticRegression

sclf = LogisticRegression(random_state=0)

sclf_param_grid = {
    'sclf__solver' : ['newton-cg', 'lbfgs'],
    'sclf__C': [100, 10, 1.0, 0.1, 0.01]
}

imba_pipe_stack = Pipeline([
    ('sampling', RandomOverSampler(random_state=0)), 
    ('sclf', sclf)
    ])


sclfrs = HalvingGridSearchCV(estimator=imba_pipe_stack , param_grid=sclf_param_grid, factor=5, cv=5, scoring=f1, verbose=1)

sclfrs.fit(x_trainvec_stack, y_train.to_numpy().ravel())

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 27664
max_resources_: 138322
aggressive_elimination: False
factor: 5
----------
iter: 0
n_candidates: 10
n_resources: 27664
Fitting 5 folds for each of 10 candidates, totalling 50 fits
----------
iter: 1
n_candidates: 2
n_resources: 138320
Fitting 5 folds for each of 2 candidates, totalling 10 fits


## Evaluating

In [25]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# I probably shldv written a function for this...
sclfrs_train_predictions = sclfrs.predict(x_trainvec_stack)
sclfrs_test_predictions = sclfrs.predict(x_testvec_stack)

In [26]:
print("MLP Test Accuracy Score :",accuracy_score(y_pred=mlp_test_predictions, y_true= y_test.to_numpy().ravel()))
print("RFC Test Accuracy Score :",accuracy_score(y_pred=rfc_test_predictions, y_true= y_test.to_numpy().ravel()))
print("MNBC Model Test Accuracy Score :",accuracy_score(mnb_test_predictions, y_test.to_numpy().ravel()))
print("Stacked Model Test Accuracy Score :",accuracy_score(sclfrs_test_predictions, y_test.to_numpy().ravel()))

MLP Test Accuracy Score : 0.9096722621902478
RFC Test Accuracy Score : 0.9232613908872902
MNBC Model Test Accuracy Score : 0.8349054090061284
Stacked Model Test Accuracy Score : 0.9140953903543831


In [27]:
print("MLP Test F1 Score :",f1_score(y_pred=mlp_test_predictions, y_true=y_test.to_numpy().ravel(), average = 'binary'))
print("RFC Test F1 Score :",f1_score(y_pred=rfc_test_predictions, y_true=y_test.to_numpy().ravel(), average = 'binary'))
print("MNBC Model Test F1 Score :",f1_score(mnb_test_predictions, y_test.to_numpy().ravel(), average = 'binary'))
print("Stacked Model Test F1 Score :",f1_score(sclfrs_test_predictions, y_test.to_numpy().ravel(),average='binary'))

MLP Test F1 Score : 0.9510158078779297
RFC Test F1 Score : 0.9587652482675677
MNBC Model Test F1 Score : 0.9046240995012622
Stacked Model Test F1 Score : 0.9534937395418613


In [28]:

print("MLP Confusion matrix:\n{}".format(confusion_matrix(y_pred=y_test.to_numpy().ravel(),y_true= mlp_test_predictions)))
print("RFC Confusion matrix:\n{}".format(confusion_matrix(y_pred=y_test.to_numpy().ravel(),y_true= rfc_test_predictions)))
print("MNBC Confusion matrix:\n{}".format(confusion_matrix(y_pred=y_test.to_numpy().ravel(),y_true= mnb_test_predictions)))
print("Stacked Confusion matrix:\n{}".format(confusion_matrix(y_pred=y_test.to_numpy().ravel(),y_true= sclfrs_test_predictions)))

MLP Confusion matrix:
[[  616   875]
 [  820 16454]]
RFC Confusion matrix:
[[  584   588]
 [  852 16741]]
MNBC Confusion matrix:
[[  975  2637]
 [  461 14692]]
Stacked Confusion matrix:
[[  628   804]
 [  808 16525]]


In [29]:

print("MLP Classification report:\n",classification_report(y_pred=mlp_test_predictions,  y_true= y_test.to_numpy().ravel()))
print("RFC Classification report:\n",classification_report(y_pred=rfc_test_predictions,  y_true= y_test.to_numpy().ravel()))

MLP Classification report:
               precision    recall  f1-score   support

           0       0.41      0.43      0.42      1436
           1       0.95      0.95      0.95     17329

    accuracy                           0.91     18765
   macro avg       0.68      0.69      0.69     18765
weighted avg       0.91      0.91      0.91     18765

RFC Classification report:
               precision    recall  f1-score   support

           0       0.50      0.41      0.45      1436
           1       0.95      0.97      0.96     17329

    accuracy                           0.92     18765
   macro avg       0.72      0.69      0.70     18765
weighted avg       0.92      0.92      0.92     18765



In [30]:
print("MNBC Classification report:\n",classification_report(y_pred=mnb_test_predictions,  y_true= y_test.to_numpy().ravel()))
print("Stacked Classification report:\n",classification_report(y_pred=sclfrs_test_predictions,  y_true= y_test.to_numpy().ravel()))

MNBC Classification report:
               precision    recall  f1-score   support

           0       0.27      0.68      0.39      1436
           1       0.97      0.85      0.90     17329

    accuracy                           0.83     18765
   macro avg       0.62      0.76      0.65     18765
weighted avg       0.92      0.83      0.86     18765

Stacked Classification report:
               precision    recall  f1-score   support

           0       0.44      0.44      0.44      1436
           1       0.95      0.95      0.95     17329

    accuracy                           0.91     18765
   macro avg       0.70      0.70      0.70     18765
weighted avg       0.91      0.91      0.91     18765

