In [None]:
# Importing Libraries
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers
import joblib


# Importing custom utility functions
from utilities.data_loader import load_modeling_data, load_testing_data, prepare_kaggle_submission
from utilities.text_cleaner import advanced_data_cleaning

In [None]:
# Loading testing and training data through the project's data_loader helpers:
# the training call returns the data together with its labels.
train_data, train_labels = load_modeling_data()
test_data = load_testing_data()

In [None]:
# Encode the 'target' column as integer class ids. The fitted encoder `le` is kept
# so le.inverse_transform can map predicted ids back to the original labels.
# Note: this overwrites train_labels['target'] in place.
le = LabelEncoder()
train_labels['target'] = le.fit_transform(train_labels['target'].values)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=8,
)

In [None]:
# NOTE(review): the banner printed below mentions Logistic Regression, but this cell
# only builds the TF-IDF vectorizer (1- to 3-grams, at most 250,000 features).
print('-'*175+'Logistic Regression with Best parameters'+'-'*175)
vectorizer = TfidfVectorizer(max_features=250000, ngram_range=(1,3))

In [None]:
# Produce cleaned copies of both splits: DataFrame.assign returns a new frame, so
# X_train / X_val keep their raw 'text' column untouched.
X_train_clean = X_train.assign(text=X_train['text'].apply(advanced_data_cleaning))
X_val_clean = X_val.assign(text=X_val['text'].apply(advanced_data_cleaning))

In [None]:
# Fit the TF-IDF vectorizer on the cleaned training text only, then reuse the fitted
# vocabulary and IDF weights to transform the validation text.
# NOTE(review): the classifiers loaded from disk further down are fed these matrices,
# so they presumably were trained with an identically configured vectorizer fitted
# on this same split — confirm, otherwise the feature columns will not line up.
X_train_vectorizer = vectorizer.fit_transform(X_train_clean['text'])
X_val_vectorizer = vectorizer.transform(X_val_clean['text'])

In [None]:
# Saved classifiers, keyed by the column name each one gets in `results`.
# One table + one loop replaces four copy-pasted load/predict cell pairs.
MODEL_PATHS = {
    'naiveBayes': 'models/naive_bayes+tfidf.sav',
    'logreg': 'models/logreg+tfidf+smote.sav',
    'svm': 'models/svc+tfidf.sav',
    'complement_nb': 'models/complement_nb.sav',
}

# joblib.load unpickles arbitrary objects: only load model files you trust.
loaded_models = {column: joblib.load(path) for column, path in MODEL_PATHS.items()}

# Keep the individual model names available for any later cell that uses them.
nb = loaded_models['naiveBayes']
logreg = loaded_models['logreg']
svm = loaded_models['svm']
complement_nb = loaded_models['complement_nb']

# One column of validation predictions per classifier, in the order listed above.
results = pd.DataFrame(
    {column: clf.predict(X_val_vectorizer) for column, clf in loaded_models.items()}
)

In [None]:
# Row-wise majority vote across the classifier columns. A row with tied votes has
# more than one mode, so the returned frame can have several (NaN-padded) columns.
results.mode(axis=1)

In [None]:
# LSTM model

In [43]:
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [44]:
# Vocabulary cap passed to the tokenizer; also used as the embedding matrix row count.
NB_WORDS = 10000

# Tokenizer that lower-cases the text and splits it on single spaces.
tk = Tokenizer(num_words=NB_WORDS,lower=True,split=" ")

In [46]:
# Build the tokenizer's word index from the cleaned training AND validation text.
# `full_df` is a Series of texts (row-wise concatenation of the two 'text' columns).
# NOTE(review): validation text influences the vocabulary here — confirm this is intended.
full_df = pd.concat([X_train_clean['text'], X_val_clean['text']], axis = 0)
tk.fit_on_texts(full_df)

In [48]:
# Convert each cleaned text into its list of integer token ids using the fitted tokenizer.
X_train_seq = tk.texts_to_sequences(X_train_clean['text'])
X_valid_seq = tk.texts_to_sequences(X_val_clean['text'])

In [49]:
# Pad/truncate every sequence to exactly 30 token ids. This length must stay in
# sync with the Embedding layer's input_length=30 in create_model().
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=30)
X_valid_seq_trunc = pad_sequences(X_valid_seq, maxlen=30)

In [50]:
from gensim.models import Word2Vec
# Load a saved gensim Word2Vec model from the working directory; its word vectors
# (model.wv) are used below to fill the embedding matrix.
model = Word2Vec.load("word2vec-training.model")

In [51]:
# Allocate the embedding matrix: one row per token id below NB_WORDS, all zeros
# until filled from Word2Vec in the next cell.
# assumes the loaded Word2Vec vectors are 100-dimensional — TODO confirm
vector_size = 100
gensim_weight_matrix = np.zeros((NB_WORDS ,vector_size))
gensim_weight_matrix.shape

(10000, 100)

In [52]:
# Copy the Word2Vec vector of every known token into its row of the embedding matrix.
# Keras' Tokenizer numbers words starting at 1 (index 0 is reserved for padding) and
# only emits ids below num_words, hence the `index < NB_WORDS` guard.
# Tokens absent from Word2Vec simply keep the all-zero row the matrix was created with.
for word, index in tk.word_index.items():
    if index < NB_WORDS and word in model.wv:
        gensim_weight_matrix[index] = model.wv[word]

In [53]:
def create_model(embedding_matrix=None, max_len=30, n_classes=3):
    """Build and compile the stacked bidirectional-LSTM classifier.

    The network is: frozen pre-trained Embedding -> BiLSTM(64, sequences) ->
    Dropout -> BiLSTM(32) -> Dropout -> Dense(64, relu) -> Dropout ->
    Dense(n_classes, softmax).

    Parameters
    ----------
    embedding_matrix : 2-D array (vocab_size, embedding_dim), optional
        Pre-trained embedding weights. Defaults to the notebook-level
        ``gensim_weight_matrix``. The Embedding layer's input/output sizes are
        read from its shape so the two can never disagree.
    max_len : int, default 30
        Length of the padded input sequences; must match the ``maxlen`` used
        with ``pad_sequences``.
    n_classes : int, default 3
        Number of output classes.

    Returns
    -------
    The compiled Keras ``Sequential`` model. It is NOT trained: callers must
    fit it or load weights before using its predictions.
    """
    if embedding_matrix is None:
        embedding_matrix = gensim_weight_matrix
    vocab_size, embedding_dim = embedding_matrix.shape

    emb_model = models.Sequential()
    # Frozen embedding initialised from the pre-trained word vectors.
    emb_model.add(Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            input_length=max_len,
                            weights=[embedding_matrix],
                            trainable=False))
    # No input_shape here: this is not the first layer, so Keras infers the
    # shape from the Embedding output.
    emb_model.add(Bidirectional(LSTM(64, return_sequences=True)))
    emb_model.add(Dropout(0.2))
    emb_model.add(Bidirectional(LSTM(32)))
    emb_model.add(Dropout(0.2))
    emb_model.add(Dense(64, activation='relu'))
    emb_model.add(Dropout(0.1))
    emb_model.add(Dense(n_classes, activation='softmax'))
    emb_model.summary()
    # NOTE(review): categorical_crossentropy expects one-hot targets; the labels in
    # this notebook are integer-encoded, so training would need one-hot encoding
    # (or sparse_categorical_crossentropy) — confirm against the training code.
    emb_model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
    return emb_model

In [54]:
# Build and compile the network; create_model() also prints the layer summary.
emb_model = create_model()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 100)           1000000   
                                                                 
 bidirectional (Bidirectiona  (None, 30, 128)          84480     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 30, 128)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 64)               

In [55]:
# Class probabilities for every padded validation sequence.
# NOTE(review): no fit()/load_weights() call is visible between create_model() and
# this predict, and the probabilities in the recorded output are all close to 1/3,
# which looks like an untrained network. Confirm trained weights are loaded before
# relying on lstm_pred.
lstm_pred = emb_model.predict(X_valid_seq_trunc)

2022-12-18 23:08:11.737355: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-18 23:08:12.252635: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-18 23:08:12.526776: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-18 23:08:12.543254: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-18 23:08:13.518262: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-18 23:08:13.532964: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




array([[0.30459064, 0.3676831 , 0.32772622],
       [0.32924414, 0.36694667, 0.30380914],
       [0.28449824, 0.37490532, 0.34059647],
       ...,
       [0.3101136 , 0.36446267, 0.32542378],
       [0.30894312, 0.3546044 , 0.3364525 ],
       [0.31922632, 0.33493224, 0.3458415 ]], dtype=float32)

In [None]:
# Predicted class id for each validation row: argmax over the class-probability axis.
# (A bare .argmax() flattens the array and returns one single index for all rows.)
lstm_pred.argmax(axis=1)

In [58]:
# Ensemble prediction = majority vote per row. When votes tie, mode() returns the
# tied labels in ascending order, so taking column 0 resolves ties to the smallest
# class id.
y_pred = results.mode(axis=1).iloc[:, 0]

In [59]:
# Display the ensemble predictions (shown as float64 in the recorded output;
# presumably because the mode frame is NaN-padded for tied rows — verify).
y_pred

0         2.0
1         0.0
2         0.0
3         0.0
4         2.0
         ... 
208060    0.0
208061    2.0
208062    2.0
208063    2.0
208064    2.0
Name: 0, Length: 208065, dtype: float64

In [None]:
from sklearn.metrics import accuracy_score
# Validation accuracy of the majority-vote ensemble against the encoded 'target' labels.
accuracy_score(y_val['target'].values, y_pred)

In [None]:
result