In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
os.environ["KERAS_BACKEND"]="jax"
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"]="1.00"

In [2]:
import os
import nltk
import zipfile

wordnet_path="/usr/share/nltk_data/corpora/wordnet.zip"
wordnet_dir="/usr/share/nltk_data/corpora/wordnet"

if not os.path.exists(wordnet_dir):
    with zipfile.ZipFile(wordnet_path,'r') as z:
        z.extractall("/usr/share/nltk_data/corpora/")
        
nltk.data.path.append("/usr/share/nltk_data/")

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

stop_words=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

def preprocess_text(text):
    text=re.sub(r'<.*?>', '', text)
    text=text.lower()
    text=re.sub(r'[^a-z\s]', '', text)
    tokens=word_tokenize(text)
    tokens=[lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
data=pd.read_csv("/kaggle/input/happydb-cleaned/cleaned_hm.csv")

In [5]:
data.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


In [6]:
data.tail()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
100530,128762,112,24h,My husband announced he is getting a decent bo...,My husband announced he is getting a decent bo...,True,1,,affection
100531,128763,714,24h,Had a can of Pepsi to drink.,Had a can of Pepsi to drink.,True,1,,enjoy_the_moment
100532,128764,3934,24h,Cuddling with my girlfriend last night.,Cuddling with my girlfriend last night.,True,1,affection,affection
100533,128765,1629,24h,I had a great meeting yesterday at work with m...,I had a great meeting yesterday at work with m...,True,1,,bonding
100534,128766,141,24h,I had a great workout last night.,I had a great workout last night.,True,1,,exercise


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100535 entries, 0 to 100534
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   hmid                   100535 non-null  int64 
 1   wid                    100535 non-null  int64 
 2   reflection_period      100535 non-null  object
 3   original_hm            100535 non-null  object
 4   cleaned_hm             100535 non-null  object
 5   modified               100535 non-null  bool  
 6   num_sentence           100535 non-null  int64 
 7   ground_truth_category  14125 non-null   object
 8   predicted_category     100535 non-null  object
dtypes: bool(1), int64(3), object(5)
memory usage: 6.2+ MB


In [8]:
data['predicted_category'].value_counts()

predicted_category
affection           34168
achievement         33993
enjoy_the_moment    11144
bonding             10727
leisure              7458
nature               1843
exercise             1202
Name: count, dtype: int64

In [9]:
data['cleaned_hm']=data['cleaned_hm'].apply(preprocess_text)

In [10]:
mode_value=data['ground_truth_category'].mode()[0]
data['ground_truth_category'].fillna(mode_value,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ground_truth_category'].fillna(mode_value,inplace=True)


In [11]:
from sklearn.preprocessing import LabelEncoder

reflection_encoder=LabelEncoder()
data['reflection_period_encoded']=reflection_encoder.fit_transform(data['reflection_period'])

category_encoder = LabelEncoder()
data['ground_truth_category_encoded']=category_encoder.fit_transform(data['ground_truth_category'])
data['predicted_category_encoded']=category_encoder.fit_transform(data['predicted_category'])

encoded_columns = {
    "reflection_period_encoded": data['reflection_period_encoded'].unique(),
    "ground_truth_category_encoded": data['ground_truth_category_encoded'].unique(),
    "predicted_category_encoded": data['predicted_category_encoded'].unique(),
}
encoded_columns

{'reflection_period_encoded': array([0, 1]),
 'ground_truth_category_encoded': array([1, 2, 5, 3, 0, 6, 4]),
 'predicted_category_encoded': array([1, 4, 2, 5, 0, 3, 6])}

In [12]:
from sklearn.model_selection import train_test_split
X=data['cleaned_hm']
y=data['predicted_category']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42,stratify=y)

In [13]:
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.10,random_state=42,stratify=y_train)
X_train.shape,X_val.shape,X_test.shape,y_train.shape,y_val.shape,y_test.shape

((67860,), (7541,), (25134,), (67860,), (7541,), (25134,))

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer 
tfidf=TfidfVectorizer(max_features=5000,ngram_range=(1,2),min_df=2,max_df=0.90,sublinear_tf=True)
X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf=tfidf.transform(X_test)
X_val_tfidf=tfidf.transform(X_val)

In [15]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
y_train_encoded=label_encoder.fit_transform(y_train)
y_test_encoded=label_encoder.transform(y_test)
y_val_encoded=label_encoder.transform(y_val)

In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer=tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq=tokenizer.texts_to_sequences(X_train)
X_test_seq=tokenizer.texts_to_sequences(X_test)
X_val_seq=tokenizer.texts_to_sequences(X_val)

In [17]:
max_sequence_length=100  # Maximum length of sequences
X_train_padded=pad_sequences(X_train_seq,maxlen=max_sequence_length,padding='post')
X_test_padded=pad_sequences(X_test_seq,maxlen=max_sequence_length,padding='post')
X_val_padded=pad_sequences(X_val_seq,maxlen=max_sequence_length,padding='post')

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Dropout,Embedding

model_lstm = Sequential([
    Embedding(input_dim=5000,output_dim=128,input_length=max_sequence_length),
    LSTM(128,return_sequences=True),
    LSTM(64),
    Dense(64,activation='relu'),
    Dense(7, activation='softmax')
])
model_lstm.build(input_shape=(None, max_sequence_length))
model_lstm.summary()



In [19]:
from tensorflow.keras.optimizers import Adam
model_lstm.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=["accuracy"])

In [20]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [21]:
history=model_lstm.fit(X_train_padded,y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded, y_test_encoded),verbose=1,callbacks=early_stopping)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 24ms/step - accuracy: 0.3373 - loss: 1.5532 - val_accuracy: 0.3399 - val_loss: 1.5393
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 19ms/step - accuracy: 0.3375 - loss: 1.5343 - val_accuracy: 0.3399 - val_loss: 1.5387
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.3398 - loss: 1.5378 - val_accuracy: 0.3399 - val_loss: 1.5360
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.3389 - loss: 1.5298 - val_accuracy: 0.3386 - val_loss: 1.5365
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.3371 - loss: 1.5349 - val_accuracy: 0.6381 - val_loss: 1.0012
Epoch 6/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.6991 - loss: 0.8295 - val_accuracy: 0.8077 - val_loss: 0.6004
Epoc

In [22]:
import numpy as np
y_pred_lstm=model_lstm.predict(X_test_padded)
y_pred_classes_lstm=np.argmax(y_pred_lstm,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step


In [23]:
test_loss,test_accuracy=model_lstm.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.8726 - loss: 0.4312
Test accuracy: 87.5627%


In [24]:
val_loss,val_accuracy=model_lstm.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8789 - loss: 0.4048
Validation accuracy: 87.7470%


In [25]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for Vanilla LSTM:")
print(classification_report(y_test_encoded, y_pred_classes_lstm, zero_division=1))

# Print Accuracy
print("\nAccuracy for Vanilla LSTM:")
print(accuracy_score(y_test_encoded, y_pred_classes_lstm))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_classes_lstm, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded, y_pred_classes_lstm, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded, y_pred_classes_lstm, average='weighted'))


Classification Report for Vanilla LSTM:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      8498
           1       0.91      0.95      0.93      8542
           2       0.94      0.90      0.92      2682
           3       0.78      0.71      0.74      2786
           4       0.72      0.64      0.68       300
           5       0.71      0.84      0.77      1865
           6       0.63      0.57      0.60       461

    accuracy                           0.88     25134
   macro avg       0.80      0.78      0.79     25134
weighted avg       0.88      0.88      0.88     25134


Accuracy for Vanilla LSTM:
0.8756266412031511

F1 Score:
0.8750344247287425

Precision Score:
0.8764954685222385

Recall Score:
0.8756266412031511


In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Define the CNN model
model_cnn = Sequential([
    Embedding(input_dim=5000,output_dim=128,input_length=max_sequence_length),
    Conv1D(filters=128,kernel_size=5,activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128,activation='relu'),
    Dense(7,activation='softmax') 
])
model_cnn.build(input_shape=(None,max_sequence_length))
model_cnn.summary()



In [27]:
model_cnn.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=["accuracy"])

In [28]:
history=model_cnn.fit(X_train_padded,y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded,y_test_encoded),verbose=1,callbacks=[early_stopping])

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.7292 - loss: 0.7766 - val_accuracy: 0.8803 - val_loss: 0.3338
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9160 - loss: 0.2414 - val_accuracy: 0.8806 - val_loss: 0.3294
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9498 - loss: 0.1431 - val_accuracy: 0.8805 - val_loss: 0.3629
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9717 - loss: 0.0854 - val_accuracy: 0.8823 - val_loss: 0.4044
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9829 - loss: 0.0562 - val_accuracy: 0.8788 - val_loss: 0.5009
Epoch 6/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9886 - loss: 0.0384 - val_accuracy: 0.8783 - val_loss: 0.5679
Epoch 7/25
[1m1

In [29]:
y_pred_cnn=model_cnn.predict(X_test_padded)
y_pred_classes_cnn=np.argmax(y_pred_cnn,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 829us/step


In [30]:
test_loss,test_accuracy=model_cnn.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8800 - loss: 0.3394
Test accuracy: 88.0560%


In [31]:
val_loss,val_accuracy=model_cnn.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8821 - loss: 0.3315
Validation accuracy: 88.3835%


In [32]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for CNN:")
print(classification_report(y_test_encoded,y_pred_classes_cnn, zero_division=1))

# Print Accuracy
print("\nAccuracy for CNN:")
print(accuracy_score(y_test_encoded,y_pred_classes_cnn))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_classes_cnn, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_classes_cnn, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_classes_cnn, average='weighted'))


Classification Report for CNN:
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      8498
           1       0.94      0.93      0.94      8542
           2       0.92      0.93      0.92      2682
           3       0.70      0.79      0.75      2786
           4       0.79      0.83      0.81       300
           5       0.76      0.83      0.79      1865
           6       0.78      0.72      0.75       461

    accuracy                           0.88     25134
   macro avg       0.83      0.84      0.83     25134
weighted avg       0.88      0.88      0.88     25134


Accuracy for CNN:
0.8805601973422456

F1 Score:
0.8818123623885681

Precision Score:
0.8845895741192616

Recall Score:
0.8805601973422456


In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Embedding

model_rnn=Sequential([
    Embedding(input_dim=5000,output_dim=128,input_length=max_sequence_length),
    SimpleRNN(128,return_sequences=True),
    SimpleRNN(64),
    Dense(64,activation='relu'),
    Dense(7,activation='softmax')
])
model_rnn.build(input_shape=(None,max_sequence_length))
model_rnn.summary()



In [34]:
model_rnn.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [35]:
history=model_rnn.fit(X_train_padded,y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded, y_test_encoded),verbose=1,callbacks=early_stopping)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 17ms/step - accuracy: 0.3383 - loss: 1.5518 - val_accuracy: 0.3400 - val_loss: 1.5415
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 14ms/step - accuracy: 0.3410 - loss: 1.5425 - val_accuracy: 0.3382 - val_loss: 1.5399
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 14ms/step - accuracy: 0.3365 - loss: 1.5385 - val_accuracy: 0.3399 - val_loss: 1.5372
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 14ms/step - accuracy: 0.3394 - loss: 1.5387 - val_accuracy: 0.3693 - val_loss: 1.5115
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 14ms/step - accuracy: 0.3664 - loss: 1.5171 - val_accuracy: 0.3757 - val_loss: 1.5084


In [36]:
import numpy as np
y_pred_rnn=model_rnn.predict(X_test_padded)
y_pred_classes_rnn=np.argmax(y_pred_rnn,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step


In [37]:
test_loss,test_accuracy=model_rnn.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.3410 - loss: 1.5415
Test accuracy: 33.9978%


In [38]:
val_loss,val_accuracy=model_rnn.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.3504 - loss: 1.5390
Validation accuracy: 33.9875%


In [39]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for RNN:")
print(classification_report(y_test_encoded,y_pred_classes_rnn, zero_division=1))

# Print Accuracy
print("\nAccuracy for RNN:")
print(accuracy_score(y_test_encoded,y_pred_classes_rnn))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_classes_rnn, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_classes_rnn, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_classes_rnn, average='weighted'))


Classification Report for RNN:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      8498
           1       0.34      1.00      0.51      8542
           2       1.00      0.00      0.00      2682
           3       0.60      0.00      0.00      2786
           4       1.00      0.00      0.00       300
           5       1.00      0.00      0.00      1865
           6       1.00      0.00      0.00       461

    accuracy                           0.34     25134
   macro avg       0.85      0.14      0.07     25134
weighted avg       0.73      0.34      0.17     25134


Accuracy for RNN:
0.33997771942388794

F1 Score:
0.1728781885776089

Precision Score:
0.7313123903924638

Recall Score:
0.33997771942388794


In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,GRU,Dropout,Embedding

model_gru=Sequential([
    Embedding(input_dim=5000,output_dim=128,input_length=max_sequence_length),
    GRU(128,return_sequences=True),
    GRU(64),
    Dense(64,activation='relu'),
    Dense(7,activation='softmax')
])
model_gru.build(input_shape=(None,max_sequence_length))
model_gru.summary()



In [41]:
model_gru.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [42]:
history=model_gru.fit(X_train_padded,y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded,y_test_encoded),verbose=1,callbacks=early_stopping)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 26ms/step - accuracy: 0.3495 - loss: 1.5419 - val_accuracy: 0.7542 - val_loss: 0.7515
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 22ms/step - accuracy: 0.7721 - loss: 0.6862 - val_accuracy: 0.8356 - val_loss: 0.5155
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 22ms/step - accuracy: 0.8672 - loss: 0.4203 - val_accuracy: 0.8730 - val_loss: 0.3973
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 22ms/step - accuracy: 0.9052 - loss: 0.2934 - val_accuracy: 0.8816 - val_loss: 0.3624
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 22ms/step - accuracy: 0.9226 - loss: 0.2355 - val_accuracy: 0.8843 - val_loss: 0.3542


In [43]:
import numpy as np
y_pred_gru=model_gru.predict(X_test_padded)
y_pred_classes_gru=np.argmax(y_pred_gru,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step


In [44]:
test_loss,test_accuracy=model_gru.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.7534 - loss: 0.7491
Test accuracy: 75.4158%


In [45]:
val_loss,val_accuracy=model_gru.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7586 - loss: 0.7323
Validation accuracy: 75.4940%


In [46]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for GRU:")
print(classification_report(y_test_encoded, y_pred_classes_gru, zero_division=1))

# Print Accuracy
print("\nAccuracy for GRU:")
print(accuracy_score(y_test_encoded, y_pred_classes_gru))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_classes_gru, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded, y_pred_classes_gru, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded, y_pred_classes_gru, average='weighted'))


Classification Report for GRU:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      8498
           1       0.87      0.92      0.90      8542
           2       0.89      0.82      0.86      2682
           3       0.00      0.00      0.00      2786
           4       1.00      0.00      0.00       300
           5       0.31      0.76      0.43      1865
           6       1.00      0.00      0.00       461

    accuracy                           0.75     25134
   macro avg       0.70      0.48      0.43     25134
weighted avg       0.72      0.75      0.72     25134


Accuracy for GRU:
0.7541577146494788

F1 Score:
0.7165159666797515

Precision Score:
0.7246370341906802

Recall Score:
0.7541577146494788


In [47]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout,LSTM,Bidirectional

model_lstm_bi=Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(7, activation='softmax')
])
model_lstm_bi.build(input_shape=(None, max_sequence_length))
model_lstm_bi.summary()



In [48]:
model_lstm_bi.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [49]:
history=model_lstm_bi.fit(X_train_padded,y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded,y_test_encoded),verbose=1,callbacks=early_stopping)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 44ms/step - accuracy: 0.6981 - loss: 0.8437 - val_accuracy: 0.8824 - val_loss: 0.3383
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 38ms/step - accuracy: 0.9048 - loss: 0.2731 - val_accuracy: 0.8868 - val_loss: 0.3216
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.9236 - loss: 0.2133 - val_accuracy: 0.8886 - val_loss: 0.3160
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.9379 - loss: 0.1762 - val_accuracy: 0.8874 - val_loss: 0.3312
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.9472 - loss: 0.1495 - val_accuracy: 0.8842 - val_loss: 0.3634
Epoch 6/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.9536 - loss: 0.1304 - val_accuracy: 0.8852 - val_loss: 0.3858
Epoc

In [50]:
import numpy as np
y_pred_lstm_bi=model_lstm_bi.predict(X_test_padded)
y_pred_classes_lstm_bi=np.argmax(y_pred_lstm_bi,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step


In [51]:
test_loss,test_accuracy=model_lstm_bi.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.8869 - loss: 0.3248
Test accuracy: 88.8637%


In [52]:
val_loss,val_accuracy=model_lstm_bi.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8920 - loss: 0.3199
Validation accuracy: 88.9139%


In [53]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for Bi-LSTM:")
print(classification_report(y_test_encoded,y_pred_classes_lstm_bi, zero_division=1))

# Print Accuracy
print("\nAccuracy for Bi-LSTM:")
print(accuracy_score(y_test_encoded,y_pred_classes_lstm_bi))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_classes_lstm_bi, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_classes_lstm_bi, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_classes_lstm_bi, average='weighted'))


Classification Report for Bi-LSTM:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      8498
           1       0.95      0.93      0.94      8542
           2       0.91      0.94      0.93      2682
           3       0.75      0.79      0.77      2786
           4       0.75      0.88      0.81       300
           5       0.80      0.83      0.81      1865
           6       0.68      0.84      0.75       461

    accuracy                           0.89     25134
   macro avg       0.82      0.87      0.84     25134
weighted avg       0.89      0.89      0.89     25134


Accuracy for Bi-LSTM:
0.8886369061828598

F1 Score:
0.8896065108185842

Precision Score:
0.8915655475592965

Recall Score:
0.8886369061828598


In [54]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout,SimpleRNN,Bidirectional

model_rnn_bi=Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length),
    Bidirectional(SimpleRNN(128, return_sequences=True)),
    Bidirectional(SimpleRNN(64)),
    Dense(64, activation='relu'),
    Dense(7, activation='softmax')
])
model_rnn_bi.build(input_shape=(None, max_sequence_length))
model_rnn_bi.summary()



In [55]:
model_rnn_bi.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [56]:
history=model_rnn_bi.fit(X_train_padded,y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded, y_test_encoded),verbose=1,callbacks=early_stopping)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 30ms/step - accuracy: 0.6895 - loss: 0.8630 - val_accuracy: 0.8623 - val_loss: 0.4087
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 26ms/step - accuracy: 0.8872 - loss: 0.3321 - val_accuracy: 0.8549 - val_loss: 0.4291
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 26ms/step - accuracy: 0.8899 - loss: 0.3287 - val_accuracy: 0.8630 - val_loss: 0.4134
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 26ms/step - accuracy: 0.9092 - loss: 0.2662 - val_accuracy: 0.8685 - val_loss: 0.3981
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 26ms/step - accuracy: 0.9238 - loss: 0.2210 - val_accuracy: 0.8730 - val_loss: 0.3876


In [57]:
import numpy as np
y_pred_rnn_bi=model_rnn_bi.predict(X_test_padded)
y_pred_classes_rnn_bi=np.argmax(y_pred_rnn_bi,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step


In [58]:
test_loss,test_accuracy=model_rnn_bi.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.8615 - loss: 0.4136
Test accuracy: 86.2298%


In [59]:
val_loss,val_accuracy=model_rnn_bi.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8682 - loss: 0.4011
Validation accuracy: 86.3944%


In [60]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for Bi-RNN:")
print(classification_report(y_test_encoded,y_pred_classes_rnn_bi, zero_division=1))

# Print Accuracy
print("\nAccuracy for Bi-RNN:")
print(accuracy_score(y_test_encoded,y_pred_classes_rnn_bi))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_classes_rnn_bi, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_classes_rnn_bi, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_classes_rnn_bi, average='weighted'))


Classification Report for Bi-RNN:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88      8498
           1       0.93      0.93      0.93      8542
           2       0.89      0.94      0.91      2682
           3       0.70      0.75      0.72      2786
           4       0.72      0.70      0.71       300
           5       0.90      0.56      0.69      1865
           6       0.53      0.79      0.63       461

    accuracy                           0.86     25134
   macro avg       0.79      0.79      0.78     25134
weighted avg       0.87      0.86      0.86     25134


Accuracy for Bi-RNN:
0.8622980822789846

F1 Score:
0.8613868529380244

Precision Score:
0.8677603382472187

Recall Score:
0.8622980822789846


In [61]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,GRU,Bidirectional

model_gru_bi=Sequential([
    Embedding(input_dim=5000, output_dim=128,input_length=max_sequence_length),
    Bidirectional(GRU(128,return_sequences=True)),
    Bidirectional(GRU(64)),
    Dense(64,activation='relu'),
    Dense(7,activation='softmax')
])
model_gru_bi.build(input_shape=(None,max_sequence_length))
model_gru_bi.summary()



In [62]:
model_gru_bi.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [63]:
history=model_gru_bi.fit(X_train_padded, y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded, y_test_encoded),verbose=1,callbacks=early_stopping)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 57ms/step - accuracy: 0.7436 - loss: 0.7299 - val_accuracy: 0.8825 - val_loss: 0.3285
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 51ms/step - accuracy: 0.9078 - loss: 0.2577 - val_accuracy: 0.8909 - val_loss: 0.3029
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 51ms/step - accuracy: 0.9297 - loss: 0.2005 - val_accuracy: 0.8901 - val_loss: 0.3095
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 51ms/step - accuracy: 0.9398 - loss: 0.1686 - val_accuracy: 0.8875 - val_loss: 0.3345
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 51ms/step - accuracy: 0.9503 - loss: 0.1421 - val_accuracy: 0.8812 - val_loss: 0.3637
Epoch 6/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 51ms/step - accuracy: 0.9574 - loss: 0.1210 - val_accuracy: 0.8842 - val_loss: 0.3705
Epoc

In [64]:
import numpy as np
y_pred_gru_bi=model_gru_bi.predict(X_test_padded)
y_pred_classes_gru_bi=np.argmax(y_pred_gru_bi,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step


In [65]:
test_loss,test_accuracy=model_gru_bi.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.8891 - loss: 0.3101
Test accuracy: 89.0945%


In [66]:
val_loss,val_accuracy=model_gru_bi.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8920 - loss: 0.3038
Validation accuracy: 88.8609%


In [67]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for Bi-GRU:")
print(classification_report(y_test_encoded,y_pred_classes_gru_bi, zero_division=1))

# Print Accuracy
print("\nAccuracy for Bi-GRU:")
print(accuracy_score(y_test_encoded,y_pred_classes_gru_bi))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_classes_gru_bi, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_classes_gru_bi, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_classes_gru_bi, average='weighted'))


Classification Report for Bi-GRU:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      8498
           1       0.94      0.94      0.94      8542
           2       0.92      0.93      0.92      2682
           3       0.81      0.74      0.77      2786
           4       0.75      0.88      0.81       300
           5       0.84      0.79      0.81      1865
           6       0.76      0.80      0.78       461

    accuracy                           0.89     25134
   macro avg       0.84      0.85      0.85     25134
weighted avg       0.89      0.89      0.89     25134


Accuracy for Bi-GRU:
0.8909445372801782

F1 Score:
0.8902743202481717

Precision Score:
0.8902860846599424

Recall Score:
0.8909445372801782


In [69]:
from sklearn.linear_model import LogisticRegression
logistic_model=LogisticRegression(max_iter=1000,penalty="l2",C=1.0,solver="lbfgs")
logistic_model.fit(X_train_tfidf,y_train_encoded)
y_pred_log=logistic_model.predict(X_test_tfidf)

In [70]:
y_val_pred=logistic_model.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 89.3118%


In [71]:
# Print Classification Report
print("\nClassification Report for Logistic Regression:")
print(classification_report(y_test_encoded, y_pred_log, zero_division=1))

# Print Accuracy
print("\nAccuracy for Logistic Regression:")
print(accuracy_score(y_test_encoded, y_pred_log))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_log, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded, y_pred_log, average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded, y_pred_log, average='weighted'))


Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      8498
           1       0.94      0.94      0.94      8542
           2       0.94      0.93      0.94      2682
           3       0.80      0.74      0.77      2786
           4       0.93      0.79      0.86       300
           5       0.84      0.78      0.81      1865
           6       0.83      0.69      0.75       461

    accuracy                           0.89     25134
   macro avg       0.88      0.83      0.85     25134
weighted avg       0.89      0.89      0.89     25134


Accuracy for Logistic Regression:
0.8928543009469245

F1 Score:
0.8918525290836666

Precision Score:
0.8922322664003801

Recall Score:
0.8928543009469245


In [72]:
from sklearn.svm import SVC
svm1=SVC(kernel='rbf',C=1.0,gamma='scale') 
svm1.fit(X_train_tfidf,y_train_encoded)
y_pred_svm=svm1.predict(X_test_tfidf)

In [73]:
y_val_pred=svm1.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 88.7681%


In [74]:
# Print Classification Report
print("\nClassification Report for SVM:")
print(classification_report(y_test_encoded, y_pred_svm, zero_division=1))

# Print Accuracy
print("\nAccuracy for SVM:")
print(accuracy_score(y_test_encoded, y_pred_svm))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_svm, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded, y_pred_svm, average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded, y_pred_svm, average='weighted'))


Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      8498
           1       0.94      0.93      0.94      8542
           2       0.92      0.94      0.93      2682
           3       0.80      0.72      0.76      2786
           4       0.83      0.83      0.83       300
           5       0.83      0.78      0.80      1865
           6       0.75      0.72      0.73       461

    accuracy                           0.89     25134
   macro avg       0.85      0.83      0.84     25134
weighted avg       0.89      0.89      0.89     25134


Accuracy for SVM:
0.8870056497175142

F1 Score:
0.8860160513331892

Precision Score:
0.8863234876369831

Recall Score:
0.8870056497175142


In [75]:
from sklearn.svm import SVC
svm2=SVC(kernel='rbf',C=100,gamma='scale') 
svm2.fit(X_train_tfidf,y_train_encoded)
y_pred_svm2=svm2.predict(X_test_tfidf)

In [76]:
y_val_pred=svm2.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 89.4178%


In [78]:
# Print Classification Report
print("\nClassification Report for SVM with C=100:")
print(classification_report(y_test_encoded, y_pred_svm2, zero_division=1))

# Print Accuracy
print("\nAccuracy for SVM with C=100:")
print(accuracy_score(y_test_encoded, y_pred_svm2))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_svm2, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded, y_pred_svm2, average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded, y_pred_svm2, average='weighted'))


Classification Report for SVM with C=100:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      8498
           1       0.94      0.93      0.94      8542
           2       0.93      0.94      0.94      2682
           3       0.79      0.73      0.76      2786
           4       0.85      0.87      0.86       300
           5       0.82      0.79      0.81      1865
           6       0.76      0.74      0.75       461

    accuracy                           0.89     25134
   macro avg       0.85      0.85      0.85     25134
weighted avg       0.89      0.89      0.89     25134


Accuracy for SVM with C=100:
0.8887164796689743

F1 Score:
0.8880875604387555

Precision Score:
0.8880196206297097

Recall Score:
0.8887164796689743


In [79]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=100,random_state=42)
rf.fit(X_train_tfidf,y_train_encoded)
y_pred_rf=rf.predict(X_test_tfidf)

In [80]:
y_val_pred=rf.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 84.4053%


In [81]:
# Print Classification Report
print("\nClassification Report for RandomForest Classifier:")
print(classification_report(y_test_encoded,y_pred_rf, zero_division=1))

# Print Accuracy
print("\nAccuracy for RandomForest Classifier:")
print(accuracy_score(y_test_encoded,y_pred_rf))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_rf, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_rf,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_rf,average='weighted'))


Classification Report for RandomForest Classifier:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85      8498
           1       0.90      0.93      0.91      8542
           2       0.93      0.89      0.91      2682
           3       0.75      0.55      0.63      2786
           4       0.87      0.73      0.79       300
           5       0.79      0.70      0.74      1865
           6       0.76      0.58      0.66       461

    accuracy                           0.85     25134
   macro avg       0.83      0.75      0.78     25134
weighted avg       0.84      0.85      0.84     25134


Accuracy for RandomForest Classifier:
0.8466618922574998

F1 Score:
0.8421604224870318

Precision Score:
0.8437025159854344

Recall Score:
0.8466618922574998


In [82]:
from sklearn.linear_model import SGDClassifier
sgd=SGDClassifier(loss='hinge',penalty='elasticnet',max_iter=1000,learning_rate='adaptive',eta0=0.01)
sgd.fit(X_train_tfidf,y_train_encoded)
y_pred_sgd=sgd.predict(X_test_tfidf)

In [83]:
y_val_pred=sgd.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 86.0894%


In [84]:
# Print Classification Report
print("\nClassification Report for SGD Classifier:")
print(classification_report(y_test_encoded,y_pred_sgd, zero_division=1))

# Print Accuracy
print("\nAccuracy for SGD Classifier:")
print(accuracy_score(y_test_encoded,y_pred_sgd))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_sgd, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_sgd,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_sgd,average='weighted'))


Classification Report for SGD Classifier:
              precision    recall  f1-score   support

           0       0.81      0.93      0.87      8498
           1       0.92      0.94      0.93      8542
           2       0.91      0.93      0.92      2682
           3       0.83      0.54      0.65      2786
           4       0.87      0.69      0.77       300
           5       0.84      0.70      0.76      1865
           6       0.78      0.60      0.68       461

    accuracy                           0.87     25134
   macro avg       0.85      0.76      0.80     25134
weighted avg       0.87      0.87      0.86     25134


Accuracy for SGD Classifier:
0.8656401686957905

F1 Score:
0.8599670629088709

Precision Score:
0.865655823580858

Recall Score:
0.8656401686957905


In [85]:
from sklearn.linear_model import SGDClassifier
sgd1=SGDClassifier(loss='log_loss',penalty='elasticnet',max_iter=1000,learning_rate='adaptive',eta0=0.01)
sgd1.fit(X_train_tfidf,y_train_encoded)
y_pred_sgd1=sgd1.predict(X_test_tfidf)

In [86]:
y_val_pred=sgd1.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 83.2383%


In [87]:
# Print Classification Report
print("\nClassification Report for SGD Classifier with log loss:")
print(classification_report(y_test_encoded,y_pred_sgd1, zero_division=1))

# Print Accuracy
print("\nAccuracy for SGD Classifier:")
print(accuracy_score(y_test_encoded,y_pred_sgd1))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_sgd1, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_sgd1,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_sgd1,average='weighted'))


Classification Report for SGD Classifier with log loss:
              precision    recall  f1-score   support

           0       0.77      0.95      0.85      8498
           1       0.89      0.93      0.91      8542
           2       0.94      0.88      0.91      2682
           3       0.80      0.49      0.61      2786
           4       0.96      0.37      0.54       300
           5       0.85      0.59      0.70      1865
           6       0.92      0.37      0.53       461

    accuracy                           0.84     25134
   macro avg       0.88      0.65      0.72     25134
weighted avg       0.84      0.84      0.83     25134


Accuracy for SGD Classifier:
0.8382271027293706

F1 Score:
0.8282984729307973

Precision Score:
0.8437126129661108

Recall Score:
0.8382271027293706


In [93]:
from sklearn.svm import LinearSVC
lin_svc=LinearSVC(penalty='l2',loss='squared_hinge',C=0.1,max_iter=1000,random_state=42)
lin_svc.fit(X_train_tfidf,y_train_encoded)
y_pred_lin_svc=lin_svc.predict(X_test_tfidf)

In [94]:
y_val_pred=lin_svc.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 88.8344%


In [95]:
# Print Classification Report
print("\nClassification Report for LinearSVC:")
print(classification_report(y_test_encoded,y_pred_lin_svc, zero_division=1))

# Print Accuracy
print("\nAccuracy for LinearSVC:")
print(accuracy_score(y_test_encoded,y_pred_lin_svc))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_lin_svc, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_lin_svc,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_lin_svc,average='weighted'))


Classification Report for LinearSVC:
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      8498
           1       0.94      0.94      0.94      8542
           2       0.93      0.94      0.93      2682
           3       0.81      0.68      0.74      2786
           4       0.90      0.79      0.84       300
           5       0.83      0.77      0.80      1865
           6       0.81      0.72      0.76       461

    accuracy                           0.89     25134
   macro avg       0.87      0.82      0.84     25134
weighted avg       0.89      0.89      0.89     25134


Accuracy for LinearSVC:
0.8891939205856608

F1 Score:
0.8873413823646068

Precision Score:
0.8879907940144418

Recall Score:
0.8891939205856608


In [96]:
from sklearn.neural_network import MLPClassifier
mlp=MLPClassifier(hidden_layer_sizes=(130,),activation='relu',solver='adam',learning_rate='adaptive',nesterovs_momentum=True,verbose=1)
mlp.fit(X_train_tfidf,y_train_encoded)
y_pred_mlp=mlp.predict(X_test_tfidf)

Iteration 1, loss = 0.87251656
Iteration 2, loss = 0.31757904
Iteration 3, loss = 0.24146272
Iteration 4, loss = 0.20924093
Iteration 5, loss = 0.18931645
Iteration 6, loss = 0.17549480
Iteration 7, loss = 0.16477856
Iteration 8, loss = 0.15619360
Iteration 9, loss = 0.14905659
Iteration 10, loss = 0.14302496
Iteration 11, loss = 0.13747821
Iteration 12, loss = 0.13245035
Iteration 13, loss = 0.12828921
Iteration 14, loss = 0.12391999
Iteration 15, loss = 0.11964886
Iteration 16, loss = 0.11560835
Iteration 17, loss = 0.11188833
Iteration 18, loss = 0.10840349
Iteration 19, loss = 0.10489654
Iteration 20, loss = 0.10142328
Iteration 21, loss = 0.09796240
Iteration 22, loss = 0.09453073
Iteration 23, loss = 0.09156237
Iteration 24, loss = 0.08787415
Iteration 25, loss = 0.08454643
Iteration 26, loss = 0.08157091
Iteration 27, loss = 0.07811659
Iteration 28, loss = 0.07526047
Iteration 29, loss = 0.07254516
Iteration 30, loss = 0.06909435
Iteration 31, loss = 0.06620043
Iteration 32, los

In [97]:
y_val_pred=mlp.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 87.9326%


In [98]:
# Print Classification Report
print("\nClassification Report for MLP Classifier :")
print(classification_report(y_test_encoded,y_pred_mlp, zero_division=1))

# Print Accuracy
print("\nAccuracy for MLP Classifier:")
print(accuracy_score(y_test_encoded,y_pred_mlp))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_mlp, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_mlp,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_mlp,average='weighted'))


Classification Report for MLP Classifier :
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      8498
           1       0.93      0.92      0.93      8542
           2       0.91      0.93      0.92      2682
           3       0.75      0.72      0.73      2786
           4       0.83      0.89      0.86       300
           5       0.76      0.81      0.79      1865
           6       0.78      0.77      0.78       461

    accuracy                           0.88     25134
   macro avg       0.84      0.85      0.84     25134
weighted avg       0.88      0.88      0.88     25134


Accuracy for MLP Classifier:
0.8765815230365243

F1 Score:
0.8765391672046253

Precision Score:
0.8767350741421566

Recall Score:
0.8765815230365243


In [99]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier(random_state=42,max_depth=10)
dt.fit(X_train_tfidf,y_train_encoded)
y_pred_dt=dt.predict(X_test_tfidf)

In [100]:
y_val_pred=dt.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 61.9546%


In [101]:
# Print Classification Report
print("\nClassification Report for Decision tree Classifier :")
print(classification_report(y_test_encoded,y_pred_dt, zero_division=1))

# Print Accuracy
print("\nAccuracy for Decision Classifier:")
print(accuracy_score(y_test_encoded,y_pred_dt))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_dt, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_dt,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_dt,average='weighted'))


Classification Report for Decision tree Classifier :
              precision    recall  f1-score   support

           0       0.48      0.98      0.64      8498
           1       0.95      0.60      0.74      8542
           2       0.93      0.81      0.86      2682
           3       1.00      0.00      0.01      2786
           4       1.00      0.00      0.00       300
           5       0.00      0.00      0.00      1865
           6       1.00      0.00      0.00       461

    accuracy                           0.62     25134
   macro avg       0.76      0.34      0.32     25134
weighted avg       0.72      0.62      0.56     25134


Accuracy for Decision Classifier:
0.6215087132967295

F1 Score:
0.5603259964219454

Precision Score:
0.7247912341798444

Recall Score:
0.6215087132967295


In [102]:
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()
nb.fit(X_train_tfidf, y_train_encoded)
y_pred_nb=nb.predict(X_test_tfidf)

In [103]:
y_val_pred=nb.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 78.5837%


In [104]:
# Print Classification Report
print("\nClassification Report for Naive Bayes Classifier :")
print(classification_report(y_test_encoded,y_pred_nb, zero_division=1))

# Print Accuracy
print("\nAccuracy for Naive Bayes Classifier:")
print(accuracy_score(y_test_encoded,y_pred_nb))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_nb, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_nb,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_nb,average='weighted'))


Classification Report for Naive Bayes Classifier :
              precision    recall  f1-score   support

           0       0.79      0.87      0.83      8498
           1       0.76      0.89      0.82      8542
           2       0.92      0.71      0.80      2682
           3       0.74      0.49      0.59      2786
           4       0.98      0.39      0.56       300
           5       0.77      0.58      0.66      1865
           6       0.86      0.40      0.54       461

    accuracy                           0.78     25134
   macro avg       0.83      0.62      0.69     25134
weighted avg       0.79      0.78      0.78     25134


Accuracy for Naive Bayes Classifier:
0.7839977719423888

F1 Score:
0.7756350075659006

Precision Score:
0.7886859928127729

Recall Score:
0.7839977719423888


In [105]:
from sklearn.ensemble import GradientBoostingClassifier
gbm=GradientBoostingClassifier(n_estimators=100,random_state=42,max_depth=10)
gbm.fit(X_train_tfidf,y_train_encoded)
y_pred_gbm=gbm.predict(X_test_tfidf)

In [106]:
y_val_pred=gbm.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 84.2992%


In [107]:
# Print Classification Report
print("\nClassification Report for Grdient Boosting Classifier :")
print(classification_report(y_test_encoded,y_pred_gbm, zero_division=1))

# Print Accuracy
print("\nAccuracy for Gradient Boosting Classifier:")
print(accuracy_score(y_test_encoded,y_pred_gbm))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_gbm, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_gbm,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_gbm,average='weighted'))


Classification Report for Grdient Boosting Classifier :
              precision    recall  f1-score   support

           0       0.78      0.92      0.84      8498
           1       0.94      0.91      0.92      8542
           2       0.90      0.93      0.92      2682
           3       0.78      0.53      0.63      2786
           4       0.77      0.75      0.76       300
           5       0.82      0.68      0.74      1865
           6       0.66      0.62      0.64       461

    accuracy                           0.85     25134
   macro avg       0.81      0.76      0.78     25134
weighted avg       0.85      0.85      0.84     25134


Accuracy for Gradient Boosting Classifier:
0.8465425320283282

F1 Score:
0.8425259321390922

Precision Score:
0.8487291501800371

Recall Score:
0.8465425320283282


In [108]:
import lightgbm as lgb
lgbm=lgb.LGBMClassifier(n_estimators=150,max_depth=50)
lgbm.fit(X_train_tfidf,y_train_encoded)
y_pred_lgbm=lgbm.predict(X_test_tfidf)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.325969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 162540
[LightGBM] [Info] Number of data points in the train set: 67860, number of used features: 4954
[LightGBM] [Info] Start training from score -1.084347
[LightGBM] [Info] Start training from score -1.079217
[LightGBM] [Info] Start training from score -2.237826
[LightGBM] [Info] Start training from score -2.199615
[LightGBM] [Info] Start training from score -4.425702
[LightGBM] [Info] Start training from score -2.601232
[LightGBM] [Info] Start training from score -3.999115


In [109]:
y_val_pred=lgbm.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 87.8000%


In [110]:
# Print Classification Report
print("\nClassification Report for LightGBM Classifier :")
print(classification_report(y_test_encoded,y_pred_lgbm, zero_division=1))

# Print Accuracy
print("\nAccuracy for LightGBM Classifier:")
print(accuracy_score(y_test_encoded,y_pred_lgbm))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_lgbm, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_lgbm,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_lgbm,average='weighted'))


Classification Report for LightGBM Classifier :
              precision    recall  f1-score   support

           0       0.86      0.91      0.89      8498
           1       0.94      0.94      0.94      8542
           2       0.93      0.94      0.93      2682
           3       0.78      0.69      0.74      2786
           4       0.84      0.81      0.82       300
           5       0.83      0.77      0.80      1865
           6       0.77      0.74      0.75       461

    accuracy                           0.88     25134
   macro avg       0.85      0.83      0.84     25134
weighted avg       0.88      0.88      0.88     25134


Accuracy for LightGBM Classifier:
0.8846980186201957

F1 Score:
0.8834925612673156

Precision Score:
0.8834802871736203

Recall Score:
0.8846980186201957


In [111]:
from sklearn.tree import ExtraTreeClassifier
ext_class=ExtraTreeClassifier(max_depth=20,random_state=42,splitter='best')
ext_class.fit(X_train_tfidf,y_train_encoded)
y_pred_ext_class=ext_class.predict(X_test_tfidf)

In [112]:
y_val_pred=ext_class.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 47.2484%


In [113]:
# Print Classification Report
print("\nClassification Report for Extratree Classifier :")
print(classification_report(y_test_encoded,y_pred_ext_class, zero_division=1))

# Print Accuracy
print("\nAccuracy for ExtraTree Classifier:")
print(accuracy_score(y_test_encoded,y_pred_ext_class))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_ext_class, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_ext_class,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_ext_class,average='weighted'))


Classification Report for Extratree Classifier :
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      8498
           1       0.85      0.32      0.46      8542
           2       0.88      0.10      0.18      2682
           3       0.69      0.13      0.21      2786
           4       0.77      0.33      0.47       300
           5       0.62      0.08      0.14      1865
           6       1.00      0.00      0.00       461

    accuracy                           0.47     25134
   macro avg       0.74      0.28      0.29     25134
weighted avg       0.67      0.47      0.41     25134


Accuracy for ExtraTree Classifier:
0.4727062942627516

F1 Score:
0.4075140543967474

Precision Score:
0.6688009700940755

Recall Score:
0.4727062942627516


In [114]:
from sklearn.ensemble import AdaBoostClassifier
adb_class=AdaBoostClassifier(learning_rate=0.1)
adb_class.fit(X_train_tfidf,y_train_encoded)
y_pred_adb=adb_class.predict(X_test_tfidf)

In [115]:
y_val_pred=adb_class.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 51.7703%


In [116]:
# Print Classification Report
print("\nClassification Report for AdaBoost Classifier :")
print(classification_report(y_test_encoded,y_pred_adb, zero_division=1))

# Print Accuracy
print("\nAccuracy for AdaBoost Classifier:")
print(accuracy_score(y_test_encoded,y_pred_adb))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_adb, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_adb,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_adb,average='weighted'))


Classification Report for AdaBoost Classifier :
              precision    recall  f1-score   support

           0       0.43      0.98      0.60      8498
           1       0.79      0.30      0.44      8542
           2       0.89      0.80      0.84      2682
           3       1.00      0.00      0.00      2786
           4       1.00      0.00      0.01       300
           5       1.00      0.00      0.00      1865
           6       1.00      0.00      0.01       461

    accuracy                           0.52     25134
   macro avg       0.87      0.30      0.27     25134
weighted avg       0.72      0.52      0.44     25134


Accuracy for AdaBoost Classifier:
0.5205697461605793

F1 Score:
0.44133331588019853

Precision Score:
0.7245501450672243

Recall Score:
0.5205697461605793


In [117]:
X_train_vect_dense=X_train_tfidf.toarray()
X_test_vect_dense=X_test_tfidf.toarray()
X_val_vect_dense=X_val_tfidf.toarray()
from sklearn.ensemble import HistGradientBoostingClassifier
hist_gdb = HistGradientBoostingClassifier(loss='log_loss',max_depth=25,random_state=42)
hist_gdb.fit(X_train_vect_dense,y_train_encoded)

y_pred_hgdb=hist_gdb.predict(X_test_vect_dense)

In [118]:
y_val_pred=hist_gdb.predict(X_val_vect_dense)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 87.0574%


In [119]:
# Print Classification Report
print("\nClassification Report for HistGradientBoostingClassifier:")
print(classification_report(y_test_encoded,y_pred_hgdb, zero_division=1))

# Print Accuracy
print("\nAccuracy for HistGradientBoostingClassifier:")
print(accuracy_score(y_test_encoded,y_pred_hgdb))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_hgdb, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_hgdb,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_hgdb,average='weighted'))


Classification Report for HistGradientBoostingClassifier:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      8498
           1       0.94      0.93      0.94      8542
           2       0.92      0.94      0.93      2682
           3       0.78      0.66      0.72      2786
           4       0.82      0.79      0.80       300
           5       0.82      0.75      0.78      1865
           6       0.74      0.72      0.73       461

    accuracy                           0.88     25134
   macro avg       0.84      0.81      0.82     25134
weighted avg       0.87      0.88      0.87     25134


Accuracy for HistGradientBoostingClassifier:
0.8753481340017506

F1 Score:
0.8736487415014405

Precision Score:
0.874039619233424

Recall Score:
0.8753481340017506


In [120]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_tfidf,y_train_encoded)
y_pred_knn=knn.predict(X_test_tfidf)

In [121]:
y_val_pred=knn.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 53.0964%


In [122]:
# Print Classification Report
print("\nClassification Report for K-NN Classifier :")
print(classification_report(y_test_encoded,y_pred_knn, zero_division=1))

# Print Accuracy
print("\nAccuracy for K-NN Classifier:")
print(accuracy_score(y_test_encoded,y_pred_knn))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_knn, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_knn,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_knn,average='weighted'))


Classification Report for K-NN Classifier :
              precision    recall  f1-score   support

           0       0.44      0.96      0.60      8498
           1       0.84      0.28      0.42      8542
           2       0.90      0.33      0.48      2682
           3       0.67      0.29      0.41      2786
           4       0.78      0.43      0.56       300
           5       0.66      0.45      0.53      1865
           6       0.72      0.14      0.23       461

    accuracy                           0.53     25134
   macro avg       0.72      0.41      0.46     25134
weighted avg       0.68      0.53      0.49     25134


Accuracy for K-NN Classifier:
0.5278905068831066

F1 Score:
0.4931734253314029

Precision Score:
0.6755676265028473

Recall Score:
0.5278905068831066


In [123]:
from xgboost import XGBClassifier
xgb=XGBClassifier(n_estimators=100,learning_rate=0.1,max_depth=10,random_state=42)
xgb.fit(X_train_tfidf, y_train_encoded)
y_pred_xgb=xgb.predict(X_test_tfidf)

In [124]:
y_val_pred=xgb.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 83.2913%


In [125]:
# Print Classification Report
print("\nClassification Report for XGBoost Classifier :")
print(classification_report(y_test_encoded,y_pred_xgb, zero_division=1))

# Print Accuracy
print("\nAccuracy for XGBoost Classifier:")
print(accuracy_score(y_test_encoded,y_pred_xgb))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_xgb, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_xgb,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_xgb,average='weighted'))


Classification Report for XGBoost Classifier :
              precision    recall  f1-score   support

           0       0.75      0.92      0.83      8498
           1       0.94      0.89      0.91      8542
           2       0.91      0.93      0.92      2682
           3       0.77      0.48      0.59      2786
           4       0.80      0.77      0.79       300
           5       0.81      0.65      0.72      1865
           6       0.75      0.61      0.67       461

    accuracy                           0.83     25134
   macro avg       0.82      0.75      0.78     25134
weighted avg       0.84      0.83      0.83     25134


Accuracy for XGBoost Classifier:
0.8342484284236492

F1 Score:
0.8289734253930116

Precision Score:
0.8393658164139163

Recall Score:
0.8342484284236492


In [129]:
import catboost
catb=catboost.CatBoostClassifier(iterations=100,learning_rate=0.1,depth=8,verbose=0)
catb.fit(X_train_tfidf,y_train_encoded)
y_pred_catb=catb.predict(X_test_tfidf)

In [130]:
y_val_pred=catb.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 76.5416%


In [131]:
# Print Classification Report
print("\nClassification Report for CatBoost Classifier :")
print(classification_report(y_test_encoded,y_pred_catb, zero_division=1))

# Print Accuracy
print("\nAccuracy for CatBoost Classifier:")
print(accuracy_score(y_test_encoded,y_pred_catb))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_catb, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_catb,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_catb,average='weighted'))


Classification Report for CatBoost Classifier :
              precision    recall  f1-score   support

           0       0.64      0.95      0.76      8498
           1       0.93      0.87      0.90      8542
           2       0.92      0.89      0.90      2682
           3       0.84      0.20      0.32      2786
           4       0.89      0.44      0.59       300
           5       0.83      0.43      0.56      1865
           6       0.81      0.29      0.43       461

    accuracy                           0.78     25134
   macro avg       0.84      0.58      0.64     25134
weighted avg       0.81      0.78      0.75     25134


Accuracy for CatBoost Classifier:
0.7750457547545158

F1 Score:
0.7524154757989663

Precision Score:
0.8110280834709155

Recall Score:
0.7750457547545158


In [126]:
from sklearn.svm import NuSVC
nu_svc=NuSVC(nu=0.001,kernel='rbf',gamma='scale',coef0=0.0)
nu_svc.fit(X_train_tfidf,y_train_encoded)
y_pred_nu_svc=nu_svc.predict(X_test_tfidf)

In [127]:
y_val_pred=nu_svc.predict(X_val_tfidf)  
val_accuracy=accuracy_score(y_val_encoded,y_val_pred)  
print(f"Validation Accuracy: {val_accuracy * 100:.4f}%")

Validation Accuracy: 85.1744%


In [128]:
# Print Classification Report
print("\nClassification Report for NuSVC:")
print(classification_report(y_test_encoded,y_pred_nu_svc, zero_division=1))

# Print Accuracy
print("\nAccuracy for NuSVC:")
print(accuracy_score(y_test_encoded,y_pred_nu_svc))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_nu_svc, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_nu_svc,average='weighted', zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_nu_svc, average='weighted'))


Classification Report for NuSVC:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      8498
           1       0.93      0.93      0.93      8542
           2       0.93      0.94      0.93      2682
           3       0.67      0.60      0.63      2786
           4       0.84      0.87      0.86       300
           5       0.75      0.68      0.71      1865
           6       0.75      0.74      0.75       461

    accuracy                           0.85     25134
   macro avg       0.81      0.80      0.81     25134
weighted avg       0.85      0.85      0.85     25134


Accuracy for NuSVC:
0.8486114426673033

F1 Score:
0.8471194162640453

Precision Score:
0.846536166152234

Recall Score:
0.8486114426673033


In [133]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Conv1D, Bidirectional, LSTM, Dense, GlobalMaxPooling1D, Dropout
from tensorflow.keras.models import Model

vocab_size=5000
embed_dim=128
max_len=100
num_classes=7

# RCNN Architecture
inputs=Input(shape=(max_len,))
x=Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_len)(inputs)
x=Conv1D(filters=64,kernel_size=5,activation='relu',padding='same')(x)
x=Bidirectional(LSTM(64,return_sequences=True))(x)
x=GlobalMaxPooling1D()(x)
x=Dropout(0.5)(x)
outputs=Dense(num_classes,activation='softmax')(x)

rcnn_model=Model(inputs, outputs)
rcnn_model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])



In [134]:
rcnn_model.summary()

In [135]:
rcnn_model.fit(X_train_padded,y_train_encoded,validation_data=(X_test_padded,y_test_encoded),batch_size=64,epochs=25,callbacks=early_stopping,verbose=1)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 22ms/step - accuracy: 0.6870 - loss: 0.8881 - val_accuracy: 0.8820 - val_loss: 0.3497
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 17ms/step - accuracy: 0.8986 - loss: 0.3033 - val_accuracy: 0.8862 - val_loss: 0.3215
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 17ms/step - accuracy: 0.9198 - loss: 0.2359 - val_accuracy: 0.8914 - val_loss: 0.3165
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 17ms/step - accuracy: 0.9337 - loss: 0.1956 - val_accuracy: 0.8869 - val_loss: 0.3323
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 17ms/step - accuracy: 0.9454 - loss: 0.1628 - val_accuracy: 0.8843 - val_loss: 0.3659


<keras.src.callbacks.history.History at 0x7b8751fd4850>

In [136]:
import numpy as np
y_pred_rcnn_model=rcnn_model.predict(X_test_padded)
y_pred_classes_rcnn_model=np.argmax(y_pred_rcnn_model,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step


In [137]:
val_loss,val_accuracy=rcnn_model.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8820 - loss: 0.3473
Validation accuracy: 88.0387%


In [138]:
test_loss,test_accuracy=rcnn_model.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8798 - loss: 0.3575
Test accuracy: 88.1993%


In [139]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for RCNN:")
print(classification_report(y_test_encoded,y_pred_classes_rcnn_model, zero_division=1))

# Print Accuracy
print("\nAccuracy for RCNN:")
print(accuracy_score(y_test_encoded,y_pred_classes_rcnn_model))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded,y_pred_classes_rcnn_model, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_classes_rcnn_model, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_classes_rcnn_model, average='weighted'))


Classification Report for RCNN:
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      8498
           1       0.93      0.94      0.94      8542
           2       0.88      0.96      0.92      2682
           3       0.78      0.69      0.73      2786
           4       0.83      0.74      0.78       300
           5       0.82      0.76      0.79      1865
           6       0.72      0.69      0.70       461

    accuracy                           0.88     25134
   macro avg       0.83      0.81      0.82     25134
weighted avg       0.88      0.88      0.88     25134


Accuracy for RCNN:
0.8819925200923052

F1 Score:
0.8803115047540795

Precision Score:
0.8799617212608657

Recall Score:
0.8819925200923052


In [142]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, MultiHeadAttention, Dense, GlobalAveragePooling1D


vocab_size=5000
embed_dim=128
max_sequence_length=100  
num_classes=7
# Input layer
inputs=Input(shape=(max_sequence_length,))
# Embedding layer
x=Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_sequence_length)(inputs)
# MultiHeadAttention layer
x=MultiHeadAttention(num_heads=8,key_dim=embed_dim)(x,x)  
# Feedforward layers
x=Dense(64,activation='relu')(x)
x=Dense(32,activation='relu')(x)
x=GlobalAveragePooling1D()(x)
# Output layer
outputs=Dense(num_classes, activation='softmax')(x)
# Build the model
model_self_attention=Model(inputs=inputs,outputs=outputs)
model_self_attention.summary()

In [143]:
model_self_attention.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [144]:
history=model_self_attention.fit(X_train_padded, y_train_encoded,epochs=25,batch_size=64,validation_data=(X_test_padded,y_test_encoded),verbose=1,callbacks=early_stopping)

Epoch 1/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 23ms/step - accuracy: 0.5581 - loss: 1.0832 - val_accuracy: 0.8654 - val_loss: 0.3927
Epoch 2/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.8794 - loss: 0.3531 - val_accuracy: 0.8711 - val_loss: 0.3579
Epoch 3/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 18ms/step - accuracy: 0.9026 - loss: 0.2754 - val_accuracy: 0.8851 - val_loss: 0.3279
Epoch 4/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 18ms/step - accuracy: 0.9152 - loss: 0.2345 - val_accuracy: 0.8832 - val_loss: 0.3491
Epoch 5/25
[1m1061/1061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 18ms/step - accuracy: 0.9264 - loss: 0.2021 - val_accuracy: 0.8848 - val_loss: 0.3381


In [145]:
import numpy as np
y_pred_self_attention = model_self_attention.predict(X_test_padded)
y_pred_classes_self_attention = np.argmax(y_pred_self_attention,axis=1)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step


In [146]:
test_loss,test_accuracy=model_self_attention.evaluate(X_test_padded,y_test_encoded)
print(f"Test accuracy: {test_accuracy * 100:.4f}%")

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8644 - loss: 0.3959
Test accuracy: 86.5401%


In [147]:
val_loss,val_accuracy=model_self_attention.evaluate(X_val_padded,y_val_encoded)
print(f"Validation accuracy: {val_accuracy * 100:.4f}%")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8634 - loss: 0.3878
Validation accuracy: 86.1822%


In [148]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Print Classification Report
print("\nClassification Report for Self attention network")
print(classification_report(y_test_encoded,y_pred_classes_self_attention, zero_division=1))

# Print Accuracy
print("\nAccuracy for Self-attention network:")
print(accuracy_score(y_test_encoded,y_pred_classes_self_attention))

# Print F1 Score
print("\nF1 Score:")
print(f1_score(y_test_encoded, y_pred_classes_self_attention, average='weighted'))

# Print Precision Score
print("\nPrecision Score:")
print(precision_score(y_test_encoded,y_pred_classes_self_attention, average='weighted',zero_division=1))

# Print Recall Score
print("\nRecall Score:")
print(recall_score(y_test_encoded,y_pred_classes_self_attention, average='weighted'))


Classification Report for Self attention network
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      8498
           1       0.93      0.92      0.93      8542
           2       0.89      0.94      0.91      2682
           3       0.69      0.70      0.70      2786
           4       0.82      0.74      0.78       300
           5       0.85      0.67      0.75      1865
           6       0.81      0.49      0.61       461

    accuracy                           0.87     25134
   macro avg       0.83      0.77      0.79     25134
weighted avg       0.87      0.87      0.86     25134


Accuracy for Self-attention network:
0.8654014482374472

F1 Score:
0.8635125827776264

Precision Score:
0.8655193179992172

Recall Score:
0.8654014482374472
