In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from xgboost import XGBClassifier


# Loading the files

In [17]:
df_train = pd.read_csv('./datasets/train/train_text_seq.csv')
df_valid = pd.read_csv('./datasets/valid/valid_text_seq.csv')

# Extract input sequences and labels
X_train = df_train['input_str'].values
y_train = df_train['label'].values
X_valid = df_valid['input_str'].values
y_valid = df_valid['label'].values



# Preprocessing the data

In [18]:
# Encode the digits using LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(list('0123456789'))

# Convert input_str into sequences of encoded digits, removing the first three zeroes
def encode_sequence(sequence):
    return label_encoder.transform(list(sequence.lstrip('0')))  # Remove leading zeroes

X_train_encoded = [encode_sequence(seq) for seq in X_train]
X_valid_encoded = [encode_sequence(seq) for seq in X_valid]

# Pad the sequences to ensure they all have the same length (47 in this case)
X_train_padded = pad_sequences(X_train_encoded, maxlen=47, padding='post')
X_valid_padded = pad_sequences(X_valid_encoded, maxlen=47, padding='post')

# Convert the labels to categorical (0 or 1)
y_train_categorical = to_categorical(y_train, num_classes=2)
y_valid_categorical = to_categorical(y_valid, num_classes=2)

# Experimentation with different models

In [19]:
#apply logistic regression to the data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logreg = LogisticRegression()
logreg.fit(X_train_padded, y_train)
y_pred = logreg.predict(X_valid_padded)
print("Accuracy:", accuracy_score(y_valid, y_pred))

Accuracy: 0.5030674846625767


In [20]:
#apply random forest classification on the data
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_padded, y_train)
y_pred = rf.predict(X_valid_padded)
print("Accuracy:", accuracy_score(y_valid, y_pred))

Accuracy: 0.5460122699386503


In [21]:
#apply xgboost classification on the data after instaling xgboost
xgb = XGBClassifier()
xgb.fit(X_train_padded, y_train)
y_pred = xgb.predict(X_valid_padded)
print("Accuracy:", accuracy_score(y_valid, y_pred))


Accuracy: 0.6053169734151329


# Our Model - LSTM + CNN

In [30]:
total_samples = len(X_train_padded)
indices = np.arange(total_samples)
np.random.shuffle(indices)
train_size_80_random = int(total_samples* 0.8)
X_train_80 = X_train_padded[:train_size_80_random]
y_train_80 = y_train_categorical[:train_size_80_random]
model = Sequential()
model.add(Embedding(input_dim=10, output_dim=12, input_length=47))  # Embedding layer with output_dim=12
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))  # Convolutional layer with 32 filters
model.add(MaxPooling1D(pool_size=2))  # Max pooling layer
model.add(Dropout(0.3))  # Dropout layer
model.add(LSTM(24, return_sequences=False))  # LSTM layer with 24 units
model.add(Dense(32, activation='relu'))  # Dense layer with 32 units
model.add(Dense(2, activation='softmax'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train_categorical, 
          validation_data=(X_valid_padded, y_valid_categorical), 
          epochs=150, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_valid_padded, y_valid_categorical)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy}')

Epoch 1/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5127 - loss: 0.6930 - val_accuracy: 0.5194 - val_loss: 0.6889
Epoch 2/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5533 - loss: 0.6833 - val_accuracy: 0.6667 - val_loss: 0.6272
Epoch 3/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6381 - loss: 0.6370 - val_accuracy: 0.6789 - val_loss: 0.6003
Epoch 4/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6694 - loss: 0.6035 - val_accuracy: 0.7076 - val_loss: 0.5583
Epoch 5/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6936 - loss: 0.5790 - val_accuracy: 0.7117 - val_loss: 0.5483
Epoch 6/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7068 - loss: 0.5542 - val_accuracy: 0.7444 - val_loss: 0.5270
Epoch 7/150
[1m222/22

In [31]:
model.summary()

In [32]:
model.save('./models/dataset3.keras')

# Using 80%, 60%, 40%, 20% of dataset to train

Model using 80% of dataset

In [26]:
total_samples = len(X_train_padded)
indices = np.arange(total_samples)
np.random.shuffle(indices)
train_size_80_random = int(total_samples* 0.8)
X_train_80 = X_train_padded[:train_size_80_random]
y_train_80 = y_train_categorical[:train_size_80_random]
model = Sequential()
model.add(Embedding(input_dim=10, output_dim=12, input_length=47))  # Embedding layer with output_dim=12
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))  # Convolutional layer with 32 filters
model.add(MaxPooling1D(pool_size=2))  # Max pooling layer
model.add(Dropout(0.3))  # Dropout layer
model.add(LSTM(24, return_sequences=False))  # LSTM layer with 24 units
model.add(Dense(32, activation='relu'))  # Dense layer with 32 units
model.add(Dense(2, activation='softmax'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_80, y_train_80, 
          validation_data=(X_valid_padded, y_valid_categorical), 
          epochs=250, batch_size=32)

# Evaluate the model
loss, accuracy80 = model.evaluate(X_valid_padded, y_valid_categorical)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy80}')



Epoch 1/250
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.5058 - loss: 0.6932 - val_accuracy: 0.5174 - val_loss: 0.6891
Epoch 2/250
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5648 - loss: 0.6745 - val_accuracy: 0.6687 - val_loss: 0.6100
Epoch 3/250
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6553 - loss: 0.6229 - val_accuracy: 0.7035 - val_loss: 0.5799
Epoch 4/250
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6747 - loss: 0.5924 - val_accuracy: 0.7423 - val_loss: 0.5426
Epoch 5/250
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7179 - loss: 0.5527 - val_accuracy: 0.7403 - val_loss: 0.5282
Epoch 6/250
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7287 - loss: 0.5332 - val_accuracy: 0.7710 - val_loss: 0.5073
Epoch 7/250
[1m222/22

Model using 60% of dataset

In [28]:
indices = np.arange(total_samples)
np.random.shuffle(indices)
train_size_60_random = int(total_samples* 0.6)
X_train_60 = X_train_padded[:train_size_60_random]
y_train_60 = y_train_categorical[:train_size_60_random]
model = Sequential()
model.add(Embedding(input_dim=10, output_dim=12, input_length=47))  # Embedding layer with output_dim=12
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))  # Convolutional layer with 32 filters
model.add(MaxPooling1D(pool_size=2))  # Max pooling layer
model.add(Dropout(0.3))  # Dropout layer
model.add(LSTM(24, return_sequences=False))  # LSTM layer with 24 units
model.add(Dense(32, activation='relu'))  # Dense layer with 32 units
model.add(Dense(2, activation='softmax'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_60, y_train_60, 
          validation_data=(X_valid_padded, y_valid_categorical), 
          epochs=250, batch_size=32)

# Evaluate the model
loss, accuracy60 = model.evaluate(X_valid_padded, y_valid_categorical)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy60}')

Epoch 1/250




[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4946 - loss: 0.6933 - val_accuracy: 0.5256 - val_loss: 0.6910
Epoch 2/250
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5241 - loss: 0.6898 - val_accuracy: 0.6401 - val_loss: 0.6368
Epoch 3/250
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6244 - loss: 0.6459 - val_accuracy: 0.6748 - val_loss: 0.6001
Epoch 4/250
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6404 - loss: 0.6300 - val_accuracy: 0.6912 - val_loss: 0.5793
Epoch 5/250
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6474 - loss: 0.6198 - val_accuracy: 0.6933 - val_loss: 0.5704
Epoch 6/250
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6830 - loss: 0.5904 - val_accuracy: 0.7321 - val_loss: 0.5456
Epoch 7/250
[1m133/133[0m [32m━

Model using 40% of dataset

In [26]:
total_samples = len(X_train_padded)
indices = np.arange(total_samples)
np.random.shuffle(indices)
train_size_40_random = int(total_samples* 0.4)
X_train_40 = X_train_padded[:train_size_40_random]
y_train_40 = y_train_categorical[:train_size_40_random]
model = Sequential()
model.add(Embedding(input_dim=10, output_dim=12, input_length=47))  # Embedding layer with output_dim=12
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))  # Convolutional layer with 32 filters
model.add(MaxPooling1D(pool_size=2))  # Max pooling layer
model.add(Dropout(0.3))  # Dropout layer
model.add(LSTM(24, return_sequences=False))  # LSTM layer with 24 units
model.add(Dense(32, activation='relu'))  # Dense layer with 32 units
model.add(Dense(2, activation='softmax'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_40, y_train_40, 
          validation_data=(X_valid_padded, y_valid_categorical), 
          epochs=250, batch_size=32)

# Evaluate the model
loss, accuracy40 = model.evaluate(X_valid_padded, y_valid_categorical)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy40}')

Epoch 1/250
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4781 - loss: 0.6932 - val_accuracy: 0.4847 - val_loss: 0.6940
Epoch 2/250
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5075 - loss: 0.6931 - val_accuracy: 0.5153 - val_loss: 0.6901
Epoch 3/250
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5531 - loss: 0.6861 - val_accuracy: 0.6360 - val_loss: 0.6414
Epoch 4/250
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6206 - loss: 0.6492 - val_accuracy: 0.6360 - val_loss: 0.6474
Epoch 5/250
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6340 - loss: 0.6415 - val_accuracy: 0.6912 - val_loss: 0.6036
Epoch 6/250
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6438 - loss: 0.6206 - val_accuracy: 0.6912 - val_loss: 0.5970
Epoch 7/250
[1m89/89[0m [32m━━━

Model using 20% of dataset

In [25]:
total_samples = len(X_train_padded)
indices = np.arange(total_samples)
np.random.shuffle(indices)
train_size_20_random = int(total_samples* 0.2)
X_train_20 = X_train_padded[:train_size_20_random]
y_train_20 = y_train_categorical[:train_size_20_random]
model = Sequential()
model.add(Embedding(input_dim=10, output_dim=12, input_length=47))  # Embedding layer with output_dim=12
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))  # Convolutional layer with 32 filters
model.add(MaxPooling1D(pool_size=2))  # Max pooling layer
model.add(Dropout(0.3))  # Dropout layer
model.add(LSTM(24, return_sequences=False))  # LSTM layer with 24 units
model.add(Dense(32, activation='relu'))  # Dense layer with 32 units
model.add(Dense(2, activation='softmax'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_20, y_train_20, 
          validation_data=(X_valid_padded, y_valid_categorical), 
          epochs=250, batch_size=32)

# Evaluate the model
loss, accuracy20 = model.evaluate(X_valid_padded, y_valid_categorical)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy20}')

Epoch 1/250




[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4841 - loss: 0.6943 - val_accuracy: 0.5153 - val_loss: 0.6928
Epoch 2/250
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4618 - loss: 0.6943 - val_accuracy: 0.5153 - val_loss: 0.6927
Epoch 3/250
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5294 - loss: 0.6924 - val_accuracy: 0.5194 - val_loss: 0.6930
Epoch 4/250
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5008 - loss: 0.6932 - val_accuracy: 0.5501 - val_loss: 0.6928
Epoch 5/250
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5104 - loss: 0.6930 - val_accuracy: 0.4847 - val_loss: 0.6930
Epoch 6/250
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4987 - loss: 0.6928 - val_accuracy: 0.4847 - val_loss: 0.6926
Epoch 7/250
[1m45/45[0m [32m━━━━━━━━━━━━━━

In [4]:
import matplotlib as mtpl
from matplotlib import pyplot as plt

Matplotlib is building the font cache; this may take a moment.


In [None]:
import matplotlib.pyplot as plt

# Percentages of data used
percentages = [100, 80, 60, 40, 20]

# Accuracies corresponding to the percentages 

accuracies = [accuracy,accuracy80,accuracy60,accuracy40,accuracy20]

# Plotting the accuracies
plt.figure(figsize=(10, 5))
plt.plot(percentages, accuracies, marker='o')
plt.title('Model Accuracy vs. Percentage of Data Used')
plt.xlabel('Percentage of Data Used')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()