# MLP
## read data

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import numpy as np

import copy
import random
import time

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
import pandas as pd
import json

from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

In [7]:


data1 = []
data2 = []
data3 = []
test = []

# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("domain1_train.json", 'r') as file:
    for line in file:
        dict = json.loads(line.strip())
        dict["domain"] = 1
        data3.append(dict)
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("domain2_train.json", 'r') as file:
    for line in file:
        dict = json.loads(line.strip())
        dict["domain"] = 2
        data3.append(dict)

with open("test_set.json", 'r') as file:
    for line in file:
        dict = json.loads(line.strip())
        test.append(dict)


# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("domain1_train.json", 'r') as file:
    for line in file:
        dict = json.loads(line.strip())
        dict["domain"] = 1
        data1.append(dict)
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("domain2_train.json", 'r') as file:
    for line in file:
        dict = json.loads(line.strip())
        dict["domain"] = 2
        data2.append(dict)

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
df3 = pd.DataFrame(data3)
test = pd.DataFrame(test)


y = pd.concat([df1['label'], df2['label']])

# Convert sequences to BoW representation
def sequences_to_bow(sequences, vocab_size):
    matrix = np.zeros((len(sequences), vocab_size))
    for i, sequence in enumerate(sequences):
        for token in sequence:
            if token < vocab_size:
                matrix[i][token] += 1
    return matrix

vocab_size = 5000  # as mentioned

X = sequences_to_bow(df3['text'], vocab_size)
X_pred = sequences_to_bow(test['text'], vocab_size)
Xtrain,Ytrain = np.array(X),np.array(y)
X_train,X_test,Y_train,Y_test = train_test_split(Xtrain,Ytrain,test_size=0.25,stratify=Ytrain)


## MLP baseline
- bow
- no sampling
- no weight
- test acc: 87%

In [8]:
model = Sequential()

model.add(Dense(50,input_shape=(5000,),activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(100,activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(150,activation='relu'))
model.add(Dropout(0.4))

model.add(Dense(200,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()
model.fit(X_train,Y_train,epochs=15,verbose=2)
model.evaluate(X_test,Y_test)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                250050    
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 100)               5100      
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 150)               15150     
                                                                 
 dropout_2 (Dropout)         (None, 150)               0         
                                                                 
 dense_3 (Dense)             (None, 200)               3

[0.7270073294639587, 0.8797674179077148]

## Optimised MLP
- parameter optimisation
- acc 0.9009302258491516

In [11]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model
optimized_model = Sequential()

optimized_model.add(Dense(128, input_shape=(5000,), activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001)
optimized_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

optimized_model.summary()
optimized_model.fit(X_train, Y_train, epochs=50, verbose=2, validation_split=0.1, batch_size=32, callbacks=[early_stopping])
optimized_model.evaluate(X_test, Y_test)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 128)               640128    
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 128)               16512     
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                                 
Total params: 656769 (2.51 MB)
Trainable params: 656769 (2.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
726/726

[0.3545358180999756, 0.9009302258491516]

## Optimised MLP - imbalance handling
- model-based
- SMOTE


In [18]:
df1.loc[df1['label'] == 0, 'model'] = 7 # machine no.7
df1.loc[df1['label'] == 1, 'model'] = 8 # human label as 8

df2['model'] = df2['model'].fillna(9) # human label as 9
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
df3['model'].describe()
 #0-7: machine; 8-9 human

from sklearn.utils import resample

# Split the dataset based on model values
dfs = [df3[df3['model'] == i] for i in range(10)]  # 0-9 models


In [9]:
df3.groupby('model').count()

Unnamed: 0_level_0,text,label,domain
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2364,2364,2364
1.0,2357,2357,2357
2.0,2339,2339,2339
3.0,2358,2358,2358
4.0,789,789,789
5.0,780,780,780
6.0,1763,1763,1763
7.0,9750,9750,9750
8.0,9750,9750,9750
9.0,2150,2150,2150


### model-based

In [20]:

target_samples = 1488

# Resample machine labels 0-7
for i in range(8):
    if len(dfs[i]) > target_samples:
        # Undersample if the count is greater than 1488
        dfs[i] = resample(dfs[i], replace=False, n_samples=target_samples, random_state=42)
    elif len(dfs[i]) < target_samples:
        # Oversample if the count is less than 1488
        dfs[i] = resample(dfs[i], replace=True, n_samples=target_samples, random_state=42)

# Note: Not resampling human labels 8 and 9, as they are to be kept as they are

# Concatenate results
df_resampled1 = pd.concat(dfs)

# Check the new distribution
print(df_resampled1['model'].value_counts())
#df_resampled

8.0    9750
9.0    2150
0.0    1488
1.0    1488
2.0    1488
3.0    1488
4.0    1488
5.0    1488
6.0    1488
7.0    1488
Name: model, dtype: int64


In [22]:
X = sequences_to_bow(df_resampled1['text'], vocab_size)
y = df_resampled1['label'].values

In [23]:
X_pred = sequences_to_bow(test['text'], 5000)
Xtrain,Ytrain = np.array(X),np.array(y) # model
X_train,X_test,Y_train,Y_test = train_test_split(Xtrain,Ytrain,test_size=0.25,stratify=Ytrain)


In [24]:
# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model
optimized_model = Sequential()

optimized_model.add(Dense(128, input_shape=(5000,), activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001)
optimized_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

optimized_model.summary()
optimized_model.fit(X_train, Y_train, epochs=50, verbose=2, validation_split=0.1, batch_size=32, callbacks=[early_stopping])
optimized_model.evaluate(X_test, Y_test)


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 128)               640128    
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                                 
 dense_10 (Dense)            (None, 128)               16512     
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_11 (Dense)            (None, 1)                 129       
                                                                 
Total params: 656769 (2.51 MB)
Trainable params: 656769 (2.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
503/503

[0.3600963056087494, 0.8924550414085388]

In [38]:
# Model
model = Sequential()

model.add(Dense(50,input_shape=(5000,),activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(100,activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(150,activation='relu'))
model.add(Dropout(0.4))

model.add(Dense(200,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

from sklearn.utils import class_weight

class_weights = {0: 1., 1: 2.}
model.fit(X, y, epochs=10, verbose=2, validation_split=0.1, batch_size=32, class_weight=class_weights)



Epoch 1/10
670/670 - 6s - loss: 0.1500 - accuracy: 0.9655 - val_loss: 36.4075 - val_accuracy: 0.1000 - 6s/epoch - 10ms/step
Epoch 2/10
670/670 - 5s - loss: 0.0522 - accuracy: 0.9877 - val_loss: 40.5819 - val_accuracy: 0.1004 - 5s/epoch - 8ms/step
Epoch 3/10
670/670 - 5s - loss: 0.0254 - accuracy: 0.9937 - val_loss: 68.1191 - val_accuracy: 0.0995 - 5s/epoch - 7ms/step
Epoch 4/10
670/670 - 5s - loss: 0.0204 - accuracy: 0.9957 - val_loss: 83.8880 - val_accuracy: 0.0987 - 5s/epoch - 7ms/step
Epoch 5/10
670/670 - 5s - loss: 0.0115 - accuracy: 0.9973 - val_loss: 82.4594 - val_accuracy: 0.0962 - 5s/epoch - 8ms/step
Epoch 6/10
670/670 - 5s - loss: 0.0090 - accuracy: 0.9980 - val_loss: 87.2169 - val_accuracy: 0.1008 - 5s/epoch - 7ms/step
Epoch 7/10
670/670 - 5s - loss: 0.0063 - accuracy: 0.9986 - val_loss: 145.6340 - val_accuracy: 0.0958 - 5s/epoch - 8ms/step
Epoch 8/10
670/670 - 5s - loss: 0.0049 - accuracy: 0.9990 - val_loss: 143.8586 - val_accuracy: 0.0983 - 5s/epoch - 7ms/step
Epoch 9/10
67

<keras.src.callbacks.History at 0x790297f1c100>

In [40]:
# 3. Predict on the new test data
y_pred = model.predict(X_pred)
y_class_pred = (y_pred > 0.5).astype(int)
list(y_class_pred).count(1)



286

### smote
- label-based(0/1): acc 0.91

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from collections import Counter

vocab_size = 5000

def sequences_to_bow(sequences, vocab_size):
    matrix = np.zeros((len(sequences), vocab_size))
    for i, sequence in enumerate(sequences):
        for token in sequence:
            if token < vocab_size:
                matrix[i][token] += 1
    return matrix

X_vectorized = sequences_to_bow(df3['text'], vocab_size)
y = df3['label'].values

# Display distribution before SMOTE
counter = Counter(y)
print('Before', counter)

# Apply SMOTE for balancing
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_vectorized, y)

# Display distribution after SMOTE
counter = Counter(y_resampled_smote)
print('After', counter)


Before Counter({0: 22500, 1: 11900})
After Counter({1: 22500, 0: 22500})


In [11]:
X_pred = sequences_to_bow(test['text'], 5000)
Xtrain,Ytrain = np.array(X_resampled_smote),np.array(y_resampled_smote) # smote
X_train,X_test,Y_train,Y_test = train_test_split(Xtrain,Ytrain,test_size=0.25,stratify=Ytrain)


In [6]:
# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model
optimized_model = Sequential()

optimized_model.add(Dense(128, input_shape=(5000,), activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001)
optimized_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

optimized_model.summary()
optimized_model.fit(X_train, Y_train, epochs=50, verbose=2, validation_split=0.1, batch_size=32, callbacks=[early_stopping])
optimized_model.evaluate(X_test, Y_test)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               640128    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 656769 (2.51 MB)
Trainable params: 656769 (2.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
950/950 -

[0.336286723613739, 0.9148444533348083]

In [None]:

# Model
optimized_model = Sequential()

optimized_model.add(Dense(128, input_shape=(5000,), activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001)
optimized_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

optimized_model.summary()
optimized_model.fit(Xtrain, Ytrain, epochs=10, verbose=2, validation_split=0.1, batch_size=32,class_weight=class_weights)
#optimized_model.evaluate(X_test, Y_test)


Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 128)               640128    
                                                                 
 dropout_30 (Dropout)        (None, 128)               0         
                                                                 
 dense_41 (Dense)            (None, 128)               16512     
                                                                 
 dropout_31 (Dropout)        (None, 128)               0         
                                                                 
 dense_42 (Dense)            (None, 1)                 129       
                                                                 
Total params: 656769 (2.51 MB)
Trainable params: 656769 (2.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
# 3. Predict on the new test data
y_pred = optimized_model.predict(X_pred)
y_class_pred = (y_pred > 0.5).astype(int)
list(y_class_pred).count(1)




296

### ADASYN

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import ADASYN  # Import ADASYN
from collections import Counter

vocab_size = 5000

def sequences_to_bow(sequences, vocab_size):
    matrix = np.zeros((len(sequences), vocab_size))
    for i, sequence in enumerate(sequences):
        for token in sequence:
            if token < vocab_size:
                matrix[i][token] += 1
    return matrix

X_vectorized = sequences_to_bow(df3['text'], vocab_size)
y = df3['label'].values

# Display distribution before ADASYN
counter = Counter(y)
print('Before', counter)

# Apply ADASYN for balancing
adasyn = ADASYN(random_state=42)  # Create an ADASYN instance
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X_vectorized, y)  # Use ADASYN for resampling

# Display distribution after ADASYN
counter = Counter(y_resampled_adasyn)
print('After', counter)


Before Counter({0: 22500, 1: 11900})
After Counter({0: 22500, 1: 20802})


In [4]:

# Model
optimized_model = Sequential()

optimized_model.add(Dense(128, input_shape=(5000,), activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001)
optimized_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

optimized_model.summary()
class_weights = {0: 1., 1: 2.}
optimized_model.fit(X_resampled_adasyn, y_resampled_adasyn, epochs=10, verbose=2, validation_split=0.1, batch_size=32,class_weight=class_weights)
#optimized_model.evaluate(X_test, Y_test)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               640128    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 656769 (2.51 MB)
Trainable params: 656769 (2.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
1218/1218

<keras.src.callbacks.History at 0x7949f064f7f0>

In [10]:
# 3. Predict on the new test data
y_pred = optimized_model.predict(X_pred)
y_class_pred = (y_pred > 0.375).astype(int)
list(y_class_pred).count(1)



499

In [11]:
test['class'] = y_class_pred
test[['id', 'class']].to_csv('smote_mlp5.csv', index= False) #83.2% on kaggle

In [12]:
test[['id', 'class']].to_csv('smote_mlp5.csv', index= False) #83.2% on kaggle

In [None]:
test[['id', 'class']].to_csv('smote_mlp6.csv', index= False) #85% on kaggle

## data augmentation
suggested to be used before imbalance handling (if we are using ADASYN)

In [4]:
import random

def random_insertion(sequence, n):
    for _ in range(n):
        add_random_token(sequence)
    return sequence

def random_deletion(sequence, p):
    if len(sequence) == 1:
        return sequence
    remaining = list(filter(lambda x: random.uniform(0,1) > p, sequence))
    if len(remaining) == 0:
        return [random.choice(sequence)]
    return remaining

def random_swap(sequence, n):
    length = len(sequence)
    for _ in range(n):
        idx1, idx2 = random.randint(0, length-1), random.randint(0, length-1)
        sequence[idx1], sequence[idx2] = sequence[idx2], sequence[idx1]
    return sequence

def add_random_token(sequence):
    if len(sequence) == 0 or max(sequence) == 0:
        # Either add a default token or skip this sequence.
        sequence.append(1)
        return
    position = random.randint(0, len(sequence)-1)
    random_token = random.randint(1, max(sequence))
    sequence.insert(position, random_token)


In [6]:
from sklearn.utils import shuffle
from imblearn.over_sampling import ADASYN
from collections import Counter
batch_size = 5000  # you can adjust this based on your memory capacity

# Shuffle data
df3 = shuffle(df3)

num_batches = int(len(df3) / batch_size) + (1 if len(df3) % batch_size != 0 else 0)

for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = min((batch + 1) * batch_size, len(df3))

    batch_data = df3.iloc[start_idx:end_idx]

    augmented_data = []
    augmented_labels = []

    for text, label in zip(batch_data['text'].values, batch_data['label'].values):
        # Original data
        augmented_data.append(text)
        augmented_labels.append(label)

        # Augment data
        augmented_sequence = random_insertion(list(text), 1)
        augmented_sequence = random_deletion(augmented_sequence, 0.1)
        augmented_sequence = random_swap(augmented_sequence, 2)

        augmented_data.append(augmented_sequence)
        augmented_labels.append(label)

    augmented_data = np.array(augmented_data, dtype=object)
    augmented_labels = np.array(augmented_labels)

    # Convert augmented data to bag-of-words representation
    X_vectorized = sequences_to_bow(augmented_data, vocab_size)
    y = np.array(augmented_labels)

    # Display distribution before ADASYN
    counter = Counter(y)
    print(f'Batch {batch + 1} - Before', counter)

    # Apply ADASYN for balancing
    adasyn = ADASYN(random_state=42)
    X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X_vectorized, y)

    # Display distribution after ADASYN
    counter = Counter(y_resampled_adasyn)
    print(f'Batch {batch + 1} - After', counter)


Batch 1 - Before Counter({0: 6450, 1: 3550})
Batch 1 - After Counter({1: 6824, 0: 6450})
Batch 2 - Before Counter({0: 6464, 1: 3536})
Batch 2 - After Counter({1: 6493, 0: 6464})
Batch 3 - Before Counter({0: 6628, 1: 3372})
Batch 3 - After Counter({1: 6780, 0: 6628})
Batch 4 - Before Counter({0: 6574, 1: 3426})
Batch 4 - After Counter({1: 6670, 0: 6574})
Batch 5 - Before Counter({0: 6520, 1: 3480})
Batch 5 - After Counter({1: 6686, 0: 6520})
Batch 6 - Before Counter({0: 6616, 1: 3384})
Batch 6 - After Counter({1: 6858, 0: 6616})
Batch 7 - Before Counter({0: 5748, 1: 3052})
Batch 7 - After Counter({1: 6138, 0: 5748})


In [5]:
from sklearn.utils import shuffle
from imblearn.over_sampling import ADASYN
from collections import Counter
batch_size = 5000  # you can adjust this based on your memory capacity

# Shuffle data
df3 = shuffle(df3)

num_batches = int(len(df3) / batch_size) + (1 if len(df3) % batch_size != 0 else 0)

all_X_resampled = []
all_y_resampled = []

for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = min((batch + 1) * batch_size, len(df3))

    batch_data = df3.iloc[start_idx:end_idx]

    augmented_data = []
    augmented_labels = []

    for text, label in zip(batch_data['text'].values, batch_data['label'].values):
        # Original data
        augmented_data.append(text)
        augmented_labels.append(label)

        # Augment data
        augmented_sequence = random_insertion(list(text), 1)
        augmented_sequence = random_deletion(augmented_sequence, 0.1)
        augmented_sequence = random_swap(augmented_sequence, 2)

        augmented_data.append(augmented_sequence)
        augmented_labels.append(label)

    augmented_data = np.array(augmented_data, dtype=object)
    augmented_labels = np.array(augmented_labels)

    # Convert augmented data to bag-of-words representation
    X_vectorized = sequences_to_bow(augmented_data, vocab_size)
    y = np.array(augmented_labels)

    # Apply ADASYN for balancing
    adasyn = ADASYN(random_state=42)
    X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X_vectorized, y)

    all_X_resampled.append(X_resampled_adasyn)
    all_y_resampled.append(y_resampled_adasyn)

# Combine data from all batches
all_X_resampled = np.vstack(all_X_resampled)
all_y_resampled = np.hstack(all_y_resampled)

# Display distribution of combined data
counter = Counter(all_y_resampled)
print('Overall After Resampling', counter)


Overall After Resampling Counter({1: 45690, 0: 45000})


In [8]:
list(all_y_resampled).count(1)

44613

In [6]:
np.save('all_X_resampled.npy', all_X_resampled)
np.save('all_y_resampled.npy', all_y_resampled)


In [3]:
all_X_resampled = np.load('all_X_resampled.npy')
all_y_resampled = np.load('all_y_resampled.npy')



In [4]:
# Model
optimized_model = Sequential()

optimized_model.add(Dense(128, input_shape=(5000,), activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
optimized_model.add(Dropout(0.3))

optimized_model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001)
optimized_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

optimized_model.summary()
class_weights = {0: 1., 1: 2.}
optimized_model.fit(all_X_resampled, all_y_resampled, epochs=10, verbose=2, validation_split=0.1, batch_size=32,class_weight=class_weights)
#optimized_model.evaluate(X_test, Y_test)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               640128    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 656769 (2.51 MB)
Trainable params: 656769 (2.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
2551/2551

<keras.src.callbacks.History at 0x7fcaa1c95f60>

In [8]:
X_pred = sequences_to_bow(test['text'], 5000)

y_pred = optimized_model.predict(X_pred)




In [22]:

y_class_pred = (y_pred > 0.3).astype(int)
list(y_class_pred).count(1)

450

In [23]:
test['class'] = y_class_pred
test[['id', 'class']].to_csv('smote_mlp8.csv', index= False) #