In [1]:
import matplotlib.pyplot as plt
from PIL import Image
import os
import cv2
from collections import Counter
import pandas as pd
import numpy as np
import math

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/Colab\ Notebooks/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle competitions download -c nlp-getting-started
!unzip nlp-getting-started.zip -d /content/nlp-getting-started

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 124MB/s]
Archive:  nlp-getting-started.zip
  inflating: /content/nlp-getting-started/sample_submission.csv  
  inflating: /content/nlp-getting-started/test.csv  
  inflating: /content/nlp-getting-started/train.csv  


In [5]:
base_dir = '/content/nlp-getting-started/'

train_df = pd.read_csv(base_dir + 'train.csv')
test_df = pd.read_csv(base_dir + 'test.csv')
sample_submission_df = pd.read_csv(base_dir + 'sample_submission.csv')

In [13]:
class_0 = train_df[train_df['target'] == 0].sample(n=3271, random_state=42)
class_1 = train_df[train_df['target'] == 1]
balanced_df = pd.concat([class_0, class_1])

In [15]:
train_df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(balanced_df['text'])
sequences = tokenizer.texts_to_sequences(balanced_df['text'])
padded_sequences = pad_sequences(sequences, padding='post', truncating='post', maxlen=200)


labels = balanced_df['target'].values


X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


inputs = tf.keras.Input(shape=(200,))

x = tf.keras.layers.Embedding(input_dim=10000, output_dim=256, input_length=200)(inputs)
x = tf.keras.layers.SpatialDropout1D(0.3)(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
x = tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
attention = tf.keras.layers.Attention()([x, x])
x = tf.keras.layers.GlobalAveragePooling1D()(attention)

# Dense layers with Batch Normalization and Dropout
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.5)(x)

x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.5)(x)


outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)

history = model.fit(X_train, y_train, epochs=50, batch_size=64,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])


loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

test_sequences = tokenizer.texts_to_sequences(test_df['text'])
padded_test_sequences = pad_sequences(test_sequences, padding='post', truncating='post', maxlen=200)

predictions = model.predict(padded_test_sequences)
predicted_labels = (predictions > 0.5).astype(int)

Epoch 1/50




[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 481ms/step - accuracy: 0.4993 - loss: 0.9855 - val_accuracy: 0.4859 - val_loss: 0.7246 - learning_rate: 0.0010
Epoch 2/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 471ms/step - accuracy: 0.5215 - loss: 0.8342 - val_accuracy: 0.4859 - val_loss: 0.7263 - learning_rate: 0.0010
Epoch 3/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 471ms/step - accuracy: 0.4936 - loss: 0.7867 - val_accuracy: 0.4859 - val_loss: 0.7325 - learning_rate: 0.0010
Epoch 4/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 467ms/step - accuracy: 0.5015 - loss: 0.7624 - val_accuracy: 0.4859 - val_loss: 0.7147 - learning_rate: 0.0010
Epoch 5/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 483ms/step - accuracy: 0.6351 - loss: 0.6349 - val_accuracy: 0.4859 - val_loss: 1.4345 - learning_rate: 0.0010
Epoch 6/50
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 

In [18]:
test_ids = test_df['id']
test_texts = test_df['text']

test_sequences = tokenizer.texts_to_sequences(test_texts)
padded_test_sequences = pad_sequences(test_sequences, padding='post', truncating='post', maxlen=200)

predictions = model.predict(padded_test_sequences)
predicted_labels = (predictions > 0.5).astype(int)
result_df = pd.DataFrame({'id': test_ids, 'target': predicted_labels.flatten()})

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 104ms/step
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1


In [20]:
result_df.to_csv("submission.csv", index=False)