In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [3]:
import nltk
nltk.download('punkt')
import pandas as pd
import numpy as np
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Embedding, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#load dataset
url = "https://raw.githubusercontent.com/ironhack-labs/project-nlp-challenge/refs/heads/main/dataset/data.csv"
df = pd.read_csv(url)

#drop unnecessary columns
df = df.drop(columns=['date','subject','title'],axis=1)

#fill missing values in the text column
df['text'] = df['text'].fillna("")


# Load Pre-trained Word2Vec Model



In [5]:
#load pre-trained Word2Vec model (Google News 300)
w2v_model = api.load("word2vec-google-news-300")
embedding_dim = 300  #word vector size




In [6]:
#function to convert text into a list of word vectors
def text_to_sequence(text, model):
    words = word_tokenize(text.lower()) #Tokenize text into words
    return [model[word] for word in words if word in model]  #extract available word vectors

In [8]:
#convert text into lists of word vectors
nltk.download('punkt_tab')
df['vectors'] = df['text'].apply(lambda x: text_to_sequence(str(x), w2v_model))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Split Data into Training and Testing Sets

In [9]:
#set a fixed sentence length (Padding/Truncating)
max_length = 30

#prepare input data (X)
X = pad_sequences(df['vectors'], maxlen=max_length, dtype='float32', padding='post', truncating='post', value=0.0)

#prepare labels (y)
y = df['label'].values

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Build and Compile CNN Model


In [10]:
model = Sequential([
    Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(max_length, embedding_dim)),
    Dropout(0.5),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),  # إضافة L2 Regularization
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#add early stop
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Training



In [11]:
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stop])

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 61ms/step - accuracy: 0.8868 - loss: 0.5632 - val_accuracy: 0.9964 - val_loss: 0.1202
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 56ms/step - accuracy: 0.9963 - loss: 0.0802 - val_accuracy: 0.9965 - val_loss: 0.0535
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 56ms/step - accuracy: 0.9977 - loss: 0.0325 - val_accuracy: 0.9974 - val_loss: 0.0256
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 55ms/step - accuracy: 0.9981 - loss: 0.0226 - val_accuracy: 0.9975 - val_loss: 0.0204
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 55ms/step - accuracy: 0.9974 - loss: 0.0205 - val_accuracy: 0.9975 - val_loss: 0.0192
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 56ms/step - accuracy: 0.9984 - loss: 0.0165 - val_accuracy: 0.9977 - val_loss: 0.0185
Epoch 7/10
[1m2

<keras.src.callbacks.history.History at 0x7de46e865710>

# Evaluation


In [12]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Improved CNN Model Accuracy: {accuracy:.4f}")


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9981 - loss: 0.0123
Improved CNN Model Accuracy: 0.9975


#save the model



In [17]:
model.save("cnn_model.h5")




#Generate Predictions and Save to CSV


In [16]:
import pandas as pd

y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)  #convert labels into 0 and 1

predictions_df = pd.DataFrame({'predicted_label': y_pred.flatten()})

predictions_df.to_csv('cnn_predictions.csv', index=False)

print("predictions.csv is saved successfuly")


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
predictions.csv is saved successfuly
