In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Preprocess the text
def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Split the text into words
    text = [word for word in text if not word in stop_words]  # Remove stopwords
    return ' '.join(text)  # Join the words back into a string

# Load the dataset
df = pd.read_csv('train.csv')  # Replace with the correct path to your dataset

# Handle missing or non-string values in 'title' column
df['title'] = df['title'].fillna('').astype(str)

# Apply preprocessing to the 'title' column
df['cleaned_title'] = df['title'].apply(preprocess_text)

# Inspect the data
print(df[['title', 'cleaned_title']].head())

# Tokenize the text
tokenizer = Tokenizer(num_words=5000, lower=True)  # Limit vocabulary size to 5000
tokenizer.fit_on_texts(df['cleaned_title'])
X = tokenizer.texts_to_sequences(df['cleaned_title'])

# Pad sequences to ensure uniform input size
max_len = 50  # Maximum sentence length
X_padded = pad_sequences(X, maxlen=max_len, padding='post')

# Prepare the labels
y = df['label'].values  # Assuming the label column is 'label'



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               title  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2                  Why the Truth Might Get You Fired   
3  15 Civilians Killed In Single US Airstrike Hav...   
4  Iranian woman jailed for fictional unpublished...   

                                       cleaned_title  
0  house dem aide even see comey letter jason cha...  
1   flynn hillary clinton big woman campus breitbart  
2                              truth might get fired  
3    civilians killed single us airstrike identified  
4  iranian woman jailed fictional unpublished sto...  


In [7]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Define model parameters
embedding_vector_features = 40  # Feature representation size

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=embedding_vector_features, input_length=max_len))  # Embedding layer
model.add(LSTM(100))  # LSTM layer
model.add(Dropout(0.3))  # Dropout to prevent overfitting
model.add(Dense(1, activation='sigmoid'))  # Output layer

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the test set
score, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Make predictions (optional)
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)  # Convert to binary values (0 or 1)

# Display some predictions
print(predictions[:10])





None
Epoch 1/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5622 - loss: 0.6499 - val_accuracy: 0.7930 - val_loss: 0.4360
Epoch 2/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7757 - loss: 0.4583 - val_accuracy: 0.5125 - val_loss: 0.6929
Epoch 3/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.5105 - loss: 0.6936 - val_accuracy: 0.4875 - val_loss: 0.6938
Epoch 4/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.5767 - loss: 0.6335 - val_accuracy: 0.8851 - val_loss: 0.3050
Epoch 5/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8963 - loss: 0.2787 - val_accuracy: 0.8538 - val_loss: 0.3572
Epoch 6/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9113 - loss: 0.2562 - val_accuracy: 0.9291 - val_loss: 0.2077
Epoch 7/10
[1m520/520