In [1]:
import numpy as np
import pandas as pd

In [3]:
df=pd.read_csv('improved_dataset.csv')
df=df[['text','label']]

In [4]:
df.shape

(3816, 2)

In [5]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [6]:
df.duplicated().sum()

941

In [7]:
df=df.drop_duplicates()

In [8]:
df.shape

(2875, 2)

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upend\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Initialize Porter Stemmer
stemmer = PorterStemmer()

# Load English stopwords
english_stopwords = set(stopwords.words("english"))

def preprocess_text(text):
    # Convert text to lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove digits and single characters
    text = re.sub(r'\b\w\b|\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text and remove stopwords
    words = [word for word in text.split() if word not in english_stopwords]
    
    # Stem the words
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    processed_text = ' '.join(stemmed_words)
    
    return processed_text

# Apply the preprocess_text function to the "text" column of your DataFrame
df["text"] = df["text"].apply(preprocess_text)


In [11]:
test_df=pd.read_csv('dreaddit-test.csv')
test_df=test_df[['text','label']]
test_df['text']=test_df['text'].apply(preprocess_text)

In [17]:
X_train=df['text']
X_test=test_df['text']
y_train=df['label']
y_test=test_df['label']

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Convert the Pandas Series X_train and X_test to TF-IDF sparse matrices
X_train_sparse = vectorizer.fit_transform(X_train)
X_test_sparse = vectorizer.transform(X_test)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

vocab_size = len(vectorizer.vocabulary_)  # Get the vocabulary size directly from the vectorizer
embedding_dim = 100
max_length = X_train_sparse.shape[1]  # Use the length of TF-IDF vectors as max_length

# Define the model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(LSTM(128))
model.add(Dropout(0.5))  # Adding dropout with a rate of 0.5 (adjust as needed)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [22]:
model.summary()

In [24]:
X_train = np.array(X_train_sparse.toarray())
X_test = np.array(X_test_sparse.toarray())

model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1129s[0m 25s/step - accuracy: 0.5142 - loss: 0.6943 - val_accuracy: 0.5161 - val_loss: 0.6927
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1342s[0m 30s/step - accuracy: 0.5352 - loss: 0.6915 - val_accuracy: 0.4839 - val_loss: 0.6940
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1533s[0m 34s/step - accuracy: 0.5090 - loss: 0.6926 - val_accuracy: 0.5161 - val_loss: 0.6927
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1678s[0m 37s/step - accuracy: 0.5159 - loss: 0.6928 - val_accuracy: 0.5161 - val_loss: 0.6927
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1685s[0m 38s/step - accuracy: 0.5142 - loss: 0.6932 - val_accuracy: 0.5161 - val_loss: 0.6927
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1770s[0m 39s/step - accuracy: 0.5081 - loss: 0.6935 - val_accuracy: 0.5161 - val_loss: 0.6928
Epoch 7/10
[1m45/45[

<keras.src.callbacks.history.History at 0x208b251fc40>

In [25]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy:', accuracy)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 4s/step - accuracy: 0.5128 - loss: 0.6937
Test Accuracy: 0.5160838961601257
