In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset
df = pd.read_csv('../improved_dataset.csv')
df_test=pd.read_csv('../dreaddit-test.csv')

In [3]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Initialize Porter Stemmer
stemmer = PorterStemmer()

# Load English stopwords
english_stopwords = set(stopwords.words("english"))

def preprocess_text(text):
    # Convert text to lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove digits and single characters
    text = re.sub(r'\b\w\b|\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text and remove stopwords
    words = [word for word in text.split() if word not in english_stopwords]
    
    # Stem the words
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    processed_text = ' '.join(stemmed_words)
    
    return processed_text

# Apply the preprocess_text function to the "text" column of your DataFrame
df["text"] = df["text"].apply(preprocess_text)


In [5]:
# Split the dataset into features and labels
X = df['text']
y = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Train Word2Vec model on your tokenized text data
word2vec_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

# Convert text to Word2Vec embeddings and pad sequences
max_length = 100  # Set the maximum length of sequences
X_train_word2vec_padded = pad_sequences(word2vec_model.wv[X_train], maxlen=max_length, padding='post')
X_test_word2vec_padded = pad_sequences(word2vec_model.wv[X_test], maxlen=max_length, padding='post')

KeyError: "Key 'everi minor detail feel overwhelm feel suffoc panicki time imagin other gone someth similar want ask experi grad school applic surviv preserv mental health best could thank much' not present"

In [12]:
# Vocabulary size is the number of features in Word2Vec representation
vocab_size = word2vec_model.wv.vectors.shape[0]
embedding_dim = word2vec_model.wv.vectors.shape[1]
max_length = max_length  # Assuming you've defined this earlier
num_filters = 64
kernel_size = 3
lstm_units = 128
dropout_rate = 0.5

# Define CNN-BiLSTM model architecture
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[word2vec_model.wv.vectors], input_length=max_length, trainable=False))
model.add(Conv1D(num_filters, kernel_size, activation='relu'))
model.add(MaxPooling1D())
model.add(Bidirectional(LSTM(lstm_units)))
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

ValueError: Unrecognized keyword arguments passed to Embedding: {'weights': [array([[-0.02517058, -0.08561629,  0.29184958, ..., -0.143632  ,
         0.14459042,  0.22142088],
       [-0.35471848,  0.01280394,  0.02710865, ..., -0.02166992,
         0.01136221,  0.30584252],
       [ 0.14902855,  0.07591613, -0.15477288, ...,  0.08476508,
        -0.02332102,  0.05000773],
       ...,
       [-0.02133056,  0.04920521,  0.03786643, ..., -0.0204885 ,
        -0.00547265,  0.06712766],
       [-0.01782992,  0.10218916,  0.06875921, ..., -0.01665184,
        -0.01513472,  0.07899512],
       [ 0.00054663,  0.00130581,  0.01583871, ...,  0.0003749 ,
        -0.00327075,  0.00218709]], dtype=float32)]}

In [20]:
model.summary()

In [21]:
vocab_size

6994

In [22]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1025s[0m 21s/step - accuracy: 0.5318 - loss: 0.6920 - val_accuracy: 0.5118 - val_loss: 0.6933
Epoch 2/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1165s[0m 24s/step - accuracy: 0.5207 - loss: 0.6925 - val_accuracy: 0.5118 - val_loss: 0.6952
Epoch 3/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1221s[0m 26s/step - accuracy: 0.5294 - loss: 0.6918 - val_accuracy: 0.5118 - val_loss: 0.6931
Epoch 4/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1319s[0m 28s/step - accuracy: 0.5221 - loss: 0.6926 - val_accuracy: 0.5118 - val_loss: 0.6942
Epoch 5/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1589s[0m 33s/step - accuracy: 0.5323 - loss: 0.6921 - val_accuracy: 0.5118 - val_loss: 0.6938
Epoch 6/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1690s[0m 35s/step - accuracy: 0.5290 - loss: 0.6916 - val_accuracy: 0.5118 - val_loss: 0.6934
Epoch 7/10
[1m48/48[

<keras.src.callbacks.history.History at 0x1b8166e3940>

In [23]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy:', accuracy)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2s/step - accuracy: 0.5256 - loss: 0.6919
Test Accuracy: 0.5117800831794739


In [19]:
# Preprocess the input text
def preprocess_input(text):
    # Tokenize the text
    text_seq = tokenizer.texts_to_sequences([text])
    # Pad sequences to ensure uniform length
    text_padded = pad_sequences(text_seq, maxlen=max_length)
    return text_padded

# Function to make predictions
def predict_sentiment(text):
    # Preprocess the input
    text_padded = preprocess_input(text)
    # Make predictions
    prediction = model.predict(text_padded)
    return prediction[0][0]

# Example usage
text = "I feel more sad"
prediction = predict_sentiment(text)
def classify_stress(prediction, threshold=0.5):
    if prediction >= threshold:
        return str(f"Stressful+{prediction}")
    else:
        return str(f"Not Stressful+{prediction}")

# Example usage
stress_classification = classify_stress(prediction)
print("Classification:", stress_classification)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Classification: Stressful+0.8774896264076233
