In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten
import tensorflow as tf
import pickle

# Load the dataset
df = pd.read_csv('merged_output.csv')

# Load positive and depressed words
with open('positive-words.txt', 'r') as file:
    positive_words = set(file.read().splitlines())

with open('depressedword.txt', 'r') as file:
    depressed_words = set(file.read().splitlines())

# Function to label the sentiment
def label_sentiment(text):
    words = set(text.lower().split())
    positive_count = len(words.intersection(positive_words))
    depressed_count = len(words.intersection(depressed_words))
    
    if depressed_count > positive_count:
        return 2  # Depressed
    elif positive_count > 0:
        return 1  # Positive
    else:
        return 0  # Neutral/Negative

# Create labels based on word counts
df['label'] = df['content'].apply(label_sentiment)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['label'], test_size=0.2, random_state=42)

# Tokenize the text
max_words = 670000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Convert to categorical
y_train_cat = tf.keras.utils.to_categorical(y_train_encoded, num_classes=3)
y_test_cat = tf.keras.utils.to_categorical(y_test_encoded, num_classes=3)

# Define the Simple Neural Network model
embedding_dim = 200

model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    Flatten(),  # Flatten the output from the embedding layer
    Dense(128, activation='relu'),  # First dense layer
    Dropout(0.5),  # Dropout layer to prevent overfitting
    Dense(64, activation='relu'),   # Second dense layer
    Dropout(0.5),  # Another dropout layer
    Dense(3, activation='softmax')  # Output layer for 3 classes
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Train the model
model.fit(X_train_pad, y_train_cat, epochs=5, batch_size=32, validation_split=0.2)





None
Epoch 1/4




[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 242ms/step - accuracy: 0.5816 - loss: 0.9290 - val_accuracy: 0.8132 - val_loss: 0.4955
Epoch 2/4
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 259ms/step - accuracy: 0.8149 - loss: 0.5173 - val_accuracy: 0.8461 - val_loss: 0.3866
Epoch 3/4
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 270ms/step - accuracy: 0.8754 - loss: 0.3197 - val_accuracy: 0.8710 - val_loss: 0.3487
Epoch 4/4
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 270ms/step - accuracy: 0.9013 - loss: 0.2431 - val_accuracy: 0.8615 - val_loss: 0.4056


<keras.src.callbacks.history.History at 0x302ff95b0>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten
import tensorflow as tf
import pickle

# Load the dataset
df = pd.read_csv('merged_output.csv')

# Load positive and depressed words
with open('positive-words.txt', 'r') as file:
    positive_words = set(file.read().splitlines())

with open('depressedword.txt', 'r') as file:
    depressed_words = set(file.read().splitlines())

# Function to label the sentiment
def label_sentiment(text):
    words = set(text.lower().split())
    positive_count = len(words.intersection(positive_words))
    depressed_count = len(words.intersection(depressed_words))
    
    # Debugging output
    print(f"Text: {text}, Positive Count: {positive_count}, Depressed Count: {depressed_count}")
    
    if depressed_count > positive_count:
        return 2  # Depressed
    elif positive_count > 0:
        return 1  # Positive
    else:
        return 0  # Neutral/Negative

# Create labels based on word counts
df['label'] = df['content'].apply(label_sentiment)

# Check class distribution
print("\nLabel Distribution:")
print(df['label'].value_counts())

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['label'], test_size=0.2, random_state=42)

# Tokenize the text
max_words = 670000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Convert to categorical
y_train_cat = tf.keras.utils.to_categorical(y_train_encoded, num_classes=3)
y_test_cat = tf.keras.utils.to_categorical(y_test_encoded, num_classes=3)









Text: thank you, Positive Count: 1, Depressed Count: 0
Text: mmm k, Positive Count: 0, Depressed Count: 0
Text: i'm doing good thank you, Positive Count: 2, Depressed Count: 0
Text: i'm from los angeles, Positive Count: 0, Depressed Count: 0
Text: oh great, Positive Count: 1, Depressed Count: 0
Text: i live in west los angeles the west side, Positive Count: 0, Depressed Count: 0
Text: it's alright, Positive Count: 0, Depressed Count: 0
Text: i xxx, Positive Count: 0, Depressed Count: 0
Text: no i live alone so, Positive Count: 0, Depressed Count: 1
Text: i love it i'm from here so i grew up here it's natural, Positive Count: 1, Depressed Count: 0
Text: the weather um well the weather, Positive Count: 1, Depressed Count: 0
Text: it's always good it's never it's never bad uh um there's always something to do it's rarely a dull moment, Positive Count: 1, Depressed Count: 2
Text: the traffic, Positive Count: 0, Depressed Count: 0
Text: the traffic is horrible well probably traffic is horri

In [3]:
# Define the Simple Neural Network model
embedding_dim = 200

model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    Flatten(),  # Flatten the output from the embedding layer
    Dense(128, activation='relu'),  # First dense layer
    Dropout(0.5),  # Dropout layer to prevent overfitting
    Dense(64, activation='relu'),   # Second dense layer
    Dropout(0.5),  # Another dropout layer
    Dense(3, activation='softmax')  # Output layer for 3 classes
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())



None


In [5]:
# Train the model
model.fit(X_train_pad, y_train_cat, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5




[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 263ms/step - accuracy: 0.6024 - loss: 0.9092 - val_accuracy: 0.8103 - val_loss: 0.5012
Epoch 2/5
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 265ms/step - accuracy: 0.8357 - loss: 0.4626 - val_accuracy: 0.8534 - val_loss: 0.3747
Epoch 3/5
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 263ms/step - accuracy: 0.8760 - loss: 0.3115 - val_accuracy: 0.8651 - val_loss: 0.3456
Epoch 4/5
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 283ms/step - accuracy: 0.9037 - loss: 0.2395 - val_accuracy: 0.8187 - val_loss: 0.4851
Epoch 5/5
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 279ms/step - accuracy: 0.9129 - loss: 0.2234 - val_accuracy: 0.8882 - val_loss: 0.4306


<keras.src.callbacks.history.History at 0x17c0402f0>

In [6]:
loss, accuracy = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print(f"Test accuracy: {accuracy:.2f}")

Test accuracy: 0.88


In [7]:
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded)
    predicted_class = np.argmax(prediction)
    confidence = prediction[0][predicted_class]
    
    if predicted_class == 2:
        return f"The user might be depressed. Confidence: {confidence:.2f}"
    elif predicted_class == 1:
        return f"The user is in a positive mood. Confidence: {confidence:.2f}"
    else:
        return f"The user is in a neutral/negative mood. Confidence: {confidence:.2f}"

# Test the model with some example conversations
examples = [
    "I feel so hopeless and sad all the time.",
    "I'm excited about my new job and looking forward to the future.",
    "I don't know if life is worth living anymore.",
    "I had a great day with my friends and family.",
    "This weather is terrible, and I'm having a bad day.",
    "I want to kill myself",
    "I'm feeling a bit down today, but I'm sure things will get better.",
     "I feel like nothing is going right for me today.",
"I just can't seem to focus on anything right now.",
"I'm really struggling to keep up, and it's frustrating.",
"Today feels like a complete waste; I can’t get anything done.",
"I'm overwhelmed, and it's dragging my mood down.",
"I feel stuck in a rut, and it’s wearing me out.",
"Everything I touch seems to go wrong today.",
"I'm feeling really down and unmotivated.",
"I just don't have the energy for anything today.",
"It's one of those days where nothing feels worth the effort."
]

for example in examples:
    print(f"Text: {example}")
    print(f"Prediction: {predict_sentiment(example)}\n")

# Save the model
model.save('sentiment_cnn_model.h5')

# Save the tokenizer
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved successfully.")

# To load the model and tokenizer later, you can use:
# loaded_model = tf.keras.models.load_model('sentiment_cnn_model.h5')
# with open('tokenizer.pickle', 'rb') as handle:
#     loaded_tokenizer = pickle.load(handle)

Text: I feel so hopeless and sad all the time.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Prediction: The user is in a neutral/negative mood. Confidence: 0.93

Text: I'm excited about my new job and looking forward to the future.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Prediction: The user is in a neutral/negative mood. Confidence: 1.00

Text: I don't know if life is worth living anymore.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Prediction: The user might be depressed. Confidence: 0.57

Text: I had a great day with my friends and family.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Prediction: The user is in a positive mood. Confidence: 0.95

Text: This weather is terrible, and I'm having a bad day.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Prediction: The user is in a positive mood. Confidence: 0.82

Text: I want to kill myself
[1m1/1[0m [



Prediction: The user might be depressed. Confidence: 0.52

Model and tokenizer saved successfully.


In [None]:
some more example

In [11]:
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded)
    predicted_class = np.argmax(prediction)
    confidence = prediction[0][predicted_class]
    
    if predicted_class == 2:
        return f"The user might be depressed. Confidence: {confidence:.2f}"
    elif predicted_class == 1:
        return f"The user is in a positive mood. Confidence: {confidence:.2f}"
    else:
        return f"The user is in a neutral/negative mood. Confidence: {confidence:.2f}"
examples = [
        "I feel so hopeless and sad all the time.",
        "I'm excited about my new job and looking forward to the future.",
        "I don't know if life is worth living anymore.",
        "I had a great day with my friends and family.",
        "This weather is terrible, and I'm having a bad day.",
        "I'm feeling a bit down today, but I'm sure things will get better.",
        "I'm really proud of what I accomplished today.",
        "Sometimes I wonder if I'll ever find my way.",
        "I had a productive meeting, but I still have a lot on my plate.",
        "Today was just another ordinary day.",
        "I'm thrilled about my upcoming vacation!",
        "I'm feeling overwhelmed by everything that’s happening.",
        "I'm content with where I am in life right now.",
        "I can't shake off this feeling of dread.",
        "I had a fun time at the park with my friends last weekend."
    ]

for example in examples:
    print(f"Text: {example}")
    print(f"Prediction: {predict_sentiment(example)}\n")

# Save the model
model.save('sentiment_cnn_model.h5')

# Save the tokenizer
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved successfully.")

Text: I feel so hopeless and sad all the time.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Prediction: The user is in a neutral/negative mood. Confidence: 0.93

Text: I'm excited about my new job and looking forward to the future.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Prediction: The user is in a neutral/negative mood. Confidence: 1.00

Text: I don't know if life is worth living anymore.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Prediction: The user might be depressed. Confidence: 0.57

Text: I had a great day with my friends and family.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Prediction: The user is in a positive mood. Confidence: 0.95

Text: This weather is terrible, and I'm having a bad day.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Prediction: The user is in a positive mood. Confidence: 0.82

Text: I'm feeling a bit down today, bu



Prediction: The user is in a positive mood. Confidence: 0.79

Model and tokenizer saved successfully.
