In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import nltk


In [2]:
# Load data from CSV file
data = pd.read_csv('mental_health.csv')

# Show a preview of the frame. pandas truncates a large frame when it is
# printed, so this displays the first and last rows plus the overall shape.
print(data)

# Drop rows whose text or label is missing. Without this, astype(str) below
# would turn a missing text into the literal string 'nan', which the
# vectorizer would then learn as if it were a real word.
data_clean = data.dropna(subset=['text', 'label'])

# Separate features (X) and target (y)
X = data_clean['text'].astype(str)
y = data_clean['label']

# Split the data into training and testing sets (80% / 20%); the fixed
# random_state makes the split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


                                                    text  label
0      dear american teens question dutch person hear...      0
1      nothing look forward lifei dont many reasons k...      1
2      music recommendations im looking expand playli...      0
3      im done trying feel betterthe reason im still ...      1
4      worried  year old girl subject domestic physic...      1
...                                                  ...    ...
27972  posting everyday people stop caring  religion ...      0
27973  okay definetly need hear guys opinion ive pret...      0
27974  cant get dog think ill kill myselfthe last thi...      1
27975  whats point princess bridei really think like ...      1
27976  got nudes person might might know snapchat do ...      0

[27977 rows x 2 columns]


In [3]:

# Build the TF-IDF encoder, keeping at most 5000 terms (tune as needed)
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the encoder on the training texts only and encode them ...
X_train = vectorizer.fit_transform(X_train)

# ... then encode the test texts with that same fitted encoder
X_test = vectorizer.transform(X_test)

# Number of columns in the encoded matrix; used as the network's input size
input_dim = X_train.shape[-1]




In [4]:
# Feed-forward binary classifier: two ReLU hidden layers (128 and 64 units),
# dropout after the first one, and a single sigmoid output unit
model = Sequential([
    Dense(128, input_dim=input_dim, activation='relu'),
    Dropout(0.2),  # regularization: drop 20% of the first layer's activations
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),  # one probability per input row
])

# Adam optimizer with binary cross-entropy loss; track accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the layer shapes and parameter counts
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               640128    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 648,449
Trainable params: 648,449
Non-trainable params: 0
_________________________________________________________________


In [5]:
import numpy as np

# Convert the TF-IDF sparse matrices to dense float32 arrays.
# - The hasattr guard keeps this cell re-runnable: after the first run
#   X_train / X_test are already dense ndarrays, which have no .toarray().
# - Casting while the matrix is still sparse avoids building a float64 dense
#   copy first, roughly halving the memory needed for the dense arrays.
if hasattr(X_train, 'toarray'):
    X_train = X_train.astype('float32').toarray()
if hasattr(X_test, 'toarray'):
    X_test = X_test.astype('float32').toarray()

# Train the model.
# NOTE: the test split is also passed as validation data, so the per-epoch
# validation metrics and the final evaluation below use the same rows.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy on test set: {accuracy}')



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy on test set: 0.9108291864395142


In [6]:
import joblib
# Write the fitted TF-IDF vectorizer to disk with joblib
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')
print("Vectorizer saved successfully.")

Vectorizer saved successfully.


In [7]:
# Fetch the NLTK resources used by preprocess_text: the stop-word lists and
# the WordNet data behind the lemmatizer
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Filled on first use by _preprocess_resources(); re-running this cell resets it.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase the text; delete every character that is not
    an ASCII letter or whitespace (digits and punctuation are removed, not
    replaced by a space); split on whitespace; drop English stop words; pass
    each remaining token through WordNetLemmatizer.lemmatize.

    Args:
        text: The input string.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when no token survives.
    """
    # Reuse the stop-word set and lemmatizer across calls instead of
    # rebuilding them every time the function runs
    stop_words, lemmatizer = _preprocess_resources()

    # Convert text to lowercase
    text = text.lower()

    # Remove everything except letters and whitespace (this strips digits too)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Split on whitespace, drop stop words, lemmatise what is left
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    # Join tokens back into a single string
    return ' '.join(tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jessm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jessm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Destination file for the trained model
model_path = 'mlp_model.h5'

# Write the model to that file
model.save(model_path)

print("Saved model to disk at {}".format(model_path))




Saved model to disk at mlp_model.h5


In [13]:
import tensorflow as tf
from tensorflow.keras.models import load_model


# 2. Define a prediction function
def predict_depression(input_text, model_path, threshold=0.5, model=None):
    """Score one piece of text with the trained model and print the verdict.

    The text is cleaned with preprocess_text, encoded with the notebook-level
    fitted ``vectorizer``, and passed to the model's ``predict`` method.

    Args:
        input_text: Raw text to classify.
        model_path: Path of the saved Keras model. Only read when ``model``
            is None.
        threshold: Probability at or above which the text is reported as
            indicating depression. Defaults to 0.5.
        model: Optional already-loaded model. Pass one when calling this
            function repeatedly so the model file is not re-read each time.

    Returns:
        The first row of the model's prediction output: a one-element array
        holding the predicted probability.
    """
    # Preprocess the input text and encode it as a dense TF-IDF row.
    # NOTE(review): in this notebook the training texts were vectorized
    # without going through preprocess_text -- confirm the CSV was cleaned
    # the same way, otherwise inference features differ from training ones.
    preprocessed_input = preprocess_text(input_text)
    input_vectorized = vectorizer.transform([preprocessed_input]).toarray()

    # Load the saved model only when the caller did not supply one
    loaded_model = load_model(model_path) if model is None else model

    # Perform prediction
    prediction_prob = loaded_model.predict(input_vectorized)

    # Compare a plain float rather than relying on the truth value of a
    # one-element array
    probability = float(prediction_prob[0][0])
    if probability >= threshold:
        print("The input text indicates depression.")
    else:
        print("The input text does not indicate depression.")

    return prediction_prob[0]

# 3. Read a text from the user, score it, and show the returned probability
model_path = 'mlp_model.h5'
user_input = input("Enter a text: ")
prediction_prob = predict_depression(input_text=user_input, model_path=model_path)
print("Prediction probability:", prediction_prob)


The input text indicates depression.
Prediction probability: [0.99999964]
