In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords

# Load the dataset
df = pd.read_csv('merged_output.csv')
# Load positive and depressed words
with open('positive-words.txt', 'r') as file:
    positive_words = set(file.read().splitlines())
with open('depressedword.txt', 'r') as file:
    depressed_words = set(file.read().splitlines())

# Function to clean the text
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# Clean the content
df['cleaned_content'] = df['content'].apply(clean_text)

# Function to label the data
def label_sentiment(text):
    pos_count = sum(1 for word in text.split() if word in positive_words)
    dep_count = sum(1 for word in text.split() if word in depressed_words)
    return 1 if dep_count > pos_count else 0  # 1 = depressed, 0 = not depressed

# Apply labeling
df['label'] = df['cleaned_content'].apply(label_sentiment)

# Split the data into features and labels
X = df['cleaned_content']
y = df['label']

# Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

# Example usage
def predict_sentiment(text):
    cleaned_text = clean_text(text)
    vectorized_text = vectorizer.transform([cleaned_text])
    prediction = model.predict(vectorized_text)
    return 'Depressed' if prediction[0] == 1 else 'Not Depressed'

# Test the model with a sample input
sample_input = "Im feeling good."
print(predict_sentiment(sample_input))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3393
           1       0.89      0.30      0.44        27

    accuracy                           0.99      3420
   macro avg       0.94      0.65      0.72      3420
weighted avg       0.99      0.99      0.99      3420

Not Depressed


In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('merged_output.csv')

# Load positive and depressed words
def load_words(filename):
    with open(filename, 'r') as file:
        return set(file.read().splitlines())

positive_words = load_words('positive-words.txt')
depressed_words = load_words('depressedword.txt')

# Function to clean and tokenize the text
def clean_and_tokenize(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return word_tokenize(text)

# Function to label the data
def label_sentiment(tokens):
    pos_count = sum(1 for word in tokens if word in positive_words)
    dep_count = sum(1 for word in tokens if word in depressed_words)
    return 1 if dep_count > pos_count else 0  # 1 = depressed, 0 = not depressed

# Clean and label the data
df['cleaned_tokens'] = df['content'].apply(clean_and_tokenize)
df['label'] = df['cleaned_tokens'].apply(label_sentiment)

# Split the data into features and labels
X = df['content']
y = df['label']

# Create a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', tokenizer=clean_and_tokenize)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

# Function to predict sentiment
def predict_sentiment(text):
    prediction = pipeline.predict([text])
    return 'Depressed' if prediction[0] == 1 else 'Not Depressed'

# Test the model with a sample input
sample_input = "I want to kill myself."
print(predict_sentiment(sample_input))

[nltk_data] Downloading package punkt to /Users/lt611-10/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lt611-10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3393
           1       1.00      0.30      0.46        27

    accuracy                           0.99      3420
   macro avg       1.00      0.65      0.73      3420
weighted avg       0.99      0.99      0.99      3420

Not Depressed


Not Depressed
