In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset CSVs
# read further down (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
# Locations of the two CSV exports on the mounted Drive.
preprocessed_csv = '/content/drive/MyDrive/MLproject/datasets/preprocessed_data.csv'
masked_csv = '/content/drive/MyDrive/MLproject/datasets/masked_dataset.csv'

# `data` holds the unmasked texts, `masked_data` the masked variant.
data = pd.read_csv(preprocessed_csv)
masked_data = pd.read_csv(masked_csv)

# Cell output: (rows, columns) of the unmasked frame.
data.shape

(7731, 2)

In [3]:
# Discard every row of the unmasked frame that has a missing value in any column.
data = data.dropna(axis=0, how='any')

Evaluation with Random Forest on the unmasked data

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Features: the unmasked post text, forced to str before vectorising.
# Labels: the is_depression column.
X = data.text.astype(str).tolist()
y = data.is_depression.tolist()

# One seed shared by the split and the forest. The forest was previously
# unseeded, so the reported metrics changed from run to run.
SEED = 42

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Bag-of-words counts feeding a random forest. Because the vectoriser is a
# pipeline step, its vocabulary is learned from the training texts only.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', RandomForestClassifier(random_state=SEED)),
])

# Fit vectoriser + classifier on the training split.
text_clf.fit(X_train, y_train)

# Predict labels for the held-out texts.
predictions = text_clf.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, predictions))


Accuracy: 0.9555843035791289
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1157
           1       0.97      0.94      0.95      1162

    accuracy                           0.96      2319
   macro avg       0.96      0.96      0.96      2319
weighted avg       0.96      0.96      0.96      2319



Evaluation with Random Forest on the masked data

In [6]:
# Drop rows that lack the masked text or the label, mirroring the dropna()
# already applied to the unmasked frame. Without this, astype(str) would turn
# a missing text into the literal token "nan" and feed it to the vectoriser.
# NOTE(review): if masked_data contains such rows, the row count (and hence
# the split) changes relative to the previous run.
masked_clean = masked_data.dropna(subset=['masked_text', 'is_depression'])

# Features: masked post text as str. Labels: the is_depression column.
X = masked_clean.masked_text.astype(str).tolist()
y = masked_clean.is_depression.tolist()

# One seed shared by the split and the forest. The forest was previously
# unseeded, so the reported metrics changed from run to run.
SEED = 42

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Bag-of-words counts feeding a random forest. Because the vectoriser is a
# pipeline step, its vocabulary is learned from the training texts only.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', RandomForestClassifier(random_state=SEED)),
])

# Fit vectoriser + classifier on the training split.
text_clf.fit(X_train, y_train)

# Predict labels for the held-out texts.
predictions = text_clf.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, predictions))

Accuracy: 0.956877964639931
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1157
           1       0.97      0.94      0.96      1162

    accuracy                           0.96      2319
   macro avg       0.96      0.96      0.96      2319
weighted avg       0.96      0.96      0.96      2319



Now use Logistic Regression for sentiment analysis on the unmasked data

In [None]:
from sklearn.linear_model import LogisticRegression

# Inputs come from the unmasked frame: post text (forced to str) as features,
# the is_depression column as labels.
X = data['text'].astype(str).tolist()
y = data['is_depression'].tolist()

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Bag-of-words counts -> logistic regression, wrapped in a single estimator.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])
text_clf.fit(X_train, y_train)

# Score the held-out 30%: overall accuracy first, then the per-class report.
predictions = text_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print('Accuracy:', accuracy)
print(classification_report(y_true=y_test, y_pred=predictions))


Accuracy: 0.963777490297542
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      1157
           1       0.99      0.94      0.96      1162

    accuracy                           0.96      2319
   macro avg       0.96      0.96      0.96      2319
weighted avg       0.96      0.96      0.96      2319



Logistic Regression on the masked data

In [7]:
from sklearn.linear_model import LogisticRegression

# Drop rows that lack the masked text or the label, mirroring the dropna()
# already applied to the unmasked frame. Without this, astype(str) would turn
# a missing text into the literal token "nan" and feed it to the vectoriser.
# NOTE(review): if masked_data contains such rows, the row count (and hence
# the split) changes relative to the previous run.
masked_clean = masked_data.dropna(subset=['masked_text', 'is_depression'])

# Features: masked post text as str. Labels: the is_depression column.
X = masked_clean.masked_text.astype(str).tolist()
y = masked_clean.is_depression.tolist()

# Hold out 30% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Bag-of-words counts feeding logistic regression. Because the vectoriser is
# a pipeline step, its vocabulary is learned from the training texts only.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit vectoriser + classifier on the training split.
text_clf.fit(X_train, y_train)

# Predict labels for the held-out texts.
predictions = text_clf.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, predictions))


Accuracy: 0.9655023717119448
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1157
           1       0.98      0.95      0.96      1162

    accuracy                           0.97      2319
   macro avg       0.97      0.97      0.97      2319
weighted avg       0.97      0.97      0.97      2319

