# Importing Libraries

In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import nltk
from nltk.stem import WordNetLemmatizer

In [3]:
import ssl

# The WordNet corpus backs WordNetLemmatizer. Only go to the network when the
# corpus is not installed yet, so re-running the notebook does not block on
# (or get interrupted during) a download it does not need.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    # SECURITY NOTE: the workaround below disables HTTPS certificate
    # verification. It is kept for machines with broken certificate stores,
    # but is now limited to the download itself and undone afterwards.
    _default_https_context = ssl._create_default_https_context
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # This Python build has no unverified-context factory; use the default.
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
    try:
        nltk.download('wordnet')
    finally:
        ssl._create_default_https_context = _default_https_context

KeyboardInterrupt: 

# Importing DataFrame

In [None]:
# Read the dataset from disk and preview its first six rows.
dataset_path = "Dataset//mental-health.csv"
mental_health = pd.read_csv(dataset_path)
mental_health.head(n=6)

# Data Visualization & Transformation

In [None]:
# Total number of cells in the frame (rows x columns).
mental_health.size

In [None]:
# (row count, column count) of the loaded frame.
mental_health.shape

In [None]:
# Sorted unique label values together with the number of rows carrying each.
label_column = mental_health["label"]
distinct_values = np.unique(label_column, return_counts=True)
distinct_values

In [None]:
# Show each label value and its row count as a small two-row table.
# NOTE(review): the column headers are hard-coded and assume exactly two
# classes whose sorted order is suicide-watch first, then depression — confirm
# against the dataset.
label_values, label_counts = np.unique(mental_health["label"], return_counts=True)
pd.DataFrame(
    (label_values, label_counts),
    columns=['Suicide Watch', 'Depression'],  # header typo ("Sucide") fixed
    index=["Value", "count"],
)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
mental_health.info()

In [None]:
# Bar chart of how many rows fall under each label.
plt.figure(figsize=(4, 2))
class_axes = sns.countplot(x="label", data=mental_health, width=0.3)
class_axes.set(xlabel="Class", ylabel="Count", title="Class Distribution")
plt.show()

# Data Splitting

In [None]:
# Model input is the text column; the target is the label column.
x, y = mental_health["text"], mental_health["label"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Model Preparation & Evaluation

In [None]:
# Lazily created WordNetLemmatizer shared by every preprocess_text call.
_lemmatizer_cache = None


def preprocess_text(text, lemmatizer=None):
    """Lower-case one document and lemmatize each of its words as a verb.

    Written to be passed as ``TfidfVectorizer(preprocessor=...)``: scikit-learn
    calls the preprocessor with a single raw document and expects a string
    back. Supplying a preprocessor replaces the vectorizer's built-in
    lower-casing step, so lower-casing is done here.

    Parameters
    ----------
    text : str
        One raw document.
    lemmatizer : object, optional
        Anything exposing ``lemmatize(word, pos=...)``. Defaults to a shared
        ``WordNetLemmatizer`` (which needs the NLTK ``wordnet`` corpus).

    Returns
    -------
    str
        The lower-cased text with every run of word characters replaced by
        its verb lemma; whitespace and punctuation are left untouched.
    """
    global _lemmatizer_cache
    if lemmatizer is None:
        if _lemmatizer_cache is None:
            _lemmatizer_cache = WordNetLemmatizer()
        lemmatizer = _lemmatizer_cache
    # Lemmatize whole words (not single characters) and hand back a string.
    return re.sub(
        r"\w+",
        lambda match: lemmatizer.lemmatize(match.group(0), pos='v'),
        text.lower(),
    )


# Shared TfidfVectorizer settings reused by the model pipelines.
analyzer='word'
preprocessor=preprocess_text
# NOTE(review): `nlp` is not defined in the visible part of the notebook and
# this tokenizer is not passed to any vectorizer here — confirm before use.
tokenizer=lambda text: [token.text for token in nlp(text)]
ngram_range = (1, 2)    # unigrams and bigrams
min_df = 3              # drop terms seen in fewer than 3 documents
max_df = 0.8            # drop terms seen in more than 80% of documents
max_features = 2000     # cap on vocabulary size
stop_words = 'english'
sublinear_tf = True     # use 1 + log(tf) instead of raw term frequency

### Logistic Regression 

In [None]:
# TF-IDF features (with the custom preprocessor) feeding a logistic-regression classifier.
lr_vectorizer = TfidfVectorizer(
    preprocessor=preprocessor,
    stop_words=stop_words,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=sublinear_tf,
)
lr_estimator = LogisticRegression(random_state=42, max_iter=1000)
lr = Pipeline(steps=[('tfidf', lr_vectorizer), ('lrc', lr_estimator)])
lr.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted logistic-regression pipeline.
y_pred = lr.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Logistic Regression Accuracy:", accuracy)
# Peek at the first 11 true labels next to the first 11 predictions.
print(y_test.iloc[:11])
print(y_pred[:11])

In [None]:
# Pair every TF-IDF feature name with its logistic-regression weight.
feature_names = lr.named_steps['tfidf'].get_feature_names_out()
lr_model = lr.named_steps['lrc']
coefficients = lr_model.coef_[0]
coefficient_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Reorder the rows by absolute weight, largest first.
ranked_index = coefficient_df['Coefficient'].abs().sort_values(ascending=False).index
coefficient_df = coefficient_df.reindex(ranked_index)

# Plot the ten highest-ranked features.
plt.figure(figsize=(8, 4))
coef_axes = sns.barplot(data=coefficient_df.head(10), x='Coefficient', y='Feature', palette='viridis')
coef_axes.set(xlabel='Coefficient', ylabel='Feature', title='Top Feature Coefficients')
plt.show()

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one sorted
# label list for both the matrix and the tick labels so they cannot disagree.
class_names = np.unique(mental_health['label'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Plot confusion matrix
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

### K-Nearest Neighbor

In [None]:
# TF-IDF features feeding a k-nearest-neighbours classifier.
knn_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=sublinear_tf,
)
# NOTE(review): n_neighbors=599 needs at least 599 training samples — confirm
# the value was tuned for this dataset.
knn_estimator = KNeighborsClassifier(n_neighbors=599)
knn = Pipeline(steps=[('tfidf', knn_vectorizer), ('knn', knn_estimator)])
knn.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted KNN pipeline.
y_pred = knn.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("KNN Accuracy:", accuracy)
# Peek at the first 11 true labels next to the first 11 predictions.
print(y_test.iloc[:11])
print(y_pred[:11])

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one sorted
# label list for both the matrix and the tick labels so they cannot disagree.
class_names = np.unique(mental_health['label'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Plot confusion matrix
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

### Naive Bayes (Multinomial)

In [None]:
# TF-IDF features feeding a multinomial naive Bayes classifier.
nb_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=sublinear_tf,
)
nb_classifier = Pipeline(steps=[('tfidf', nb_vectorizer), ('nbc', MultinomialNB())])
nb_classifier.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted naive Bayes pipeline.
y_pred = nb_classifier.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy (label typo "MultinomiaNB" corrected)
print("MultinomialNB Accuracy:", accuracy)
# Peek at the first 11 true labels next to the first 11 predictions.
print(y_test[:11])
print(y_pred[:11])

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one sorted
# label list for both the matrix and the tick labels so they cannot disagree.
class_names = np.unique(mental_health['label'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Plot confusion matrix
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

### Decision Tree

In [None]:
# TF-IDF features feeding a depth-limited decision tree.
dt_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=sublinear_tf,
)
dt_estimator = DecisionTreeClassifier(random_state=42, max_depth=10)
decision_tree_classifier = Pipeline(steps=[('tfidf', dt_vectorizer), ('dtc', dt_estimator)])
decision_tree_classifier.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted decision-tree pipeline.
y_pred = decision_tree_classifier.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Decision Tree Accuracy:", accuracy)
# Peek at the first 11 true labels next to the first 11 predictions.
print(y_test.iloc[:11])
print(y_pred[:11])

In [None]:
# Pair every TF-IDF feature name with the importance the tree assigned to it,
# ordered from most to least important.
dt_steps = decision_tree_classifier.named_steps
feature_names = dt_steps['tfidf'].get_feature_names_out()
feature_importances = dt_steps['dtc'].feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Plot the ten highest-ranked features.
plt.figure(figsize=(8, 4))
importance_axes = sns.barplot(data=feature_importance_df.head(10), x='Importance', y='Feature', palette='viridis')
importance_axes.set(xlabel='Importance', ylabel='Feature', title='Top Feature Importances')
plt.show()

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one sorted
# label list for both the matrix and the tick labels so they cannot disagree.
class_names = np.unique(mental_health['label'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Plot confusion matrix
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

### Random Forest

In [None]:
# TF-IDF features feeding a 400-tree random forest.
rf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=sublinear_tf,
)
rf_estimator = RandomForestClassifier(random_state=42, n_estimators=400)
random_forest_classifier = Pipeline(steps=[('tfidf', rf_vectorizer), ('rfc', rf_estimator)])
random_forest_classifier.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted random-forest pipeline.
y_pred = random_forest_classifier.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Random Forest Tree Accuracy:", accuracy)
# Peek at the first 11 true labels next to the first 11 predictions.
print(y_test.iloc[:11])
print(y_pred[:11])

In [None]:
# Pair every TF-IDF feature name with the importance the forest assigned to it,
# ordered from most to least important.
rf_steps = random_forest_classifier.named_steps
feature_names = rf_steps['tfidf'].get_feature_names_out()
feature_importances = rf_steps['rfc'].feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Plot the ten highest-ranked features.
plt.figure(figsize=(8, 4))
importance_axes = sns.barplot(data=feature_importance_df.head(10), x='Importance', y='Feature', palette='viridis')
importance_axes.set(xlabel='Importance', ylabel='Feature', title='Top Feature Importances')
plt.show()

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one sorted
# label list for both the matrix and the tick labels so they cannot disagree.
class_names = np.unique(mental_health['label'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Plot confusion matrix
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

### Support Vector Machine

In [None]:
# TF-IDF features feeding an RBF-kernel support vector classifier.
svm_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=sublinear_tf,
)
svm_estimator = SVC(random_state=42, kernel="rbf")
svm = Pipeline(steps=[('tfidf', svm_vectorizer), ('svmc', svm_estimator)])
svm.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted SVM pipeline.
y_pred = svm.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("SVM Accuracy:", accuracy)

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one sorted
# label list for both the matrix and the tick labels so they cannot disagree.
class_names = np.unique(mental_health['label'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Plot confusion matrix
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

### Voting Classifier

In [None]:
# Combine the six pipelines into one majority-vote ensemble.
ensemble_members = [
    ('lrc', lr),
    ('knn', knn),
    ('nbc', nb_classifier),
    ('dtc', decision_tree_classifier),
    ('rfc', random_forest_classifier),
    ('svmc', svm),
]
# 'hard' takes the majority predicted label. 'soft' would average predicted
# probabilities instead, which needs every member to provide predict_proba.
voting_classifier = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_classifier.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted voting ensemble.
y_pred = voting_classifier.predict(x_test)
# Pass (y_true, y_pred) in the documented order, matching the other model cells.
accuracy = accuracy_score(y_test, y_pred)
print("Models Accuracy:", accuracy)

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one sorted
# label list for both the matrix and the tick labels so they cannot disagree.
class_names = np.unique(mental_health['label'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Plot confusion matrix
plt.figure(figsize=(5, 3))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# User input

In [None]:
# Run one hand-written sample through the logistic-regression pipeline.
user_input = "kill"
class_probs = lr.predict_proba([user_input])[0]

# NOTE(review): assumes probability column 0 is the suicide-watch class and
# column 1 the depression class — confirm against the fitted model's classes_.
suicide_watch_percentage = 100 * class_probs[0]
depressed_percentage = 100 * class_probs[1]

print(f"Depressed percentage: {depressed_percentage:.2f}%")
print(f"Suicide watch percentage: {suicide_watch_percentage:.2f}%")


In [None]:
# Bar chart of the class probabilities predicted for `user_input`.
plt.figure(figsize=(5, 5))
# predict_proba columns follow the classifier's classes_ order, so label the
# bars from that array; Series.unique() returns order of first appearance and
# could attach the wrong name to each probability.
class_names = lr.named_steps['lrc'].classes_
# The original call was missing the comma between x= and y= (SyntaxError).
sns.barplot(x=class_names, y=class_probs, width=0.3)
plt.xlabel("Class")
plt.ylabel("Probability")
plt.title("Prediction Probabilities")
plt.show()