In [None]:
# Installing dependencies
!pip install -r requirements.txt

In [None]:
import pickle
import random
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack  # To combine sparse matrices
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud
from xgboost import XGBClassifier

%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# Load the depression-classification dataset; the first CSV column becomes the index.
dataset_path = 'data/dataset1.csv'
df = pd.read_csv(dataset_path, index_col=0)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Column dtypes and non-null counts; useful for seeing how many rows the
# dropna() in the cleaning step will remove.
df.info()

In [None]:
# Data cleaning
# Drop every row that has a missing value in any column.
df.dropna(inplace=True)
# Keep the raw text under 'original_statement' (assumes the text column is
# named 'statement'). The guard keeps the cell re-runnable: without it a second
# run would rename the already-cleaned 'statement' column onto the existing
# 'original_statement', leaving two columns with the same name.
if 'original_statement' not in df.columns:
    df.rename(columns={'statement': 'original_statement'}, inplace=True)
# Working copy of the text, lower-cased; further cleaning is applied to this column.
df['statement'] = df['original_statement'].str.lower()

def remove_patterns(text):
    """Strip markdown links, URLs, @mentions and punctuation from ``text``.

    Markdown links are removed *before* bare URLs on purpose: the URL pattern
    matches up to the next whitespace and would otherwise swallow the closing
    ``)`` of ``[label](http://...)``, so the markdown-link pattern could never
    match such a link.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace removed. Interior
        whitespace is left untouched, so a removed span can leave consecutive
        spaces behind.
    """
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)  # [label](target) markdown links
    text = re.sub(r'http[s]?://\S+', '', text)  # bare http/https URLs
    text = re.sub(r'@\w+', '', text)            # @mentions
    text = re.sub(r'[^\w\s]', '', text)         # everything that is not a word char or whitespace
    return text.strip()

# Run the pattern-based cleaner over every lower-cased statement.
cleaned_statements = df['statement'].apply(remove_patterns)
df['statement'] = cleaned_statements

In [None]:
# Tokenization and stemming
# Single stemmer instance; stem_tokens below reads it from module scope.
stemmer = PorterStemmer()
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data to be
# installed already; nothing in this notebook downloads it - confirm on a fresh environment.
tokenized_statements = df['statement'].apply(word_tokenize)
df['tokens'] = tokenized_statements

def stem_tokens(tokens):
    """Stem each token and return the stems joined by single spaces.

    Args:
        tokens: Iterable of tokens; each one is converted with ``str()`` before stemming.

    Returns:
        One string containing the stemmed tokens separated by spaces.
    """
    stems = [stemmer.stem(str(tok)) for tok in tokens]
    return ' '.join(stems)

# Stem every token list and store the result as one space-joined string per row.
stemmed_statements = df['tokens'].apply(stem_tokens)
df['tokens_stemmed'] = stemmed_statements

In [None]:
# Feature engineering
# Length of the cleaned (lower-cased, pattern-stripped) text.
df['num_of_characters'] = df['statement'].str.len()
# Count sentence terminators on the RAW text. The cleaned 'statement' column has
# already had every non-word, non-space character removed by remove_patterns, so
# counting '.', '!' and '?' there always gives 0 and the feature carries no signal.
# str.count gives the same number as len(re.split(pattern, x)) - 1.
df['num_of_sentences'] = df['original_statement'].str.count(r'[.!?]')

In [None]:
# Prepare data for training
# Features: the stemmed text plus the two hand-crafted numeric columns.
feature_columns = ['tokens_stemmed', 'num_of_characters', 'num_of_sentences']
X = df[feature_columns]
# Target: assumes the class label lives in the 'status' column.
y = df['status']

In [None]:
# Label encoding
# Fit on df['status'] rather than on `y`: this cell replaces `y` with the encoded
# ndarray, so reading `y.values` here would raise AttributeError on a re-run.
lbl_enc = LabelEncoder()
# Expected classes per the author's note: Anxiety, Bipolar, Depression, Normal
# (check lbl_enc.classes_ to confirm against the actual data).
y = lbl_enc.fit_transform(df['status'].values)

In [None]:
# Train-test split: hold out 20% of the rows for testing, with a fixed seed so
# the split is repeatable.
split_options = {'test_size': 0.2, 'random_state': 101}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Convert text to features using TF-IDF (unigrams and bigrams, vocabulary capped at 50,000 terms).
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))
train_text = X_train['tokens_stemmed']
test_text = X_test['tokens_stemmed']
# The vocabulary is learned from the training split only; the test split is just transformed.
X_train_tfidf = vectorizer.fit_transform(train_text)
X_test_tfidf = vectorizer.transform(test_text)

In [None]:
# Combine TF-IDF features with numerical features
numeric_columns = ['num_of_characters', 'num_of_sentences']
X_train_num = X_train[numeric_columns].values
X_test_num = X_test[numeric_columns].values
# Append the numeric columns to the right of the TF-IDF matrix for each split.
train_blocks = [X_train_tfidf, X_train_num]
test_blocks = [X_test_tfidf, X_test_num]
X_train_combined = hstack(train_blocks)
X_test_combined = hstack(test_blocks)

In [None]:
# Handle class imbalance
# Only the training split is resampled; the test split is left untouched.
ros = RandomOverSampler(random_state=101)
resampled = ros.fit_resample(X_train_combined, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
# Define classifiers
# Maps a display name to an unfitted estimator; the training loop iterates over this dict.
classifiers = {
    'Bernoulli Naive Bayes': BernoulliNB(alpha=0.1, binarize=0.0),
    # tree_method='hist' runs on any machine. The previous 'gpu_hist' value needs
    # a CUDA-enabled XGBoost build and is deprecated in XGBoost 2.x, which made
    # the notebook fail on CPU-only environments. To train on a GPU with
    # XGBoost >= 2.0, keep 'hist' and additionally pass device='cuda'.
    'XGB': XGBClassifier(
        learning_rate=0.2,
        max_depth=7,
        n_estimators=500,
        random_state=101,
        tree_method='hist',
    ),
}


In [None]:
# Train and evaluate classifiers
# accuracy_scores is filled in the iteration order of `classifiers`, so entry i
# belongs to the i-th key of that dict.
accuracy_scores = []
class_labels = lbl_enc.classes_

for name, clf in classifiers.items():
    # Fit on the oversampled training data, score on the untouched test split.
    clf.fit(X_train_resampled, y_train_resampled)
    y_pred = clf.predict(X_test_combined)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    print("\nFor", name)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, y_pred, target_names=class_labels))

    # Confusion matrix heatmap: rows are actual classes, columns are predicted classes.
    conf_matrix = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Greens',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix for {name}')
    plt.show()

In [None]:
# Save the best model
# Rank the classifiers by test accuracy; list(classifiers) yields the names in
# the same order in which accuracy_scores was filled.
accuracies_df = pd.DataFrame(
    {'Classifier': list(classifiers), 'Accuracy': accuracy_scores}
).sort_values('Accuracy', ascending=False)
best_classifier_name = accuracies_df.iloc[0]['Classifier']
best_classifier = classifiers[best_classifier_name]

# Save the best model to .pkl
# The original called an undefined `dump`, which raised NameError; stdlib pickle
# is used instead, and the context manager guarantees the file is closed.
# NOTE: only the estimator is persisted here. The fitted `vectorizer` and
# `lbl_enc` are not saved, and inference code will need them as well.
with open('best_depression_model.pkl', 'wb') as model_file:
    pickle.dump(best_classifier, model_file)
print(f"Model {best_classifier_name} telah disimpan sebagai 'best_depression_model.pkl'.")