In [91]:
##### IMPORT PACKAGES
# system tools
import os

# data munging tools
import pandas as pd
from joblib import dump, load

# Machine learning stuff
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn import metrics

In [93]:
# relative path to the dataset: two directories up, then data/dataset.csv
filename = os.path.join("..", "..", "data", "dataset.csv")

In [None]:
# read the CSV located at `filename` into a DataFrame
data = pd.read_csv(filename)

In [None]:
# preview only the first rows rather than rendering the whole DataFrame
data.head()

In [None]:
# pull the model input (text) and the target (label) columns out of the frame
X, y = data["clean_text"], data["is_depression"]

In [None]:
##### TRAIN-TEST SPLIT

# hold out 10% of the rows for testing (a 90/10 train/test split);
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)


In [92]:
def load_data(filename, test_size=0.1, random_state=42,
              text_column="clean_text", label_column="is_depression"):
    """Read a CSV and return a train/test split of its text and label columns.

    Parameters
    ----------
    filename
        Location of the CSV; passed unchanged to ``pd.read_csv``.
    test_size : default 0.1
        Forwarded to ``train_test_split``. The default holds out 10% of the
        rows for testing, i.e. a 90/10 train/test split.
    random_state : default 42
        Forwarded to ``train_test_split`` so the split is reproducible.
    text_column : str, default "clean_text"
        Name of the column used as model input.
    label_column : str, default "is_depression"
        Name of the column used as classification label.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``text_column`` or ``label_column`` is not a column of the CSV.
    """
    # read csv
    data = pd.read_csv(filename)

    # extract needed columns from data frame
    X = data[text_column]
    y = data[label_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X,  # inputs for the model
        y,  # classification labels
        test_size=test_size,
        random_state=random_state)

    return X_train, X_test, y_train, y_test

In [94]:
# load the dataset and unpack the four train/test pieces returned by load_data
X_train, X_test, y_train, y_test = load_data(filename)

In [96]:
##### VECTORIZE
# TF-IDF vectorizer over lowercased unigrams and bigrams, with the vocabulary
# pruned by document frequency and capped in size
vectorizer = TfidfVectorizer(
    lowercase=True,       # fold case so e.g. sentence-initial words match mid-sentence ones
    ngram_range=(1, 2),   # 1-word and 2-word units
    min_df=0.05,          # drop terms found in fewer than 5% of documents (very rare)
    max_df=0.95,          # drop terms found in more than 95% of documents (very common)
    max_features=500)     # keep at most the top 500 features


In [None]:
# fit the vectorizer on the training data only, transforming it in the same step...
X_train_feats = vectorizer.fit_transform(X_train)

# ...then transform the test data with the already-fitted vectorizer (no re-fitting)
X_test_feats = vectorizer.transform(X_test)

# keep the feature names of the fitted vectorizer (inspected further down via `feature_names.shape`)
feature_names = vectorizer.get_feature_names_out()

In [95]:
def vectorize(X_train, X_test, vectorizer):
    """Fit ``vectorizer`` on the training inputs and transform both splits.

    Parameters
    ----------
    X_train
        Training inputs; passed to ``vectorizer.fit_transform``.
    X_test
        Test inputs; passed to ``vectorizer.transform`` after fitting.
    vectorizer
        Any object exposing ``fit_transform`` and ``transform``. It is
        fitted in place as a side effect.

    Returns
    -------
    tuple
        ``(X_train_feats, X_test_feats)`` as returned by the two calls.
    """
    # tuple elements are evaluated left to right, so the vectorizer is fitted
    # on the training data before the test data is transformed
    return vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [None]:
# vectorize both splits with the helper: fits on the training data, then transforms the test data
X_train_feats, X_test_feats = vectorize(X_train, X_test, vectorizer)

In [None]:
# show the shape of the feature-name array obtained from the fitted vectorizer
feature_names.shape

In [None]:
##### CLASSIFY & PREDICT
# define classifier: a small multi-layer perceptron
classifier = MLPClassifier(
    hidden_layer_sizes=(10,),  # a single hidden layer with 10 neurons
    activation="relu",
    max_iter=1000,             # upper bound on training iterations
    early_stopping=True,       # stop early if there is no improvement
    verbose=True,              # print training progress
    random_state=42)           # fixed seed for reproducibility


In [None]:
# fit classifier
classifier.fit(
    X_train_feats, 
    y_train)

In [None]:
# get predictions
y_pred = classifier.predict(
    X_test_feats)

In [None]:
# evaluate
classifier_metrics = metrics.classification_report(
    y_test, 
    y_pred)

In [None]:
# show the classification report computed above
print(classifier_metrics)