In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

Load data from csv file

In [4]:
# Read the raw dataset from a CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer="Data.csv")

Inspect the class distribution of the full dataset

In [5]:
# Count how many rows carry each label in the 'class' column.
data["class"].value_counts()

depression        115985
non-depression    115777
Name: class, dtype: int64

Split the dataset into one subset per class label

In [28]:
# One sub-frame per label, selected by comparing the 'class' column to the label string.
depression = data.loc[data["class"].eq("depression")]
non_depression = data.loc[data["class"].eq("non-depression")]

Join the two subsets back together, discarding the original index

In [30]:
# Stack the two per-class frames into a single frame, depression rows first.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and yields the same result here.
# ignore_index=True discards the original row labels and renumbers from 0.
labelled_data = pd.concat([depression, non_depression], ignore_index=True)

In [32]:
# Bare expression as the cell's last line so the notebook renders the combined frame
# (the captured output shows the first and last rows with the middle elided).
labelled_data

Unnamed: 0,class,text
0,depression,ex wife threaten suiciderecently leave wife go...
1,depression,need helpjust help m cry hard
2,depression,trigger warning excuse self inflict burn know ...
3,depression,end tonight t anymore quit
4,depression,life year oldhello year old bald male hairline...
...,...,...
231757,non-depression,stay watch youtube video help save kid story v...
231758,non-depression,ruin feel like reflect past year s list ask wa...
231759,non-depression,funniest joke ll hear today assume healthy hea...
231760,non-depression,itchy easily irrate skin eczema skin condition...


Split the data into training and test sets

In [33]:
# Stratified, shuffled hold-out split: the 'text' column is the model input and the
# 'class' column is the target. Stratifying on the label keeps the class proportions
# the same in both partitions; random_state pins the shuffle so the split is repeatable.
# No test_size is passed, so scikit-learn's default split proportion applies.
input_train, input_test, output_train, output_test = train_test_split(
    labelled_data["text"],
    labelled_data["class"],
    stratify=labelled_data["class"],
    shuffle=True,
    random_state=0,
)

Build model pipeline

In [34]:
# Two-step text classifier: TF-IDF vectorisation of the raw text followed by a
# 100-tree random forest trained on all available cores (n_jobs=-1).
# random_state is pinned (same seed as the train/test split) because the forest's
# bootstrap sampling and feature selection are random; without it, retraining gives
# a different model and different metrics on every run.
model_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)),
    ]
)

Training the model

In [35]:
# Fit the vectoriser and the classifier on the training partition only.
model_pipeline.fit(X=input_train, y=output_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier(n_jobs=-1))])

In [36]:
# Predicted labels for the held-out texts, used for the evaluation report.
test_predict = model_pipeline.predict(X=input_test)

In [37]:
# classification_report returns a formatted string; print it so the table's
# line breaks and column alignment are rendered.
print(classification_report(y_true=output_test, y_pred=test_predict))

                precision    recall  f1-score   support

    depression       0.89      0.92      0.91     28997
non-depression       0.91      0.89      0.90     28944

      accuracy                           0.90     57941
     macro avg       0.90      0.90      0.90     57941
  weighted avg       0.90      0.90      0.90     57941



In [39]:
# Persist the whole fitted pipeline (vectoriser + classifier) to a single file so it
# can be reloaded for inference without retraining.
joblib.dump(value=model_pipeline, filename="Sklearn_ml_model.joblib")

['Sklearn_ml_model.joblib']

In [2]:
# Reload the persisted pipeline from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source (here, presumably the file saved above).
model = joblib.load(filename="Sklearn_ml_model.joblib")

In [19]:
# Smoke test: the reloaded pipeline takes a list of raw strings and returns one label each.
model.predict(X=["I hate my life"])

array(['depression'], dtype=object)