In [None]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plot
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
import statistics
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [None]:
# Load the per-user metrics table; both names are bound to the same DataFrame.
df_tweets_depression = pd.read_csv('df_users_metrics.csv')
users = df_tweets_depression

In [None]:
# Total number of user rows in the loaded table.
users.shape[0]

In [None]:
# Number of rows whose `depression` flag is set.
users.loc[users.depression].shape[0]

In [None]:
# Number of rows whose `depression` flag is not set.
users.loc[~users.depression].shape[0]

In [None]:
# Column names of the loaded table (`users` and `df_tweets_depression`
# are bound to the same DataFrame when the CSV is read).
users.columns

## 1 - Todas as features puras com 0,33 de teste

In [None]:
# Target: the `depression` flag. Features: every remaining column except the
# identifier / free-text profile fields listed below.
y = users['depression'].values
X = users.drop(
    columns=['depression', 'screen_name', 'created_at', 'id', 'location', 'name']
).values

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Number of rows in the held-out test split.
X_test.shape[0]

In [None]:
# Names of the columns that are used as model features.
users.drop(
    columns=['depression', 'screen_name', 'created_at', 'id', 'location', 'name']
).columns

### Random Forest

In [None]:
# Baseline Random Forest on the raw (unscaled) features, scored on the hold-out split.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
for label, score_fn in (
    ('acurácia: ', metrics.accuracy_score),
    ('precisão: ', metrics.precision_score),
    ('recall: ', metrics.recall_score),
    ('f1-score: ', metrics.f1_score),
):
    print(label + str(score_fn(y_test, y_pred)))

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`.
# With no explicit `labels`, confusion_matrix orders classes in sorted order,
# i.e. False (sem depressão) first and True (com depressão) second, so the
# tick labels must follow that same order.
cm = metrics.confusion_matrix(y_test, y_pred)
ax = plot.subplot()
# fmt='d' prints the integer counts as-is instead of the default '.2g' format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.xaxis.set_ticklabels(['sem depressão', 'com depressão'])
ax.yaxis.set_ticklabels(['sem depressão', 'com depressão']);

### SVM - SVC (linear)

In [None]:
# Baseline linear SVC on the raw (unscaled) features, scored on the hold-out split.
svc_clf = svm.SVC(kernel='linear', C=1.0, random_state=42)
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
holdout_scores = [
    ('acurácia: ', metrics.accuracy_score(y_test, y_pred)),
    ('precisão: ', metrics.precision_score(y_test, y_pred)),
    ('recall: ', metrics.recall_score(y_test, y_pred)),
    ('f1-score: ', metrics.f1_score(y_test, y_pred)),
]
for label, value in holdout_scores:
    print(label + str(value))

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`.
# With no explicit `labels`, confusion_matrix orders classes in sorted order,
# i.e. False (sem depressão) first and True (com depressão) second, so the
# tick labels must follow that same order.
cm = metrics.confusion_matrix(y_test, y_pred)
ax = plot.subplot()
# fmt='d' prints the integer counts as-is instead of the default '.2g' format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.xaxis.set_ticklabels(['sem depressão', 'com depressão'])
ax.yaxis.set_ticklabels(['sem depressão', 'com depressão']);

### KNN

In [None]:
# Baseline 5-nearest-neighbours classifier on the raw (unscaled) features.
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)
scorers = {
    'acurácia: ': metrics.accuracy_score,
    'precisão: ': metrics.precision_score,
    'recall: ': metrics.recall_score,
    'f1-score: ': metrics.f1_score,
}
for label, score_fn in scorers.items():
    print(label + str(score_fn(y_test, y_pred)))

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`.
# With no explicit `labels`, confusion_matrix orders classes in sorted order,
# i.e. False (sem depressão) first and True (com depressão) second, so the
# tick labels must follow that same order.
cm = metrics.confusion_matrix(y_test, y_pred)
ax = plot.subplot()
# fmt='d' prints the integer counts as-is instead of the default '.2g' format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.xaxis.set_ticklabels(['sem depressão', 'com depressão'])
ax.yaxis.set_ticklabels(['sem depressão', 'com depressão']);

## 2 - StandardScaler, 0,33 de teste e GridSearch

### Random Forest

In [None]:
# Random Forest with standardised features and the tuned hyper-parameters.
# The scaler + tuned forest pipeline is what gets fitted and evaluated here;
# fitting a fresh default RandomForestClassifier instead would simply repeat
# the section-1 baseline and leave the pipeline unused.
steps = [
    ('scaler', StandardScaler()),
    ('RandomForest', RandomForestClassifier(criterion='gini', max_depth=5, max_features=1,
                                            n_estimators=13, random_state=42)),
]
pipeline = Pipeline(steps)
rf_clf = pipeline
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print('acurácia: ' + str(metrics.accuracy_score(y_test, y_pred)))
print('precisão: ' + str(metrics.precision_score(y_test, y_pred)))
print('recall: ' + str(metrics.recall_score(y_test, y_pred)))
print('f1-score: ' + str(metrics.f1_score(y_test, y_pred)))

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`.
# With no explicit `labels`, confusion_matrix orders classes in sorted order,
# i.e. False (sem depressão) first and True (com depressão) second, so the
# tick labels must follow that same order.
cm = metrics.confusion_matrix(y_test, y_pred)
ax = plot.subplot()
# fmt='d' prints the integer counts as-is instead of the default '.2g' format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.xaxis.set_ticklabels(['sem depressão', 'com depressão'])
ax.yaxis.set_ticklabels(['sem depressão', 'com depressão']);

### SVM - SVC (linear)

In [None]:
# Linear SVC behind a StandardScaler, fitted on the training split and
# scored on the hold-out split.
steps = [
    ('scaler', StandardScaler()),
    ('SVM', svm.SVC(kernel='linear', C=1.0, random_state=42)),
]
svc_clf = Pipeline(steps)
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
for label, score_fn in (
    ('acurácia: ', metrics.accuracy_score),
    ('precisão: ', metrics.precision_score),
    ('recall: ', metrics.recall_score),
    ('f1-score: ', metrics.f1_score),
):
    print(label + str(score_fn(y_test, y_pred)))

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`.
# With no explicit `labels`, confusion_matrix orders classes in sorted order,
# i.e. False (sem depressão) first and True (com depressão) second, so the
# tick labels must follow that same order.
cm = metrics.confusion_matrix(y_test, y_pred)
ax = plot.subplot()
# fmt='d' prints the integer counts as-is instead of the default '.2g' format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.xaxis.set_ticklabels(['sem depressão', 'com depressão'])
ax.yaxis.set_ticklabels(['sem depressão', 'com depressão']);

### KNN

In [None]:
# Distance-weighted 3-NN (ball tree) behind a StandardScaler, fitted on the
# training split and scored on the hold-out split.
steps = [
    ('scaler', StandardScaler()),
    ('KNN', KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', weights='distance')),
]
knn_clf = Pipeline(steps)
knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)
holdout_scores = [
    ('acurácia: ', metrics.accuracy_score(y_test, y_pred)),
    ('precisão: ', metrics.precision_score(y_test, y_pred)),
    ('recall: ', metrics.recall_score(y_test, y_pred)),
    ('f1-score: ', metrics.f1_score(y_test, y_pred)),
]
for label, value in holdout_scores:
    print(label + str(value))

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`.
# With no explicit `labels`, confusion_matrix orders classes in sorted order,
# i.e. False (sem depressão) first and True (com depressão) second, so the
# tick labels must follow that same order.
cm = metrics.confusion_matrix(y_test, y_pred)
ax = plot.subplot()
# fmt='d' prints the integer counts as-is instead of the default '.2g' format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.xaxis.set_ticklabels(['sem depressão', 'com depressão'])
ax.yaxis.set_ticklabels(['sem depressão', 'com depressão']);

## 3 - Standard Scaler + Cross Validation

In [None]:
# Rebuild the full target vector and feature matrix for cross-validation
# (all rows, no train/test split).
y = users['depression'].values
X = users.drop(
    columns=['depression', 'screen_name', 'created_at', 'id', 'location', 'name']
).values

### SVC

In [None]:
# 10-fold cross-validation of the scaled linear SVC over the full data set.
steps = [('scaler', StandardScaler()), ('SVM', svm.SVC(kernel='linear', C = 1.0, random_state=42))]
pipeline = Pipeline(steps)
cv_results = cross_validate(
    pipeline, X, y, cv=10, return_train_score=True,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
)
# Report mean and standard deviation from the SAME held-out (test_*) fold
# scores for every metric; mixing train_* means with test_* deviations would
# present training performance as if it were the cross-validated result.
report = []
for label, key in [
    ('acurácia', 'test_accuracy'),
    ('precisão', 'test_precision'),
    ('recall', 'test_recall'),
    ('f1-score', 'test_f1'),
    ('roc_auc', 'test_roc_auc'),
]:
    fold_scores = cv_results[key]
    report.append(
        label + ' - média: ' + str(fold_scores.mean()) + '\n'
        + label + ' - desvio padrão:: ' + str(statistics.stdev(fold_scores))
    )
# Blank line between metrics, none after the last one.
print('\n\n'.join(report))

### KNN

In [None]:
# 5-fold cross-validation of the scaled 5-NN classifier over the full data set.
steps = [('scaler', StandardScaler()), ('KNN', KNeighborsClassifier(n_neighbors=5))]
pipeline = Pipeline(steps)
cv_results = cross_validate(
    pipeline, X, y, cv=5, return_train_score=True,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
)
# Report mean and standard deviation from the SAME held-out (test_*) fold
# scores for every metric; mixing train_* means with test_* deviations would
# present training performance as if it were the cross-validated result.
report = []
for label, key in [
    ('acurácia', 'test_accuracy'),
    ('precisão', 'test_precision'),
    ('recall', 'test_recall'),
    ('f1-score', 'test_f1'),
    ('roc_auc', 'test_roc_auc'),
]:
    fold_scores = cv_results[key]
    report.append(
        label + ' - média: ' + str(fold_scores.mean()) + '\n'
        + label + ' - desvio padrão:: ' + str(statistics.stdev(fold_scores))
    )
# Blank line between metrics, none after the last one.
print('\n\n'.join(report))

### Random Forest

In [None]:
# 5-fold cross-validation of the scaled default Random Forest over the full data set.
steps = [('scaler', StandardScaler()), ('RF', RandomForestClassifier(random_state=42))]
pipeline = Pipeline(steps)
cv_results = cross_validate(
    pipeline, X, y, cv=5, return_train_score=True,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
)
# Report mean and standard deviation from the SAME held-out (test_*) fold
# scores for every metric; mixing train_* means with test_* deviations would
# present training performance as if it were the cross-validated result.
report = []
for label, key in [
    ('acurácia', 'test_accuracy'),
    ('precisão', 'test_precision'),
    ('recall', 'test_recall'),
    ('f1-score', 'test_f1'),
    ('roc_auc', 'test_roc_auc'),
]:
    fold_scores = cv_results[key]
    report.append(
        label + ' - média: ' + str(fold_scores.mean()) + '\n'
        + label + ' - desvio padrão:: ' + str(statistics.stdev(fold_scores))
    )
# Blank line between metrics, none after the last one.
print('\n\n'.join(report))

## GridSearchCV

### SVC

In [None]:
# Preview of an evenly spaced candidate range: 0.0 up to (not including) 1.5 in steps of 0.1.
np.arange(start=0, stop=1.5, step=0.1)

In [None]:
# (rows, features) of the full feature matrix.
np.shape(X)

In [None]:
# Number of rows in the full feature matrix.
len(X)

In [None]:
# Hyper-parameter search for the scaled SVC on the training split.
steps = [('scaler', StandardScaler()), ('SVM', svm.SVC(random_state=42))]
pipeline = Pipeline(steps)
c_values = [0.001, 0.01, 0.1, 1.0]
# Two sub-grids, because gamma is the kernel coefficient of the rbf/sigmoid
# kernels only; searching it for the linear kernel just repeats identical fits.
# gamma=0 is dropped: with a zero coefficient the rbf kernel is the same
# constant for every pair of samples, and some scikit-learn releases reject
# the value outright.
param_grid = [
    {
        'SVM__kernel': ['linear'],
        'SVM__C': c_values,
    },
    {
        'SVM__kernel': ['rbf', 'sigmoid'],
        'SVM__C': c_values,
        'SVM__gamma': [0.001, 0.01, 0.1, 1],
    },
]
gridsearch = GridSearchCV(pipeline, param_grid, verbose=1).fit(X_train, y_train)
gridsearch.best_params_

#### RBF

In [None]:
# 5-fold cross-validation of the scaled RBF SVC (C=1, gamma=0.1) over the full data set.
steps = [('scaler', StandardScaler()), ('SVM', svm.SVC(random_state=42, C=1, kernel='rbf', gamma=0.1))]
pipeline = Pipeline(steps)
cv_results = cross_validate(
    pipeline, X, y, cv=5, return_train_score=True,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
)
# Report mean and standard deviation from the SAME held-out (test_*) fold
# scores for every metric; mixing train_* means with test_* deviations would
# present training performance as if it were the cross-validated result.
report = []
for label, key in [
    ('acurácia', 'test_accuracy'),
    ('precisão', 'test_precision'),
    ('recall', 'test_recall'),
    ('f1-score', 'test_f1'),
    ('roc_auc', 'test_roc_auc'),
]:
    fold_scores = cv_results[key]
    report.append(
        label + ' - média: ' + str(fold_scores.mean()) + '\n'
        + label + ' - desvio padrão:: ' + str(statistics.stdev(fold_scores))
    )
# Blank line between metrics, none after the last one.
print('\n\n'.join(report))

### KNN

In [None]:
# Hyper-parameter search for the scaled KNN classifier on the training split.
steps = [
    ('scaler', StandardScaler()),
    ('KNN', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)
param_grid = dict(
    KNN__n_neighbors=np.arange(3, 10),
    KNN__weights=['uniform', 'distance'],
    KNN__algorithm=['ball_tree', 'kd_tree', 'brute', 'auto'],
)
gridsearch = GridSearchCV(pipeline, param_grid, verbose=1)
gridsearch = gridsearch.fit(X_train, y_train)
gridsearch.best_params_

In [None]:
# Hold-out evaluation of the scaled KNN with n_neighbors=3, ball tree and
# distance weighting.
steps = [
    ('scaler', StandardScaler()),
    ('KNN', KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', weights='distance')),
]
knn_clf = Pipeline(steps)
knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)
scorers = {
    'acurácia: ': metrics.accuracy_score,
    'precisão: ': metrics.precision_score,
    'recall: ': metrics.recall_score,
    'f1-score: ': metrics.f1_score,
}
for label, score_fn in scorers.items():
    print(label + str(score_fn(y_test, y_pred)))

In [None]:
# 5-fold cross-validation of the scaled, distance-weighted 3-NN over the full data set.
steps = [('scaler', StandardScaler()),
         ('KNN', KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', weights='distance'))]
pipeline = Pipeline(steps)
cv_results = cross_validate(
    pipeline, X, y, cv=5, return_train_score=True,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
)
# Report mean and standard deviation from the SAME held-out (test_*) fold
# scores for every metric; mixing train_* means with test_* deviations would
# present training performance as if it were the cross-validated result.
report = []
for label, key in [
    ('acurácia', 'test_accuracy'),
    ('precisão', 'test_precision'),
    ('recall', 'test_recall'),
    ('f1-score', 'test_f1'),
    ('roc_auc', 'test_roc_auc'),
]:
    fold_scores = cv_results[key]
    report.append(
        label + ' - média: ' + str(fold_scores.mean()) + '\n'
        + label + ' - desvio padrão:: ' + str(statistics.stdev(fold_scores))
    )
# Blank line between metrics, none after the last one.
print('\n\n'.join(report))

### Random Forest

In [None]:
# Hyper-parameter search for the scaled Random Forest on the training split.
steps = [('scaler', StandardScaler()), ('rf', RandomForestClassifier(random_state=42))]
pipeline = Pipeline(steps)
param_grid = {
    'rf__n_estimators': np.arange(10, 100),
    # 'auto' is intentionally absent: for classifiers it was an alias of
    # 'sqrt' (so it only duplicated fits) and recent scikit-learn releases
    # no longer accept it.
    'rf__max_features': ['sqrt', 'log2'],
    'rf__max_depth' : np.arange(5, 10),
    'rf__criterion' :['gini', 'entropy']
}
gridsearch = GridSearchCV(pipeline, param_grid, verbose=1).fit(X_train, y_train)
gridsearch.best_params_

In [None]:
# Names of every tunable parameter exposed by the current pipeline
# (including nested step parameters).
pipeline.get_params(deep=True).keys()

In [None]:
# 5-fold cross-validation of the scaled, tuned Random Forest over the full data set.
steps = [('scaler', StandardScaler()),
         ('RF', RandomForestClassifier(criterion='gini', max_depth=5, max_features=1, n_estimators=13, random_state=42))]
pipeline = Pipeline(steps)
cv_results = cross_validate(
    pipeline, X, y, cv=5, return_train_score=True,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
)
# Report mean and standard deviation from the SAME held-out (test_*) fold
# scores for every metric; mixing train_* means with test_* deviations would
# present training performance as if it were the cross-validated result.
report = []
for label, key in [
    ('acurácia', 'test_accuracy'),
    ('precisão', 'test_precision'),
    ('recall', 'test_recall'),
    ('f1-score', 'test_f1'),
    ('roc_auc', 'test_roc_auc'),
]:
    fold_scores = cv_results[key]
    report.append(
        label + ' - média: ' + str(fold_scores.mean()) + '\n'
        + label + ' - desvio padrão:: ' + str(statistics.stdev(fold_scores))
    )
# Blank line between metrics, none after the last one.
print('\n\n'.join(report))