<a href="https://colab.research.google.com/github/jscienciadados/ciencia-dados/blob/main/avaliando-titulo-capitalizacao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

# Conhecendo o Dataset

In [None]:
df = pd.read_csv('train.csv', index_col='id')

In [None]:
df.head(5)

In [None]:
df.info()

In [None]:
df.describe(include='object')

In [None]:
# Row count per `job` category on a wide figure.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x='job', data=df, ax=ax)
ax.set_ylabel('Contagem')
plt.show()

In [None]:
# Bar plot of `age` for each `marital` value, split by the target `y`.
ax = sns.barplot(x='marital', y='age', hue='y', data=df)
plt.show()

In [None]:
# `campaign` values for each target class.
# catplot is a figure-level function: its size is controlled through `height`
# and `aspect`. seaborn has no `figsize` setting, so assigning `sns.figsize`
# only creates an unused attribute on the module and is not done here.
sns.catplot(data=df, x='y', y='campaign', aspect=8/6, height=8);

In [None]:
# Numeric summary restricted to the rows whose target is 'yes'.
df.loc[df['y'] == 'yes'].describe()

In [None]:
# `age` versus `duration`, coloured by the target `y`.
fig, ax = plt.subplots(figsize=(15, 5))
sns.scatterplot(data=df, x='age', y='duration', hue='y', ax=ax)
plt.show()

# Modelando

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn import metrics

In [None]:
# One-hot encoding of `job`, shown for illustration only: the result is not
# assigned, so it is not used by the model.
pd.get_dummies(data=df, columns=['job'])

In [None]:
# Convert every object-dtype column to the pandas `category` dtype.
object_cols = df.dtypes[df.dtypes == object].index
for col_name in object_cols:
    df[col_name] = df[col_name].astype('category')

In [None]:
# Give `education` an explicit order so the integer codes taken from it
# follow the schooling level instead of the alphabetical default.
# `reorder_categories(..., inplace=True)` is not accepted by pandas >= 2.0,
# so the reordered column is assigned back instead of mutated in place.
education_order = ['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school',
                   'professional.course', 'university.degree']
df['education'] = df['education'].cat.reorder_categories(education_order, ordered=True)

In [None]:
# Replace each categorical column with its integer category codes.
category_cols = df.dtypes[df.dtypes == 'category'].index
for col_name in category_cols:
    df[col_name] = df[col_name].cat.codes

In [None]:
df.head()

In [None]:
y = df.y
X = df.drop('y', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, shuffle=True, stratify=y)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

In [None]:
# Check the split: mean of the encoded target in each part
# (the positive-class share when `y` is coded 0/1).
float(y_train.mean()), float(y_test.mean())

In [None]:
# Normalizar os dados
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)

In [None]:
df_norm = pd.DataFrame(X_norm, columns=df.columns.drop('y'))

In [None]:
df_norm.head()

In [None]:
# k-NN is distance based, so features on larger scales would dominate the
# neighbour search. The min-max scaler is chained in front of the classifier
# and fitted on the training split only, which also keeps the hold-out rows
# out of the scaling statistics. The pipeline exposes the same fit/predict
# interface as the bare classifier.
knn_class = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
knn_class.fit(X_train, y_train)

In [None]:
y_pred = knn_class.predict(X_test)

In [None]:
print(metrics.classification_report(y_test, y_pred))

# Submetendo pro Kaggle

In [None]:
# Read the evaluation set (test.csv) and the sample submission (sample.csv),
# both indexed by `id`.
aval = pd.read_csv(filepath_or_buffer='test.csv', index_col='id')
sub = pd.read_csv(filepath_or_buffer='sample.csv', index_col='id')

In [None]:
# Convert every object-dtype column of the evaluation set to `category`.
object_cols_aval = aval.dtypes[aval.dtypes == object].index
for col_name in object_cols_aval:
    aval[col_name] = aval[col_name].astype('category')

In [None]:
# Give `education` an explicit order so the integer codes taken from it
# follow the schooling level instead of the alphabetical default.
# `reorder_categories(..., inplace=True)` is not accepted by pandas >= 2.0,
# so the reordered column is assigned back instead of mutated in place.
education_order = ['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school',
                   'professional.course', 'university.degree']
aval['education'] = aval['education'].cat.reorder_categories(education_order, ordered=True)

In [None]:
#Transforma as categorias em números
for c in aval.columns[aval.dtypes == 'category']: # df.dtypes == 'object'
    aval[c] = aval[c].cat.codes

In [None]:
aval.head()

In [None]:
# Reuse the scaler already fitted on the training features. Calling
# fit_transform here would recompute min/max from the evaluation set and put
# it on a different scale than the data the scaler was fitted on.
# `aval` is expected to carry the same feature columns, in the same order.
X_aval = scaler.transform(aval)

In [None]:
aval_norm = pd.DataFrame(X_aval, columns=aval.columns)

In [None]:
aval_norm.head()

In [None]:
# Feed the classifier the same feature representation it was fitted on
# (the encoded frame, like X_train) rather than the separately rescaled
# X_aval array, whose value ranges differ from the training inputs.
y_aval = knn_class.predict(aval)

In [None]:
# Attach the evaluation ids to the predictions before assigning. A bare
# pd.Series(y_aval) has a 0..n-1 index and would be aligned against the `id`
# index of `sub`, producing NaN or misplaced rows whenever the ids are not
# exactly 0..n-1.
sub['y'] = pd.Series(y_aval, index=aval.index)

In [None]:
sub.head()

In [None]:
sub.to_csv('submission.csv')

# Random Forest

In [None]:
# Preview of the training feature matrix.
X.head(n=5)

In [None]:
# Pipeline whose steps are re-fitted inside every cross-validation fold.
# The forest gets a fixed seed so that reruns produce the same search result.
pipeline = make_pipeline(StandardScaler(),
                         RandomForestClassifier(n_estimators=200, random_state=42))
# Hyperparameter grid to be searched. 'sqrt' replaces 'auto': both meant the
# same thing for classifiers, and 'auto' is rejected by scikit-learn >= 1.3.
hyperparameters = {'randomforestclassifier__max_features': ['sqrt', None, 'log2'],
                   'randomforestclassifier__max_depth': [None, 3, 1]}
# Grid search with 10-fold cross-validation.
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

In [None]:
%%time
#Ajusta o modelo aos dados de treino
model=clf.fit(X,y)

In [None]:
# Predict the Kaggle evaluation set with the tuned pipeline.
# `test` is not defined in the visible cells; the encoded evaluation frame is
# `aval`. The pipeline applies its own StandardScaler, so the unscaled frame is
# the right input.
p = model.predict(aval)

In [None]:
# Preview of the data to be submitted (left disabled).
# NOTE(review): `sample` is not defined in the visible cells; the submission
# frame read from sample.csv is named `sub`.
#sample.y= p
#sample.to_csv("~/submission.csv",index=False)
#sample.head()

# Métricas de avaliação

In [None]:
# Load the `y` column from the reference solution and from two prediction files.
solution = pd.read_csv('solution.csv')['y']
naive = pd.read_csv('allzeros.csv')['y']
rf = pd.read_csv('random_forest.csv')['y']

In [None]:
# F1 de cada categoria
metrics.f1_score(solution, naive, pos_label=0), metrics.f1_score(solution, naive, pos_label=1)

In [None]:
metrics.f1_score(solution, naive, average=None)

In [None]:
metrics.f1_score(solution, naive, average='micro'), metrics.f1_score(solution, naive, average='macro')

In [None]:
metrics.accuracy_score(solution,naive)

In [None]:
metrics.precision_score(solution, naive, average=None), metrics.recall_score(solution, naive, average=None)

In [None]:
metrics.precision_recall_fscore_support(solution, naive)

In [None]:
print(metrics.classification_report(solution, rf))