In [None]:
# 01. Import the machine-learning libraries
import pandas as pd
from pandas.plotting import scatter_matrix

# Data splitting and the classifier families compared in this notebook
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Evaluation metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

import seaborn as sns

# NOTE(review): the filter below silences every warning category for the rest
# of the session, so pandas and scikit-learn warnings will not be shown.
import warnings
warnings.filterwarnings('ignore')

In [None]:
#%% 02. Load the Excel workbook
# Path is relative to the notebook's working directory; only the first
# sheet (position 0) is read into df1.
filepath = 'winequality.xls'
df1 = pd.read_excel(io=filepath, sheet_name=0)

In [None]:
#%% 03. Rename columns
# Both renames are applied in one call; df1 is modified in place.
column_aliases = {
    'type': 'tipo',
    'quality': 'qualidade',
}
df1.rename(columns=column_aliases, inplace=True)

In [None]:
#%% 04. Select rows by criterion and create new dataframes (1)
# One boolean mask per wine type, then one subset of the full table each.
is_red = df1['tipo'] == 'red'
is_white = df1['tipo'] == 'white'
redtot = df1[is_red]
whitot = df1[is_white]

In [None]:
#%% 05. Isolate the control region and convert the target to binary
# Control region: rows whose alcohol value is at least 10. The explicit
# copy makes `control` an independent frame, so the column added below is
# written to `control` itself rather than to a slice derived from df1.
control = df1.loc[df1['alcohol'] >= 10].copy()

# Binary target: 1 when qualidade is 7 or higher, otherwise 0.
control['vintage'] = (control['qualidade'] >= 7).astype(int)

# Cell output: number of control rows flagged as vintage.
control['vintage'].sum()

In [None]:
#%% 06. Select rows by criterion and create new dataframes (2)
# The masks are built from `control` itself. Building them from df1 (a
# different, larger frame) only works when pandas can re-align the mask to
# control's index, and fails if the two indexes ever diverge.
redcont = control.loc[control['tipo'].isin(['red'])]
whicont = control.loc[control['tipo'].isin(['white'])]

In [None]:
#%% 07. Keep only some columns of the control dataframe in a new dataframe
# Target first, followed by the three feature columns.
selparam = ['vintage','density','pH','alcohol']

# Explicit copy: later cells drop rows and remove a column from `ctl`, and
# those edits must not act on a slice of `whicont`.
ctl = whicont[selparam].copy()

# Cell output: number of vintage rows among the selected white wines.
ctl['vintage'].sum()

In [None]:
#%% 08. Scatter plot of the training parameters coloured by class (seaborn)
# Pairwise plots of every column in ctl, with points coloured by 'vintage'.
sns.pairplot(data=ctl, hue='vintage')

In [None]:
#%% 09. Remove all NaN values before feeding the models
# Rebind to the filtered result instead of dropping rows in place: the frame
# being cleaned was itself derived by selecting from another frame, and the
# new object returned by dropna() is independent of it.
ctl = ctl.dropna()

In [None]:
#%% 10. Extract the target from the dataframe into an array
# pop() removes 'vintage' from ctl, so only the feature columns remain there.
vintage_column = ctl.pop('vintage')
target = vintage_column.values

In [None]:
#%% 11. Create the training and test sets for the classification targets
# 20% of the rows are held out for testing; the seed is fixed at 0.
split_parts = train_test_split(ctl, target, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#%% 12. Scatter plot between the training parameters (matplotlib)
# Points are coloured by their training label; histograms use 20 bins.
hist_options = {'bins': 20}
scatter_matrix(
    X_train,
    c=y_train,
    marker='.',
    s=30,
    alpha=.8,
    hist_kwds=hist_options,
)

In [None]:
#%% 13. TRAIN THE CLASSIFICATION MODELS ###
# ------------------------------------------------
# Every estimator is fitted on the same training split.
KNN1 = KNeighborsClassifier(n_neighbors=1, n_jobs=2).fit(X_train, y_train)
KNN2 = KNeighborsClassifier(n_neighbors=2, n_jobs=2).fit(X_train, y_train)

# Fixed seed so the forest, and therefore its score, is the same on every run.
RFC = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0).fit(X_train, y_train)

# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases — confirm against the installed version before upgrading.
LRN = LogisticRegression(solver='newton-cg', multi_class='ovr').fit(X_train, y_train)

SVM = svm.SVC(decision_function_shape='ovr', kernel='rbf').fit(X_train, y_train)

# A notebook cell renders only its last expression, so the per-model scores
# are gathered into one Series instead of five bare expressions of which
# four would be discarded.
test_scores = pd.Series(
    {
        name: round(model.score(X_test, y_test), 4)
        for name, model in [
            ('KNN1', KNN1),
            ('KNN2', KNN2),
            ('RFC', RFC),
            ('LRN', LRN),
            ('SVM', SVM),
        ]
    },
    name='test accuracy',
)
test_scores

In [None]:
#%% 14. EVALUATE THE PRODUCED CLASSIFICATION ####
# -----------------------------
# Evaluation uses the held-out split (X_test / y_test). Scoring on `ctl`
# would include the rows the models were fitted on, because X_train was
# split out of `ctl`, and that inflates every metric.
#
# Results are printed explicitly: a bare expression in the middle of a cell
# is evaluated and then thrown away, so it never reaches the output.
fitted_models = [
    ('KNN1', KNN1),
    ('KNN2', KNN2),
    ('RFC', RFC),
    ('LRN', LRN),
    ('SVM', SVM),
]

for model_name, model in fitted_models:
    predicted = model.predict(X_test)

    print('===', model_name, '===')
    # labels=[0, 1] pins the matrix/report layout even if a class happens to
    # be absent from the test split.
    print(confusion_matrix(y_test, predicted, labels=[0, 1]))
    print('accuracy:', round(accuracy_score(y_test, predicted), 4))
    print(classification_report(y_test, predicted, labels=[0, 1],
                                target_names=['class 0', 'class 1']))

In [None]:
#%% 15. PRODUCE NEW ESTIMATES FROM EACH CLASSIFIER ####
# -----------------------------------------------
# Feature columns for every white wine, with incomplete rows removed.
# dropna() returns a new frame, so nothing is modified in place on a slice
# of `whitot`.
df1sel = whitot[['density', 'pH', 'alcohol']].dropna()

# Predictions are collected on a separate copy so that `df1sel` keeps
# holding only the model inputs while columns are being added.
dfpred = df1sel.copy()

for column, model in [
    ('KNN1', KNN1),
    ('KNN2', KNN2),
    ('RFC', RFC),
    ('LRN', LRN),
    ('SVM', SVM),
]:
    dfpred[column] = model.predict(df1sel)

# Cell output: per-classifier sum of the predicted labels, shown together
# (separate bare expressions would display only the last one).
dfpred[['KNN1', 'KNN2', 'RFC', 'LRN', 'SVM']].sum()

In [None]:
#%% 16. Create the output file for external analysis

# Observed binary label (qualidade of 7 or more), aligned to dfpred by index.
vintage_flag = whitot['qualidade'].ge(7).astype(int)
dfpred['vintage'] = vintage_flag

# Carry over the remaining columns from the white-wine table one at a time.
for carried in ('qualidade', 'X', 'Y'):
    dfpred[carried] = whitot[carried]

dfpred.to_csv('exportclass.txt')