In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score , classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC


Importamos nuestro dataset

In [3]:
df = pd.read_csv('wineclean.csv')

In [4]:
df = df.drop(columns='Unnamed: 0')

In [5]:
X = df.drop(columns='quality')
y = df['quality']

Dividimos nuestro dataset para el entrenamiento

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

Escalamos con robust escaler para eliminar outliers

In [7]:
X_test = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True,).fit_transform(X_test)
X_train = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True,).fit_transform(X_train)

Escalamos con standarscaler para normalizar nuestros valores

In [8]:
sc = StandardScaler()

In [9]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Random Forest Clasiffier

In [10]:
rnd_clf = RandomForestClassifier(n_estimators=500,
                                 max_leaf_nodes=16,
                                 random_state=42,
                                    
                                )
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)
print("Accuracy train",rnd_clf.score(X_train, y_train))
print("Accuracy test",rnd_clf.score(X_test, y_test))
print(classification_report(y_test, y_pred_rf))


Accuracy train 0.8434132394985705
Accuracy test 0.8307692307692308
              precision    recall  f1-score   support

           0       0.65      0.21      0.32       365
           1       0.84      0.97      0.90      1585

    accuracy                           0.83      1950
   macro avg       0.74      0.59      0.61      1950
weighted avg       0.81      0.83      0.79      1950



### Random Forest Clasiffier hyperparameters

In [11]:
forest = RandomForestClassifier(random_state = 42)
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]
max_leaf_nodes = [8,10,14,16,20]

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf,)

gridF = RandomizedSearchCV(forest, hyperF, cv = 10, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train, y_train)


bestF.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_depth': 15}

In [12]:
rnd_clf = RandomForestClassifier(n_estimators=800,
 min_samples_split=5,
 min_samples_leaf =1,
 max_depth = 25)
                                    
                                
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)
print("Accuracy train",rnd_clf.score(X_train, y_train))
print("Accuracy test",rnd_clf.score(X_test, y_test))
print(classification_report(y_test, y_pred_rf))


Accuracy train 0.9923026171101825
Accuracy test 0.8676923076923077
              precision    recall  f1-score   support

           0       0.72      0.48      0.58       365
           1       0.89      0.96      0.92      1585

    accuracy                           0.87      1950
   macro avg       0.80      0.72      0.75      1950
weighted avg       0.86      0.87      0.86      1950



## KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

print("Accuracy train",knn.score(X_train, y_train))
print("Accuracy test",knn.score(X_test, y_test))
print(classification_report(y_test, y_pred_knn))


Accuracy train 0.8865185836815482
Accuracy test 0.8364102564102565
              precision    recall  f1-score   support

           0       0.58      0.47      0.52       365
           1       0.88      0.92      0.90      1585

    accuracy                           0.84      1950
   macro avg       0.73      0.70      0.71      1950
weighted avg       0.83      0.84      0.83      1950



## KNN hyperparameters

In [14]:

vecinos = KNeighborsClassifier()

n_neighbors = [2,4,6,8,10,12,14,16,20]
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size = [15,20,25,30,35,40,45]
p = [1,2]
hyperF = dict(n_neighbors=n_neighbors,weights=weights,algorithm=algorithm,leaf_size=leaf_size,p=p)

gridF = RandomizedSearchCV(vecinos, hyperF, cv = 10, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

print(bestF.best_score_)
print(bestF.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
0.8709076826257443
{'weights': 'distance', 'p': 2, 'n_neighbors': 16, 'leaf_size': 35, 'algorithm': 'auto'}


In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(weights='distance', p=2, n_neighbors=20, leaf_size=40, algorithm='brute')
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

print("Accuracy train",knn.score(X_train, y_train))
print("Accuracy test",knn.score(X_test, y_test))
print(classification_report(y_test, y_pred_knn))

Accuracy train 1.0
Accuracy test 0.857948717948718
              precision    recall  f1-score   support

           0       0.67      0.47      0.55       365
           1       0.89      0.95      0.92      1585

    accuracy                           0.86      1950
   macro avg       0.78      0.71      0.73      1950
weighted avg       0.85      0.86      0.85      1950



## XGBoost

In [16]:
xgb_clas = xgboost.XGBRFClassifier(random_state=42,use_label_encoder=False)

xgb_clas.fit(X_train, y_train)
y_pred_xgb = xgb_clas.predict(X_test)

print("Accuracy train",xgb_clas.score(X_train, y_train))
print("Accuracy test",xgb_clas.score(X_test, y_test))
print(classification_report(y_test, y_pred_xgb))

Accuracy train 0.8649659115900594
Accuracy test 0.8374358974358974
              precision    recall  f1-score   support

           0       0.65      0.29      0.40       365
           1       0.85      0.96      0.91      1585

    accuracy                           0.84      1950
   macro avg       0.75      0.63      0.65      1950
weighted avg       0.82      0.84      0.81      1950



#### XGBoost hyperparameters

In [17]:
xgboost_tuning = xgboost.XGBRFClassifier(random_state=42,use_label_encoder=False)

min_child_weight=[1, 5, 10]
gamma=[0.5, 1, 1.5, 2, 5]
subsample=[0.6, 0.8, 1.0]



hyperF = dict(min_child_weight=min_child_weight,gamma=gamma,subsample=subsample)

gridF = RandomizedSearchCV(xgboost_tuning, hyperF, cv = 10, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

print(bestF.best_score_)
print(bestF.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
0.8396761388391345
{'subsample': 0.8, 'min_child_weight': 1, 'gamma': 0.5}


In [18]:
xgb_clas = xgboost.XGBRFClassifier(random_state=42,use_label_encoder=False,subsample=1,gamma=1,min_child_weight=1)

xgb_clas.fit(X_train, y_train)
y_pred_xgb = xgb_clas.predict(X_test)

print("Accuracy train",xgb_clas.score(X_train, y_train))
print("Accuracy test",xgb_clas.score(X_test, y_test))
print(classification_report(y_test, y_pred_xgb))

Accuracy train 0.8605674070815923
Accuracy test 0.8353846153846154
              precision    recall  f1-score   support

           0       0.64      0.27      0.38       365
           1       0.85      0.97      0.91      1585

    accuracy                           0.84      1950
   macro avg       0.75      0.62      0.64      1950
weighted avg       0.81      0.84      0.81      1950



## GradientBoostClassifier

In [19]:
gbct = GradientBoostingClassifier(max_depth=2,n_estimators=3,learning_rate=1.0,random_state=42)

gbct.fit(X_train, y_train)
y_pred_gbct = gbct.predict(X_test)

print("Accuracy train",gbct.score(X_train, y_train))
print("Accuracy test",gbct.score(X_test, y_test))
print(classification_report(y_test, y_pred_gbct))

Accuracy train 0.8143831097426875
Accuracy test 0.8215384615384616
              precision    recall  f1-score   support

           0       0.58      0.16      0.26       365
           1       0.83      0.97      0.90      1585

    accuracy                           0.82      1950
   macro avg       0.71      0.57      0.58      1950
weighted avg       0.79      0.82      0.78      1950



#### GradientBoostClassifier hyperparameters

In [20]:
gbct = GradientBoostingClassifier(random_state=42)


learning_rate=[0.01,0.05,0.1,1,0.5]
max_depth=[3,4,5]
min_samples_leaf=[4,5,6]
subsample=[0.6,0.7,0.8]
n_estimators=[5,10,15,20]



hyperF = dict(learning_rate=learning_rate,max_depth=max_depth,min_samples_leaf=min_samples_leaf,subsample=subsample,n_estimators=n_estimators)

gridF = RandomizedSearchCV(gbct, hyperF, cv = 10, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

print(bestF.best_score_)
print(bestF.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
0.8306554678801374
{'subsample': 0.7, 'n_estimators': 20, 'min_samples_leaf': 6, 'max_depth': 4, 'learning_rate': 0.5}


In [21]:
gbct = GradientBoostingClassifier(subsample=0.6,n_estimators=5,min_samples_leaf=5,max_depth=5,learning_rate=0.05,random_state=42)

gbct.fit(X_train, y_train)
y_pred_gbct = gbct.predict(X_test)

print("Accuracy train",gbct.score(X_train, y_train))
print("Accuracy test",gbct.score(X_test, y_test))
print(classification_report(y_test, y_pred_gbct))

Accuracy train 0.7994281944138992
Accuracy test 0.8128205128205128
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       365
           1       0.81      1.00      0.90      1585

    accuracy                           0.81      1950
   macro avg       0.41      0.50      0.45      1950
weighted avg       0.66      0.81      0.73      1950



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVC

In [22]:
svc = SVC(C=100)
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)


print("Accuracy train",svc.score(X_train, y_train))
print("Accuracy test",svc.score(X_test, y_test))
print(classification_report(y_test, y_pred_svc))

Accuracy train 0.8851990323290081
Accuracy test 0.8466666666666667
              precision    recall  f1-score   support

           0       0.66      0.38      0.48       365
           1       0.87      0.96      0.91      1585

    accuracy                           0.85      1950
   macro avg       0.76      0.67      0.69      1950
weighted avg       0.83      0.85      0.83      1950



#### SVC hyperparameters

In [23]:
svc_tuning = SVC(random_state=42)

C = [1,10,100,1000]



hyperF = dict(C=C)

gridF = RandomizedSearchCV(svc_tuning, hyperF, cv = 10, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

print(bestF.best_score_)
print(bestF.best_params_)

Fitting 10 folds for each of 4 candidates, totalling 40 fits




0.8361562666408482
{'C': 10}


In [24]:
svc = SVC(C=10,kernel='rbf',gamma=1)
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)


print("Accuracy train",svc.score(X_train, y_train))
print("Accuracy test",svc.score(X_test, y_test))
print(classification_report(y_test, y_pred_svc))

Accuracy train 0.9879041126017154
Accuracy test 0.861025641025641
              precision    recall  f1-score   support

           0       0.65      0.57      0.60       365
           1       0.90      0.93      0.92      1585

    accuracy                           0.86      1950
   macro avg       0.77      0.75      0.76      1950
weighted avg       0.86      0.86      0.86      1950

