In [1]:
# Load Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
wine_data = pd.read_csv('winequality-white.csv',";")

In [3]:
wine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [4]:
wine_data.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.022697,0.289181,0.089021,0.023086,-0.049396,0.09107,0.265331,-0.425858,-0.017143,-0.120881,-0.113663
volatile acidity,-0.022697,1.0,-0.149472,0.064286,0.070512,-0.097012,0.089261,0.027114,-0.031915,-0.035728,0.067718,-0.194723
citric acid,0.289181,-0.149472,1.0,0.094212,0.114364,0.094077,0.121131,0.149503,-0.163748,0.062331,-0.075729,-0.009209
residual sugar,0.089021,0.064286,0.094212,1.0,0.088685,0.299098,0.401439,0.838966,-0.194133,-0.026664,-0.450631,-0.097577
chlorides,0.023086,0.070512,0.114364,0.088685,1.0,0.101392,0.19891,0.257211,-0.090439,0.016763,-0.360189,-0.209934
free sulfur dioxide,-0.049396,-0.097012,0.094077,0.299098,0.101392,1.0,0.615501,0.29421,-0.000618,0.059217,-0.250104,0.008158
total sulfur dioxide,0.09107,0.089261,0.121131,0.401439,0.19891,0.615501,1.0,0.529881,0.002321,0.134562,-0.448892,-0.174737
density,0.265331,0.027114,0.149503,0.838966,0.257211,0.29421,0.529881,1.0,-0.093591,0.074493,-0.780138,-0.307123
pH,-0.425858,-0.031915,-0.163748,-0.194133,-0.090439,-0.000618,0.002321,-0.093591,1.0,0.155951,0.121432,0.099427
sulphates,-0.017143,-0.035728,0.062331,-0.026664,0.016763,0.059217,0.134562,0.074493,0.155951,1.0,-0.017433,0.053678


In [5]:
wine_data.loc[(wine_data.quality<6),'taste'] = 'bad'
wine_data.loc[(wine_data.quality>=6) & (wine_data.quality<8),'taste'] = 'normal'
wine_data.loc[(wine_data.quality>=8) ,'taste'] = 'good'

In [6]:
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,taste
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,normal
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,normal
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,normal
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,normal
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,normal


In [7]:
wine_data[wine_data.taste=="bad"].count()

fixed acidity           1640
volatile acidity        1640
citric acid             1640
residual sugar          1640
chlorides               1640
free sulfur dioxide     1640
total sulfur dioxide    1640
density                 1640
pH                      1640
sulphates               1640
alcohol                 1640
quality                 1640
taste                   1640
dtype: int64

In [8]:
wine_data.shape

(4898, 13)

In [9]:
X = wine_data.iloc[:,0:11]
y = wine_data.iloc[:,12]


In [10]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


<h3>Split Train Test</h3>

In [11]:
X_train,X_test,y_train,y_test =train_test_split(X,y, test_size = 0.3, random_state = 17)

In [12]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
96,6.0,0.34,0.66,15.9,0.046,26.0,164.0,0.9979,3.14,0.5,8.8
2292,6.5,0.32,0.23,1.2,0.054,39.0,208.0,0.99272,3.18,0.46,9.9
1054,7.0,0.31,0.52,1.7,0.029,5.0,61.0,0.9918,3.07,0.43,10.4
3687,6.0,0.16,0.27,12.0,0.03,39.0,98.0,0.99402,3.15,0.34,10.8
596,6.9,0.41,0.33,10.1,0.043,28.0,152.0,0.9968,3.2,0.52,9.4


<h3>Random Forest Model</h3>

In [13]:
rf_model =  RandomForestClassifier()
rf_model.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
# Scoring based on the train RF Model
predictions = rf_model.predict(X_test)

In [15]:
y_test[y_test == 'bad'].count()

504

In [16]:
rf_model.feature_importances_

array([0.06940449, 0.11323289, 0.08044451, 0.09079728, 0.08587628,
       0.09574192, 0.08504476, 0.11691717, 0.08510659, 0.07519542,
       0.1022387 ])

In [17]:
# Confusion Matrix
print(pd.crosstab(y_test, predictions))

col_0   bad  good  normal
taste                    
bad     359     0     145
good      1    20      27
normal  113     5     800


In [18]:
print(accuracy_score(y_test, predictions))

0.8020408163265306


In [19]:
rf_model.feature_importances_

array([0.06940449, 0.11323289, 0.08044451, 0.09079728, 0.08587628,
       0.09574192, 0.08504476, 0.11691717, 0.08510659, 0.07519542,
       0.1022387 ])

In [20]:
from collections import Counter
Counter(y_test)

Counter({'normal': 918, 'bad': 504, 'good': 48})

In [21]:
from xgboost import XGBClassifier

In [22]:
model=XGBClassifier()
model.fit(X_train,y_train)
y_predict = model.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))

0.7612244897959184
[[301   1 202]
 [  0   6  42]
 [106   0 812]]


In [24]:
from sklearn.cross_val_score import cross_val_score
score = cross_val_score(model, X_train, y_train, cv=5)

ModuleNotFoundError: No module named 'sklearn.cross_val_score'

In [None]:
score = cross_val_score(model, X_train, y_train, cv=5)

In [None]:
score

In [None]:
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res)))
model=XGBClassifier()
model.fit(X_res,y_res)
y_predict = model.predict(np.array(X_test))
print(accuracy_score(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))


In [None]:
X_res