
# Wine Data Exploration

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error as mse, accuracy_score
from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler,LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the wine CSV (path is relative to the current working directory)
# into the DataFrame used by every later cell.
data = 'winequality-red.csv'
wine = pd.read_csv(filepath_or_buffer=data)

In [None]:
# Print column dtypes and non-null counts.
wine.info()

# Bucket the numeric quality score into three equal-width bands between
# 3 and 8; include_lowest=True keeps a score of exactly 3 in the 'Low' band.
level = ['Low', 'Mid', 'High']
qual_space = np.linspace(start=3, stop=8, num=4)
wine['quality-level'] = pd.cut(
    wine['quality'],
    bins=qual_space,
    labels=level,
    include_lowest=True,
)

In [None]:
# Summary statistics for every column (include='all' also covers the
# non-numeric ones), rounded to three decimals for display.
wine.describe(include='all').round(decimals=3)

In [None]:
# Per-column minimum and maximum.
wine.agg(func=['min', 'max'])

In [None]:
# Min-max scale 'total sulfur dioxide' into the [0, 1] range.
# NOTE(review): the previous expression was (x - max) / (max / min), which is
# not a normalisation (it yields non-positive values divided by a ratio) and
# looks like a mistyped min-max formula - confirm the intended transform.
# Unlike the old expression, min-max scaling is unchanged if the cell is re-run.
tsd = wine['total sulfur dioxide']
tsd_min, tsd_max = tsd.min(), tsd.max()
wine['total sulfur dioxide'] = (tsd - tsd_min) / (tsd_max - tsd_min)

In [None]:
def reg_col_against(df,col_1,col_2):
    """Plot ``col_1`` against ``col_2`` with a fitted regression line.

    Args:
        df: DataFrame holding both columns.
        col_1: Name of the column drawn on the x-axis.
        col_2: Name of the column drawn on the y-axis.

    The figure is shown and then closed; nothing is returned.
    """
    plt.figure(figsize=(15,5))
    # Read from the ``df`` argument (not the global ``wine``) and pass x/y by
    # keyword: recent seaborn releases reject positional x/y vectors.
    sns.regplot(data=df,x=col_1,y=col_2,line_kws={'color':'purple'})
    plt.title(f'{col_1} against {col_2}')
    plt.show()
    plt.close()

In [None]:
# reg_col_against(wine,'fixed acidity','citric acid')
# reg_col_against(wine,'free sulfur dioxide','total sulfur dioxide')
# reg_col_against(wine,'alcohol','quality')

In [None]:
# Histogram of the raw quality scores (default figure size).
# plt.figure(figsize=(8,4))
plt.hist(x=wine['quality'])
plt.title(label='Count of Quality Outcome')

In [None]:
# Share of each quality score as a pie chart.
# Count once and reuse so wedge sizes and labels always come from the same Series.
quality_counts = wine['quality'].value_counts()

plt.figure(figsize=(5,5))
# rotatelabels is a boolean flag; the previous value 45 was only treated as "truthy".
plt.pie(
    quality_counts,
    labels=quality_counts.index,
    autopct='%1.2f%%',
    startangle=140,
    rotatelabels=True,
)
plt.axis('equal')
plt.title('Pie Chart for Wine Quality Categories' )
# Lay out after the title exists so the title is included in the layout.
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every numeric feature with quality, ascending.
# numeric_only=True skips the categorical 'quality-level' column, which
# otherwise makes DataFrame.corr raise on pandas >= 2.0.
wine.corr(numeric_only=True)['quality'].sort_values()

In [None]:
# Annotated correlation heatmap of the numeric columns.
plt.figure(figsize=(8,6))
# numeric_only=True skips the categorical 'quality-level' column, which
# otherwise makes DataFrame.corr raise on pandas >= 2.0.
sns.heatmap(wine.corr(numeric_only=True),cmap='RdBu',annot=True,cbar=True,fmt='.2f')
plt.show()

In [None]:
# X=wine[['volatile acidity','citric acid', 'sulphates', 'alcohol','density','total sulfur dioxide']]
# 'quality' is the target. 'quality-level' is binned from 'quality', so keeping
# it in X would leak the target, and its string categories cannot be
# standardised by StandardScaler.
X=wine.drop(columns=['quality','quality-level'])
y=wine['quality']
# Stratify on the target so every quality score appears in both splits.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=22,stratify=y)

In [None]:
# Standardise features and encode the quality scores as 0..n-1 class ids.
# Both transformers are fitted on the training split only and then applied to
# the test split, so test data never influences the scaling statistics or the
# label mapping.
scaler_=StandardScaler()
label_=LabelEncoder()
X_train_scaled=scaler_.fit_transform(X_train)
X_test_scaled=scaler_.transform(X_test)
y_train_le=label_.fit_transform(y_train)
y_test_le=label_.transform(y_test)

In [None]:
# Hyper-parameter candidates for the decision-tree grid search.
param_grid = dict(
    max_depth=[6, 10, 13, 15, 18, 20],
    criterion=['gini', 'log_loss'],
    max_features=['sqrt', 'log2'],
)

In [None]:
# Base decision tree handed to the grid search.
DTC = DecisionTreeClassifier(
    max_depth=5,
    splitter='best',
    random_state=7,
)

In [None]:
# Grid search over param_grid, scored by accuracy with 5-fold cross-validation.
grid_cv = GridSearchCV(
    estimator=DTC,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

In [None]:
# Run the search on the scaled training features and the raw quality scores.
grid_cv.fit(X=X_train_scaled, y=y_train)

In [None]:
# Winning parameter combination and the estimator refitted with it.
(grid_cv.best_params_, grid_cv.best_estimator_)

In [None]:
# Predict test-set quality with the best tree found by the search.
y_pred_decision = grid_cv.predict(X=X_test_scaled)

In [None]:
# Compare the density of predicted vs. true quality for the decision tree.
# sns.distplot is deprecated; kdeplot is its replacement for hist=False usage.
plt.figure(figsize=(8,6))
sns.kdeplot(y_pred_decision,color='red',label='Predicted')
sns.kdeplot(y_test,color='blue',label='True')
# Label the curves so the two colours can be told apart.
plt.legend()

In [None]:
# Row-normalised confusion matrix (in percent) for the decision tree.
# Pin the class order so matrix rows/columns always match the tick labels,
# even if a score is missing from the predictions.
quality_labels=np.unique(wine['quality'])
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns
# are predictions, matching the axis labels below. normalize='true' divides
# each row by its total and leaves all-zero rows at 0 instead of NaN.
confi_dtc=confusion_matrix(y_test,y_pred_decision,labels=quality_labels,normalize='true')
sns.heatmap(confi_dtc*100,annot=True,cmap='winter',xticklabels=quality_labels,yticklabels=quality_labels,fmt='.2f')
plt.xlabel('Predicted Quality')
plt.ylabel('True Quality')
plt.title('Confusion Matrix of Predicted and True Outcome of Wine Quality')
plt.show()

In [None]:
# Base XGBoost classifier handed to the grid search.
xgb = XGBClassifier(
    max_depth=18,
)

In [None]:
# Tree-depth candidates for the XGBoost grid search.
param_grid = dict(max_depth=[6, 10, 13, 15, 18, 20])

In [None]:
# Accuracy-scored, 5-fold grid search for XGBoost, fitted on the encoded
# (0..n-1) class labels.
xgb_cv = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
xgb_cv.fit(X_train_scaled, y_train_le)

In [None]:
# Best parameters and mean cross-validated accuracy of the XGBoost search.
# NOTE(review): this cell previously also assigned criterion='gini',
# max_depth=10 and max_features='sqrt'. They look like pasted decision-tree
# search output, are not read anywhere in the visible notebook, and the
# trailing comma after best_params_ discarded that value; both are fixed here.
xgb_cv.best_params_, xgb_cv.best_score_

In [None]:
# Predict through the fitted search object: GridSearchCV fits clones of the
# estimator, so the original `xgb` instance is never fitted and cannot predict.
y_pred_xgb = xgb_cv.predict(X_test_scaled)
# Accuracy on the training split and on the held-out split, side by side,
# so over-fitting is visible.
xgb_cv.score(X_train_scaled, y_train_le), xgb_cv.score(X_test_scaled, y_test_le)

In [None]:
# Compare the density of predicted vs. true encoded classes for XGBoost.
# sns.distplot is deprecated; kdeplot is its replacement for hist=False usage.
sns.kdeplot(y_pred_xgb,label='Predicted')
sns.kdeplot(y_test_le,color='red',label='True')
# Label the curves so the two colours can be told apart.
plt.legend()
# plt.xticks([3,4,5,6,7,8])

In [None]:
# Confusion matrix (raw counts) for XGBoost on the encoded class ids.
# Pin the class order to the encoder's classes so rows/columns always line up
# with the tick labels.
encoded_labels=np.arange(len(label_.classes_))
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns
# are predictions, matching the axis labels below.
cm_xgb=confusion_matrix(y_test_le,y_pred_xgb,labels=encoded_labels)
# Counts are integers, so annotate with 'd' rather than two decimals.
sns.heatmap(cm_xgb,cmap='winter',annot=True,xticklabels=label_.classes_,yticklabels=label_.classes_,fmt='d')
plt.xlabel('PREDICTED QUALITY')
plt.ylabel('TRUE QUALITY')
plt.show()

In [None]:
# Hyper-parameter candidates for the k-nearest-neighbours grid search.
param_grid = dict(
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    weights=['uniform', 'distance'],  # uniform or distance-based neighbour weighting
    n_neighbors=[*range(3, 16, 3)],   # candidate values of k: 3, 6, 9, 12, 15
    metric=['euclidean', 'manhattan'],
)

In [None]:
# Base k-nearest-neighbours classifier (library defaults) for the grid search.
knn1 = KNeighborsClassifier()

In [None]:
# 5-fold grid search for KNN (estimator's default scorer), fitted on the raw
# quality scores.
knn_gcv = GridSearchCV(
    estimator=knn1,
    param_grid=param_grid,
    cv=5,
)
knn_gcv.fit(X_train_scaled, y_train)

In [None]:
# Predict test-set quality with the best KNN model, then show squared error,
# accuracy and the winning parameters together.
y_knn_pred = knn_gcv.predict(X=X_test_scaled)
(
    mse(y_knn_pred, y_test),
    accuracy_score(y_knn_pred, y_test),
    knn_gcv.best_params_,
)


In [None]:
# Compare the density of predicted vs. true quality for KNN.
# sns.distplot is deprecated; kdeplot is its replacement for hist=False usage.
sns.kdeplot(y_knn_pred,label='Predicted')
sns.kdeplot(y_test,color='red',label='True')
plt.legend()

In [None]:
# Row-normalised confusion matrix (fractions per true class) for KNN.
# labels= pins the class order to the scores present in y_test, the same
# values the heatmap uses for its tick labels. normalize='true' divides each
# row by its total and leaves all-zero rows at 0 instead of NaN.
confi_knn=confusion_matrix(y_test,y_knn_pred,labels=np.unique(y_test),normalize='true')

In [None]:
# Heatmap of the KNN confusion matrix.
# confi_knn holds row fractions (0-1); scale by 100 so the plotted numbers are
# percentages, as the title states and as the decision-tree heatmap does.
sns.heatmap(confi_knn*100,cmap='summer',annot=True,xticklabels=np.unique(y_test.values),yticklabels=np.unique(y_test.values),fmt='.2f')
plt.xlabel('PREDICTED QUALITY')
plt.ylabel('TRUE QUALITY')
plt.title('% confusion matrix of Wine Quality')
plt.show()