In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,roc_auc_score,classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the wine dataset; 'wine.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('wine.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Ordered encoding for the alcohol category: the position in the list is the
# encoded value, so 'Low' -> 0, 'Medium' -> 1, 'High' -> 2.
ord_enc = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])

# Learn the mapping, then apply it; df1 is the encoded column as a 2-D array.
ord_enc.fit(df[['Alcohol_content']])
df1 = ord_enc.transform(df[['Alcohol_content']])

In [None]:
# Overwrite the original category column with its ordinal encoding.
# NOTE(review): this mutates df in place, so the encoding cell above presumably
# cannot be re-run without reloading df first — confirm.
df['Alcohol_content'] = df1

In [None]:
# Check the frame again after replacing Alcohol_content with its encoded values.
df.head()

In [None]:
# Distribution of each column, one panel per column on a 5x3 grid.
# sns.histplot(kde=True, stat='density') replaces sns.distplot, which is
# deprecated and scheduled for removal from seaborn.
plt.figure(figsize = (15,20))

# Only the first 13 columns are drawn, matching the original panel limit.
for plotnumber, column in enumerate(df.columns[:13], start=1):
    ax = plt.subplot(5,3,plotnumber)
    sns.histplot(df[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column, fontsize=10)
plt.show()

# Plotting heatmap (Correlation Matrix)

In [None]:
# Absolute pairwise correlations: only the strength of each relationship is
# plotted, not its sign.
df_corr = abs(df.corr())

plt.figure(figsize=(14, 10))
sns.heatmap(df_corr, annot=True, annot_kws=dict(size=10))
plt.show()

Alcohol_content and alcohol appear to be strongly correlated in the heatmap above, but we need more evidence.

In [None]:
# Plot the alcohol column against the encoded Alcohol_content column.
plt.scatter(df['alcohol'], df['Alcohol_content'])
plt.title('Relationship between Alcohol vs Alcohol_Content')
plt.xlabel('Alcohol')
plt.ylabel('Alcohol_Content')
plt.show()

We see a clear trend: as the alcohol level increases, Alcohol_content increases as well, so the two features carry the same information and we can drop one of them. We keep the one with the higher correlation coefficient with the label: alcohol = 0.48 versus Alcohol_content = 0.40. So we can delete Alcohol_content.

In [None]:
# Label is the 'quality' column; features are every other column except
# 'Alcohol_content'.
y = df['quality']
X = df.drop(columns=['quality', 'Alcohol_content'])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=256
)

In [None]:
def metric_score(clf,x_train,x_test,y_train,y_test,train = True):
    """Print accuracy for a fitted classifier on the train or the test split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    x_train, y_train : training features and labels (used when ``train`` is truthy).
    x_test, y_test : test features and labels (used when ``train`` is falsy).
    train : if truthy, report training accuracy only; otherwise report test
        accuracy followed by the classification report.

    Returns
    -------
    None. Results are printed.
    """
    if train:
        y_pred = clf.predict(x_train)

        print("-----------------Train Result--------------------")

        print(f"Accuracy Score : {accuracy_score(y_train,y_pred)*100:.2f}%")

    else:
        # Plain `else` instead of `elif train == False`, so a falsy flag such
        # as None reports test results instead of silently printing nothing.
        pred = clf.predict(x_test)

        print("-----------------Test Result--------------------")

        print(f"Accuracy Score : {accuracy_score(y_test,pred)*100:.2f}%")

        print("\n Test Classification Report \n ", classification_report(y_test,pred,digits =2))
        
        

In [None]:
# Model initiation
# Baseline tree with default hyperparameters. random_state is fixed because
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree can differ from one run to the next.

clf = DecisionTreeClassifier(random_state = 256)

clf.fit(x_train,y_train)

In [None]:
# Report training accuracy first (train=True is the default), then test
# accuracy with the classification report.
metric_score(clf, x_train, x_test, y_train, y_test)

metric_score(clf, x_train, x_test, y_train, y_test, train=False)

# Hyperparameter Tuning

In [None]:
# Search space for the decision tree: 2 * 14 * 8 * 12 * 25 = 67,200 combinations.
grid_param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(6, 20),
    min_samples_leaf=range(2, 10),
    min_samples_split=range(3, 15),
    max_leaf_nodes=range(5, 30),
)

In [None]:
# Exhaustive 5-fold cross-validated search over grid_param; n_jobs=-1 runs the
# fits in parallel on all available cores.
grid_search = GridSearchCV(clf, grid_param, cv=5, n_jobs=-1)

In [None]:
# Run the search on the training split only, so the test split stays unseen.
grid_search.fit(x_train,y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
best_parameters = grid_search.best_params_
print(best_parameters)

In [None]:
# Rebuild the tree from the hyperparameters the grid search actually selected.
# The previous hand-copied values included max_leaf_nodes = 50, which lies
# outside the searched range(5, 30) and so could not have come from the search.
# random_state is fixed so the refit is repeatable.
clf = DecisionTreeClassifier(**grid_search.best_params_, random_state = 256)

clf.fit(x_train,y_train)

In [None]:
# Re-evaluate after tuning: training accuracy first (train=True is the
# default), then test accuracy with the classification report.
metric_score(clf, x_train, x_test, y_train, y_test)

metric_score(clf, x_train, x_test, y_train, y_test, train=False)