In [1]:
import pandas as pd  
df = pd.read_csv("https://raw.githubusercontent.com/insaid2018/Term-1/master/Data/Projects/winequality.csv") 

In [2]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
#We can treat this problem either as a classification problem or as a regression  problem 
#since wine quality is nothing but a real number between 0 and 10. For  simplicity, let’s 
#choose classification. This dataset, however, consists of only six  types of quality values.
#We will thus map all quality values from 0 to 5. 

#a mapping dictionary that maps the quality values from 0 to 5  
quality_mapping = {  3: 0,  4: 1,  5: 2,  6: 3,  7: 4,  8: 5  } 


In [4]:
df.loc[:, "quality"].head(15)


0     5
1     5
2     5
3     6
4     5
5     5
6     5
7     7
8     7
9     5
10    5
11    5
12    5
13    5
14    5
Name: quality, dtype: int64

In [5]:
#use the map function of pandas with  #any dictionary to convert the values in a given  
#column to values in the dictionary 

df.loc[:, "quality"] = df.quality.map(quality_mapping)

In [6]:
df.loc[:, "quality"].head()

0    2.0
1    2.0
2    2.0
3    3.0
4    2.0
Name: quality, dtype: float64

In [7]:
#use sample with frac=1 to shuffle the dataframe  
#we reset the indices since they change after  
#shuffling the dataframe  
df = df.sample(frac=1).reset_index(drop=True) 




In [8]:
#top 1000 rows are selected  #for training  
df_train = df.head(1000)  
#bottom 599 values are selected  #for testing/validation  
df_test = df.tail(599) 


In [9]:
#train DT model
from sklearn import tree
from sklearn import metrics

#initialize the decision tree classifier class with max_depth of 3

clf = tree.DecisionTreeClassifier(max_depth=3)



In [10]:
#choose the columns you want to train on  #these are the features for the model  
cols = ['fixed acidity',  'volatile acidity',  'citric acid',
        'residual sugar',  'chlorides',  'free sulfur dioxide', 
        'total sulfur dioxide',  'density',  'pH',  'sulphates',  'alcohol'] 


        





In [12]:
#train the model on the provided features and mapped quality from before

clf.fit(df_train[cols], df_train.quality)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
#Note that I have used a max_depth of 3 for the decision tree classifier. I have left  all other parameters of this model to its default value.  Now, we test the accuracy of this model on the training set and the test set: 

#generate predictions on the training sets
train_pred = clf.predict(df_train[cols])

In [None]:
#generate predictions on the test set
test_pred = clf.predict(df_test[cols])
test_pred

In [None]:
#calculate the accureacy of the prediction on trainig
train_accuracy = metrics.accuracy_score(df_train.quality, train_pred)
train_accuracy

In [None]:
#calculate the accuracy of predictions on  #test data set  
test_accuracy = metrics.accuracy_score(df_test.quality,test_pred) 



In [None]:
test_accuracy

In [None]:
#The training and test accuracies are found to be 58.9% and 54.25%. 
#Now we  increase the max_depth to 7 and repeat the process.



In [None]:
clf_7 = tree.DecisionTreeClassifier(max_depth=7)


In [None]:
clf_7.fit(df_train[cols], df_train.quality)

In [None]:
#generate predictions on the training sets
train_pred_7 = clf_7.predict(df_train[cols])

In [None]:
test_pred_7 = clf_7.predict(df_test[cols])
test_pred_7

In [None]:
#calculate the accureacy of the prediction on trainig
train_accuracy = metrics.accuracy_score(df_train.quality, train_pred_7)
train_accuracy

In [None]:
test_accuracy = metrics.accuracy_score(df_test.quality,test_pred_7)

In [None]:
test_accuracy

# Here, we have used accuracy, mainly because it  is the most straightforward metric. It might not be the best metric for this problem.  What about we calculate these accuracies for different values of max_depth and  make a plot? 



In [None]:
#import scikit-learn tree and metrics  
from sklearn import tree  
from sklearn import metrics  

#import matplotlib and seaborn  
#for plotting  import matplotlib  
import matplotlib.pyplot as plt  
import seaborn as sns 



In [None]:
#this is our global size of label text on the plots

plt.rc('xtick', labelsize = 20)
plt.rc('ytick',labelsize = 20)

In [None]:
#This line ensures that the plot is displayed 
#inside the notebook  

%matplotlib inline 



In [None]:
#initialize list to store accuracies or training and test data
#start with 50% accuracy
train_accuracy = [0.5]
test_accuracy = [0.5]



In [None]:
df_test[cols]

In [None]:
#iterate over a few depth values

for depth in range(1,25):
    #init the model
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    cols = [  'fixed acidity',  'volatile acidity',  'citric acid',  'residual sugar',  'chlorides',  'free sulfur dioxide',  
            'total sulfur dioxide',  'density',  'pH',  'sulphates','alcohol'  ] 
    #fit the model on given features
    clf.fit(df_train[cols], df_train.quality)
    #create training and test predictions
    train_prediction = clf.predict(df_train[cols])
    test_prediction = clf.predict(df_test[cols])
    
    #calculate accuracies
    train_accuracies = metrics.accuracy_score(df_train.quality, train_prediction)
    test_accuracies = metrics.accuracy_score(  df_test.quality, test_prediction  )  
    #append accuracies  
    train_accuracies.append(train_accuracy)  
    test_accuracies.append(test_accuracy) 
    
    







In [None]:
#create two plots using matplotlib  
#and seaborn  
plt.figure(figsize=(10, 5))  
sns.set_style("whitegrid")  
plt.plot(train_accuracies, label="train accuracy")  
plt.plot(test_accuracies, label="test accuracy")  
plt.legend(loc="upper left", prop={'size': 15})  
plt.xticks(range(0, 26, 5))  
plt.xlabel("max_depth", size=20)  
plt.ylabel("accuracy", size=20)  
plt.show() 

