In [245]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from numpy import average
from matplotlib.pyplot import figure
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape
import pydot

In [196]:
#import CSVs, assign headers and remove quotations from column headers
red_wine = pd.read_csv(r'C:\Users\Killian\Projects\Wine analysis\Wine data\winequality-red.csv', sep=';',header=0, engine='python')
white_wine = pd.read_csv(r'C:\Users\Killian\Projects\Wine analysis\Wine data\winequality-white.csv', sep=';',header=0, engine='python')


In [197]:
red_wine['quality_label'] = red_wine['quality'].apply(lambda value: 'low'
if value <= 4 else 'medium-low'
if value == 5 else 'medium' if value == 6 else 'medium-high' if value == 7 else 'high')

white_wine['quality_label'] = white_wine['quality'].apply(lambda value: 'low'
if value <= 4 else 'medium-low'
if value == 5 else 'medium' if value == 6 else 'medium-high' if value == 7 else 'high')

red_wine['quality_label'] = pd.Categorical(red_wine['quality_label'],
categories=['low', 'medium-low', 'medium', 'medium-high', 'high'])

white_wine['quality_label'] = pd.Categorical(white_wine['quality_label'],
categories=['low', 'medium-low', 'medium', 'medium-high', 'high'])

In [198]:
#Create a new dataset combining red and white wines
red_wine['wine_type'] = 'red'
white_wine['wine_type'] = 'white'
wines = pd.concat([red_wine, white_wine])
#wines['ql_code'] = wines['quality_label']

# re-shuffle records just to randomize data points as you may have a problem when fitting machine learning model. Dataset may not be split in a fair way.
wines = wines.sample(frac=1, random_state=42).reset_index(drop=True)

In [199]:
# Skewness is the degree of distortion from the symmetrical bell curve or the normal distribution. It measures the lack of symmetry in data distribution.
# It differentiates extreme values in one versus the other tail. A symmetrical distribution will have a skewness of 0.
# If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed. 
# If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed.

# Kurtosis is all about the tails of the distribution — not the peakedness or flatness. It is used to describe the extreme values
# in one versus the other tail. It is actually the measure of outliers present in the distribution.

# Mesokurtic: This distribution has kurtosis statistic similar to that of the normal distribution. It means that the extreme values of the 
# distribution are similar to that of a normal distribution characteristic. This definition is used so that the standard normal distribution has a kurtosis of three.

# Leptokurtic (Kurtosis > 3): Distribution is longer, tails are fatter. Peak is higher and sharper than Mesokurtic, which means that data are heavy-tailed or profusion 
# of outliers. Outliers stretch the horizontal axis of the histogram graph, which makes the bulk of the data appear in a narrow (“skinny”) vertical range, thereby giving 
# the “skinniness” of a leptokurtic distribution.

# Platykurtic: (Kurtosis < 3): Distribution is shorter, tails are thinner than the normal distribution. The peak is lower and broader than Mesokurtic, which means that 
# data are light-tailed or lack of outliers. The reason for this is because the extreme values are less than that of the normal distribution.


wine = []
for column in wines.columns[0:-4]:
        wine.append([wines[column].astype(float).skew().round(2), wines[column].astype(float).kurt().round(2)])

wine_stats = pd.DataFrame(wine, index=[wines.columns[0:-4]], columns=['Skewness', 'Kurtosis'])
display(wine_stats)

Unnamed: 0,Skewness,Kurtosis
fixed acidity,1.72,5.06
volatile acidity,1.5,2.83
citric acid,0.47,2.4
residual sugar,1.44,4.36
chlorides,5.4,50.9
free sulfur dioxide,1.22,7.91
total sulfur dioxide,-0.0,-0.37
density,0.5,6.61
pH,0.39,0.37
sulphates,1.8,8.65


In [227]:
#enc = OrdinalEncoder(categories=[['low', 'medium-low', 'medium', 'medium-high', 'high']])

X=wines.copy()

#wine_features[['ql_code']] = enc.fit_transform(wine_features[['ql_code']])

In [228]:
# I will keep all features for now. Will remove low correlation features later to see if there is an impact

y = X.quality_label
y = pd.get_dummies(y)

X.drop(['quality_label', 'quality'],axis=1,inplace=True)

X = pd.get_dummies(X)


In [229]:
y = np.array(y)
# Remove the labels from the features
# Saving feature names for later use
X_list = list(X.columns)
# Convert to numpy array
X = np.array(X)
print(X_list)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'wine_type_red', 'wine_type_white']


In [230]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [231]:
# Because of the skewedness of the model, with many outliers, it does not seem appropriate to use either normalisation or
# standardisation. I will be applying a tree based algorithm which is insensitive to the scale of features as it splits
# nodes based on single features. 

#Below I am testing to make sure the shapes are as predicted (column numbers match)

print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (5197, 13)
Training Labels Shape: (5197, 5)
Testing Features Shape: (1300, 13)
Testing Labels Shape: (1300, 5)


In [232]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);

In [234]:
#mae(y_test, y_train)

# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 0.18


In [248]:
# Calculate mean absolute percentage error (MAPE)
#mape = 100 * (y_test - errors)
# Calculate and display accuracy
#accuracy = 100 - np.mean(mape)
#print('Accuracy:', round(accuracy, 2), '%.')
mape(y_train, y_test)

TypeError: 'tuple' object is not callable

In [None]:

# Pull out one tree from the forest
tree = rf.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = X_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')