In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the full red-wine quality data ('winequality-red.csv', obtained from Kaggle).
dataset = pd.read_csv(filepath_or_buffer="winequality-red.csv")
# Preview the first ten rows.
dataset.iloc[:10]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [3]:
# Load a small subset of the original dataset; it is used for the
# step-by-step demonstration in the following cells.
small_dataset = pd.read_csv(filepath_or_buffer='winequality-red_small.csv')
small_dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.58,0.02,2.0,0.073,9,18,0.9968,3.36,0.57,9.5,7
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


In [4]:
#Demonstration of Random Forest Steps on small dataset

In [5]:
# Bootstrap example: draw 5 rows from the small dataset *with* replacement,
# so the same row can appear several times. No seed is set, so the drawn
# rows differ from run to run.
from sklearn.utils import resample

bootstrap_sample = resample(small_dataset, replace=True, n_samples=5)
bootstrap_sample

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
2,7.8,0.58,0.02,2.0,0.073,9,18,0.9968,3.36,0.57,9.5,7


In [6]:
# Remove rows that were drawn more than once, keeping the first occurrence.
# NOTE: this rebinds `bootstrap_sample`, so re-running the cell operates on
# the already de-duplicated frame.
bootstrap_sample = bootstrap_sample.drop_duplicates(keep='first')
bootstrap_sample

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.58,0.02,2.0,0.073,9,18,0.9968,3.36,0.57,9.5,7


In [7]:
# Rank the features by their chi2 score against the target.
# Approach taken from:
# https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# First 11 columns are the independent variables; the last column is the
# target (wine quality).
X = bootstrap_sample.iloc[:, :11]
y = bootstrap_sample.iloc[:, -1]

bestfeatures = SelectKBest(k=10, score_func=chi2)
fit = bestfeatures.fit(X, y)

# One frame with the column names, one with the matching chi2 scores.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

# Put names and scores side by side for easier reading.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

# Display only: highest score first (featureScores itself stays unsorted).
featureScores.sort_values(by='Score', ascending=False)

Unnamed: 0,Specs,Score
6,total sulfur dioxide,29.06207
5,free sulfur dioxide,7.529412
2,citric acid,1.044138
0,fixed acidity,0.8626866
1,volatile acidity,0.3103448
3,residual sugar,0.1323077
9,sulphates,0.01213115
8,pH,0.00691358
10,alcohol,0.006185567
4,chlorides,0.004707317


In [8]:
# Draw two distinct feature columns at random (the target 'quality' is excluded).
candidate_columns = bootstrap_sample.columns.drop('quality')
sample_columns = resample(candidate_columns, n_samples=2, replace=False)
sample_columns

Index(['volatile acidity', 'pH'], dtype='object')

In [9]:
# Of the randomly drawn columns, keep the one with the highest chi2 score.
# The result is a one-element Series holding the column name.
more_relevant_variable = (
    featureScores[featureScores['Specs'].isin(sample_columns)]
    .sort_values(by='Score', ascending=False)
    .head(1)['Specs']
)
more_relevant_variable

1    volatile acidity
Name: Specs, dtype: object

In [10]:
# Summary statistics of the selected feature within the bootstrap sample.
bootstrap_sample[more_relevant_variable.tolist()].describe()

Unnamed: 0,volatile acidity
count,3.0
mean,0.58
std,0.3
min,0.28
25%,0.43
50%,0.58
75%,0.73
max,0.88


In [11]:
# In this example the split criterion is simply the mean of the selected
# feature. The column-wise mean is a one-element Series, so take its only entry.
mean = bootstrap_sample[more_relevant_variable].mean().iat[0]
mean

0.5800000000000001

In [12]:
# Left child of the root: rows whose selected feature is <= the split value.
firstlevel_left_node = bootstrap_sample[
    bootstrap_sample[more_relevant_variable.iloc[0]].le(mean)
]
firstlevel_left_node

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
2,7.8,0.58,0.02,2.0,0.073,9,18,0.9968,3.36,0.57,9.5,7


In [13]:
# Right child of the root: rows whose selected feature is > the split value.
right_node = bootstrap_sample[
    bootstrap_sample[more_relevant_variable.iloc[0]].gt(mean)
]
right_node

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5


In [14]:
# Repeat the same procedure for the next node (splitting firstlevel_left_node).

# Draw two distinct feature columns at random (the target 'quality' is excluded).
sample_columns = resample(
    bootstrap_sample.columns.drop('quality'),
    n_samples=2,
    replace=False,
)
print("sample columns: ", sample_columns, '\n', sep='\n')

# Of the drawn columns, keep the one with the highest chi2 score.
more_relevant_variable = (
    featureScores[featureScores['Specs'].isin(sample_columns)]
    .sort_values(by='Score', ascending=False)
    .head(1)['Specs']
)
print("more_relevant_variable: ", more_relevant_variable, '\n', sep='\n')

# Split value: mean of the chosen feature within firstlevel_left_node.
mean = firstlevel_left_node[more_relevant_variable].mean().iat[0]
print("mean: ", mean, '\n', sep='\n')

sample columns: 
Index(['free sulfur dioxide', 'pH'], dtype='object')


more_relevant_variable: 
5    free sulfur dioxide
Name: Specs, dtype: object


mean: 
13.0




In [15]:
# Split firstlevel_left_node: the left child takes rows with feature <= split value.
left_node = firstlevel_left_node[
    firstlevel_left_node[more_relevant_variable.iloc[0]].le(mean)
]
left_node

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
2,7.8,0.58,0.02,2.0,0.073,9,18,0.9968,3.36,0.57,9.5,7


In [16]:
# The right child takes the remaining rows (feature > split value).
# NOTE: this rebinds `right_node`, replacing the first-level right node.
right_node = firstlevel_left_node[
    firstlevel_left_node[more_relevant_variable.iloc[0]].gt(mean)
]
right_node

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6


In [17]:
#Start with Random Forest Steps on the big dataset

In [18]:
# Separate the predictors from the target column 'quality' and expose both
# as plain NumPy arrays for scikit-learn.
features = dataset.drop(columns=['quality'])
X = features.values
Y = dataset['quality'].values

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 2% of the rows as the test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.02,
    random_state=0,
)

In [20]:
# Train a random forest classifier with 1000 trees on the training split.
# (No feature scaling happens here; the previous comment was misleading.)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Fixed seed so the fitted forest, and every score, confusion matrix and
# feature importance derived from it, is reproducible on Restart & Run All.
# 42 matches the seed used for the RandomForestRegressor further down.
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)

In [21]:
# Mean accuracy of the 1000-tree classifier on the held-out test split.
model.score(X=X_test, y=y_test)

0.71875

In [22]:
# Train a "forest" consisting of a single tree, for comparison with the
# 1000-tree model. (No feature scaling happens here; the previous comment
# was misleading.)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Fixed seed so the single tree, and therefore its test score, is
# reproducible on Restart & Run All.
small_model = RandomForestClassifier(n_estimators=1, random_state=42)
small_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1)

In [23]:
# Mean accuracy of the single-tree classifier on the held-out test split.
small_model.score(X=X_test, y=y_test)

0.625

In [24]:
# Export one decision tree of the classification forest as a PNG image.
# Tools needed for the visualization.
from sklearn.tree import export_graphviz
import pydot

feature_list = list(features.columns)

# Pull out one tree from the forest.
tree = model.estimators_[5]

# export_graphviz expects the class names in ascending order of the class
# labels. model.classes_ holds the sorted labels the forest was fitted on,
# whereas dataset.quality.unique() returns them in order of first appearance
# and would therefore attach the wrong quality label to the tree nodes.
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    class_names=model.classes_.astype(str),
    rounded=True,
    precision=1,
)

# Use the dot file to create a graph, then write the graph to a PNG file.
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree_classification.png')

In [25]:
# Predicted quality class for every row of the test split.
y_predicted = model.predict(X=X_test)

In [26]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

array([[ 0,  2,  1,  0],
       [ 0, 11,  2,  0],
       [ 0,  3,  9,  0],
       [ 0,  0,  1,  3]], dtype=int64)

In [27]:
# Feature importances of the classification forest, rounded to three
# decimals, sorted from most to least important and indexed by feature name.
importances = (
    pd.DataFrame(
        {
            'feature': features.columns,
            'importance': model.feature_importances_.round(3),
        }
    )
    .sort_values(by='importance', ascending=False)
    .set_index('feature')
)
importances.head(15)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
alcohol,0.147
sulphates,0.111
volatile acidity,0.103
total sulfur dioxide,0.103
density,0.092
chlorides,0.08
fixed acidity,0.076
pH,0.075
citric acid,0.074
residual sugar,0.071


In [28]:
# Regression variant: treat the quality score as a continuous target.
# The fixed seed makes the fitted forest reproducible.
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
)
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [29]:
# Use the forest's predict method on the test data.
predictions = rf.predict(X_test)

# Absolute error of every prediction.
errors = abs(predictions - y_test)

# Print the mean absolute error (MAE). The target is the wine quality score,
# so the error is expressed in quality points (the former unit 'degrees' did
# not match this dataset).
print('Mean Absolute Error:', round(np.mean(errors), 2), 'quality points.')

Mean Absolute Error: 0.39 degrees.


In [30]:
# Per-sample absolute percentage error relative to the true quality score.
mape = (errors / y_test) * 100

# "Accuracy" here is defined as 100 minus the mean absolute percentage
# error (MAPE); it is not a classification accuracy.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 92.33 %.


In [31]:
# Names of the predictor columns, used to label the exported tree.
feature_list = features.columns.tolist()

In [32]:
# Fit a small, shallow regression forest (10 trees, depth limited to 3).
rf_small = RandomForestRegressor(max_depth=3, n_estimators=10)
rf_small.fit(X_train, y_train)

# Tools needed for the visualization.
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest.
tree = rf_small.estimators_[5]

# Write that tree to a Graphviz dot file.
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Use the dot file to create a graph, then write the graph to a PNG file.
[graph] = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree_regression.png')