## Challenge: If a tree falls in the forest...

In this script, I will create the best decision tree I can and compare it to a random forest.

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn import tree
from IPython.display import Image
import pydotplus
import graphviz
import time

In [2]:
# Load the wine CSV and preview its first rows.
# The separator is a regular expression (which is why engine='python' is
# required): it splits on either the three-character sequence ";" or on a
# bare semicolon.
wine_csv_path = r'C:\Users\jmfra\OneDrive\Documents\Thinkful Data Science Files\3.1.4 data\wine.csv'
data = pd.read_csv(wine_csv_path, sep='";"|;', engine='python')
data.head()

Unnamed: 0,"""fixed acidity""","""volatile acidity""","""citric acid""","""residual sugar""","""chlorides""","""free sulfur dioxide""","""total sulfur dioxide""","""density""","""pH""","""sulphates""","""alcohol""","""quality"""""""
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Quality is the clear dependent variable. The parsed headers still carry
# literal double quotes, which makes them awkward to reference, so map every
# raw header to a clean name in a single rename call.
column_renames = {
    '"fixed acidity"': 'Fixed_Acidity',
    '"volatile acidity"': 'Volatile_acidity',
    '"citric acid"': 'Citric_Acid',
    '"residual sugar"': 'Residual_Sugar',
    '"chlorides"': 'Chlorides',
    '"free sulfur dioxide"': 'Free_Sulfur_Dioxide',
    '"total sulfur dioxide"': 'Total_Sulfur_Dioxide',
    '"density"': 'Density',
    '"pH"': 'pH',
    '"sulphates"': 'Sulphates',
    '"alcohol"': 'Alcohol',
    '"quality"""': 'Quality',
}
data = data.rename(columns=column_renames)
data.head()

Unnamed: 0,Fixed_Acidity,Volatile_acidity,Citric_Acid,Residual_Sugar,Chlorides,Free_Sulfur_Dioxide,Total_Sulfur_Dioxide,Density,pH,Sulphates,Alcohol,Quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Turn the 3-8 quality score into a binary target. The author found both the
# decision tree and the random forest to be much more accurate on a binary
# label. The weighted mean sits between 5 and 6, so a score of 6 or above is
# labelled above average (1) and 5 or below is labelled below average (0).
Qdata = pd.DataFrame({'Quality': np.where(data['Quality'] >= 6, 1, 0)})

# Remove the target from the feature frame. `columns=` is used instead of the
# positional axis argument (`drop('Quality', 1)`), which newer pandas releases
# reject.
# NOTE: this cell consumes data['Quality'], so re-running it on its own raises
# KeyError; re-run from the data-loading cell instead.
data = data.drop(columns='Quality')

In [5]:
# Baseline model: a single entropy-criterion decision tree, scored as the mean
# of 10-fold cross-validation. After some iteration, roughly .65 is the best
# we can do.
start_time = time.perf_counter()  # monotonic clock intended for measuring durations
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=300,
)
# Pass the target as a 1-D array (as the random-forest cell does) rather than
# as a one-column DataFrame, so both models are scored on identical inputs.
x = cross_val_score(decision_tree, data, Qdata.values.ravel(), cv=10).mean()
print(x)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

0.646694548615
--- 0.150404691696167 seconds ---


In [6]:
# Comparison model: a random forest scored with the same 10-fold
# cross-validation as the decision tree.
start_time = time.perf_counter()  # monotonic clock intended for measuring durations
# Seed the forest (same seed as the decision tree) so the reported score is
# reproducible across runs; an unseeded forest gives a different score on
# every execution.
rfc = ensemble.RandomForestClassifier(random_state=300)

y = cross_val_score(rfc, data, Qdata.values.ravel(), cv=10).mean()
print(y)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

0.713099315403
--- 0.3304147720336914 seconds ---


In [None]:
#As you can see, the random forest predicts more accurately than our best decision tree. However, the increase in accuracy is
#only about .07, and it more than doubles the time it takes to receive a score. While this specific dataset only has a few
#hundred rows, the extra runtime could become a real problem with hundreds of thousands of rows.