In [2]:
import sklearn

# In this lesson we will learn how to write a simple machine learning algorithm called a decision tree.
# This algorithm will predict a persons gender based on their height and weight. 
# The first thing we do is import scikitlearn (above). 
# We then import the decision tree from sckikitlearn and train it with sample data. 
# 'Features' are characteristics of datapoints, while 'labels' are an exclusive name assigned to a datapoint. 
# By assigning the labels to the features using the code below we train the decision tree to predict a label
# based on only a set of features. The man, for example, who is 170 lbs and 1.72 meters tall is not included in the 
# training data but is nevertheless correctly identified as male. 

In [17]:
from sklearn import tree
features = [[130, 1.75], [150, 1.7], [127, 1.57], [160, 1.77], [97, 1.57], [127, 1.6]]
labels = ['male', 'male', 'female', 'male', 'female', 'female']
clf = tree.DecisionTreeClassifier()
clf = clf.fit(features, labels)
clf.predict([[170, 1.72]])
from scikitlearn.metrics import 

array(['male'], dtype='<U6')

In [None]:
# Decision trees operate by asking yes or no questions based on your features. Therefore, the features you pick to 
# train your tree are very important in making an accurate prediction. If your classification is binary, 
# (into two groups) your features should be good distinguishers between those two things. Including features that 
# distinguish the two things can be detrimental to your tree's accuracy because it then relies on useless data, at
# least in part, to make it's decision. 
#
#

In [24]:
# Now let's try something similar with more complicated data. 
# sklearn comes with several example datasets built in. We can load one using the code below. The .target function
# allows us to see how many kinds of things we are dealing with. In this case, each of the three species 
# of flower has its own number. 

from sklearn import datasets
iris = datasets.load_iris() 
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [29]:
# Lets think of our decision tree as a function for a moment, and set the data itself equal to x (the input), and
# the label (output) to y. So, what goes into our function is a group of features and what comes out is the kind of 
# of flower, represented by a 0, 1, or 2. 

x = iris.data
y = iris.target

In [51]:
# For the purpose of training decision trees there is a great utility within scikitlearn called train_test_split.
# This utility splits data into training data and testing data. Training data is used to teach the computer to 
# recognize patterns. Testing data is used to determine how well the computer has learned. In the above example, the 
# original heights and weights assigned to the features variable are the training data. The new height and weight
# (170 lbs, 1.72 m) that is introduced to the algorithm is the testing data. The train_test_split utility divides data
# into training and testing data in whatever ratio we specify. 

x = iris.data
y = iris.target

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .5) 

from sklearn import tree

redwood = tree.DecisionTreeClassifier()
redwood.fit(x_train, y_train)
xt = redwood.predict(x_test)

# The code below runs a test that checks the test data against the training data to how accurate our algorithm is. 
# This particular algorithm is 96% accurate. Because chance is a still a factor in the accuracy of the algorithm, 
# we sometimes get different results from our test, as shown in the cell below. 

from sklearn.metrics import accuracy_score 
accuracy_score(y_test, xt) 



0.96

In [54]:
accuracy_score(y_test, xt) 

0.9333333333333333

In [44]:
# You can see here exactly where the algorithm went astray. 

In [49]:
xt

array([0, 2, 0, 1, 1, 0, 1, 2, 0, 2, 0, 1, 2, 0, 2, 1, 0, 1, 2, 1, 1, 1,
       1, 0, 1, 0, 2, 1, 1, 2, 1, 2, 1, 1, 0, 0, 1, 0, 2, 2, 2, 2, 2, 1,
       0, 0, 1, 2, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 2, 0, 0, 0, 0, 2, 2, 0,
       0, 2, 2, 1, 0, 2, 0, 0, 2])

In [50]:
y_test

array([0, 2, 0, 1, 1, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 1, 0, 1, 2, 1, 2, 2,
       1, 0, 1, 0, 2, 1, 1, 2, 1, 2, 2, 1, 0, 0, 1, 0, 2, 1, 2, 2, 2, 1,
       0, 0, 1, 2, 0, 2, 1, 1, 1, 0, 1, 0, 1, 1, 2, 0, 0, 0, 0, 2, 2, 0,
       0, 2, 2, 1, 0, 2, 0, 0, 2])