In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import pydot

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# Load the abalone table into a DataFrame. Later cells read the 'Sex' and
# 'Rings' columns by name, so the file is expected to carry a header row.
# NOTE(review): this is an absolute, machine-specific path.
train_data = pd.read_csv(
    "/Users/qinghongxu/Documents/MATH895/project/abalone.data.txt",
    sep=',',
)

In [7]:
# Replace the 'Sex' column in place with the integer codes produced by
# LabelEncoder, so every column handed to the models below is numeric.
# NOTE(review): the codes impose an arbitrary order on a nominal category;
# threshold splits and linear models will treat that order as meaningful.
train_data['Sex'] = LabelEncoder().fit_transform(train_data['Sex'].tolist())

In [8]:
# Positional train/test split of the full table: the first 3133 rows are used
# for training and rows 3133..4176 for testing. Every column is kept here,
# including the 'Rings' target.
Xtrain = train_data.iloc[0:3133]
Xtest = train_data.iloc[3133:4177]

# Preview the encoded table; this must be the last expression in the cell,
# otherwise the notebook discards the result and nothing is displayed.
train_data.head()

In [None]:
# Binarise the target: 'newRings' is 1 when Rings > 10 and 0 otherwise
# (two classes, not three).
# NOTE: this cell rebinds Xtrain/Xtest to feature-only frames. The decision
# tree cell further down reads Xtrain and treats the LAST column of each row
# as the label, so running this cell first changes what that cell trains on.
train_data['newRings'] = np.where(train_data['Rings'] > 10, 1, 0)

feature_frame = train_data.drop(['Rings', 'newRings'], axis=1)
Xtrain, Xtest = feature_frame[0:3133], feature_frame[3133:4177]
Ytrain, Ytest = train_data['newRings'][0:3133], train_data['newRings'][3133:4177]

In [None]:
# Baseline: fit a default logistic regression on the training features and
# report plain accuracy on the held-out rows.
logreg = LogisticRegression().fit(Xtrain, Ytrain)
Yprediction = logreg.predict(Xtest)
result_acc = accuracy_score(Ytest, Yprediction)
result_acc

In [9]:
# Compute the size-weighted Gini index of a candidate split
def gini_index(groups, classes):
    """Return the weighted Gini impurity of a partition of rows.

    groups:  iterable of row lists (any number of groups); the class label
             of a row is its last element, row[-1].
    classes: the class labels to account for.

    Each non-empty group contributes (1 - sum(p_c ** 2)) weighted by its
    share of all rows. Empty groups contribute nothing, so the result is
    0.0 when every group is empty.
    """
    gini = 0.0
    total_samples = float(sum(len(group) for group in groups))
    for group in groups:
        size = float(len(group))
        if size == 0:
            # skip empty groups: avoids dividing by zero below
            continue
        # Extract the labels once per group instead of once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_value in classes:
            proportion = labels.count(class_value) / size
            score = score + proportion * proportion
        gini = gini + (1 - score) * size / total_samples
    return gini

In [10]:
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

In [11]:
def get_split(dataset):
    """Exhaustively search for the split of dataset with the lowest Gini index.

    dataset: non-empty list of rows; the last element of a row is its class
             label, every other position is a candidate feature.

    Every distinct value that occurs in a feature column is tried as a
    threshold (rows with feature < threshold go left). Returns a dict with
    the winning 'index', 'value', the resulting 'groups' pair and its
    'gini'. If the rows have no feature columns the placeholders
    (999, 999, None, 999) are returned unchanged.

    Feature values must be hashable (numbers, strings).
    """
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0]) - 1):
        # A repeated value yields exactly the same partition, and only a
        # strictly lower score replaces the current best, so re-evaluating
        # duplicates can never change the outcome. Skipping them avoids one
        # full pass over the data per duplicate.
        tried = set()
        for row in dataset:
            value = row[index]
            if value in tried:
                continue
            tried.add(value)
            groups = test_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, value, gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups, 'gini': b_score}

In [12]:
def to_terminal(groups):
    """Return the most frequent class label among the given rows.

    groups: list of rows; the label of a row is its last element.

    Labels are tallied in a single pass. When several labels share the
    highest count, the one that appears first in `groups` wins, so the
    result does not depend on set/hash ordering. Raises ValueError when
    `groups` is empty.
    """
    counts = {}
    first_seen = []  # labels in order of first appearance, for tie-breaking
    for row in groups:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
            first_seen.append(label)
        counts[label] += 1
    # max() returns the first maximal element, i.e. the earliest-seen label.
    return max(first_seen, key=counts.__getitem__)

In [13]:
def split(node, max_depth, min_number, depth):
    """Grow the tree below `node` in place.

    node:       dict produced by get_split; its 'groups' entry is consumed
                and replaced by 'left' and 'right' children.
    max_depth:  once depth reaches this value both children become leaves.
    min_number: a side with at most this many rows becomes a leaf.
    depth:      depth of `node` itself.

    A child is either a class label (leaf) or another node dict.
    """
    left, right = node['groups']
    node.pop('groups')

    # Degenerate split: everything landed on one side, so both children
    # become the same leaf.
    if not (left and right):
        leaf = to_terminal(left + right)
        node['left'] = leaf
        node['right'] = leaf
        return

    # Depth budget exhausted: close both sides off as leaves.
    if depth >= max_depth:
        node['left'] = to_terminal(left)
        node['right'] = to_terminal(right)
        return

    # Otherwise handle each side independently, left first.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_number:
            node[side] = to_terminal(rows)
        else:
            node[side] = get_split(rows)
            split(node[side], max_depth, min_number, depth + 1)

In [14]:
def build_tree(train, max_depth, min_number):
    """Build a decision tree from the training rows and return its root node.

    The root split is chosen with get_split and then expanded recursively
    by split, starting at depth 1.
    """
    tree = get_split(train)
    split(tree, max_depth, min_number, depth=1)
    return tree

In [15]:
def print_tree(node, depth = 0):
    """Print the tree rooted at `node`, one line per node.

    Each level is indented by one extra space. Internal nodes are shown as
    [X<feature number> < <threshold>] with 1-based feature numbers; leaves
    are shown as [<label>].
    """
    indent = depth * ' '
    if not isinstance(node, dict):
        # leaf: just the predicted label
        print('%s[%s]' % (indent, node))
        return
    print('%s[X%d < %.3f]' % (indent, node['index'] + 1, node['value']))
    for branch in ('left', 'right'):
        print_tree(node[branch], depth + 1)

In [16]:
def predict(node, row):
    """Return the label the tree rooted at `node` assigns to `row`.

    At each node the row goes left when row[index] < value and right
    otherwise; the walk stops at the first child that is not a dict.
    """
    while True:
        side = 'left' if row[node['index']] < node['value'] else 'right'
        child = node[side]
        if not isinstance(child, dict):
            return child
        node = child

In [17]:
def prediction(tree,test_data):
    """Return the list of labels predicted by `tree`, one per row of test_data."""
    return [predict(tree, sample) for sample in test_data]

In [18]:
def accuracy(actual, predict):
    """Return the percentage (0-100) of predictions that match the actual labels.

    actual:  sequence of true labels, at least as long as `predict`.
    predict: sequence of predicted labels; its length is the denominator.

    Returns 0.0 when `predict` is empty instead of dividing by zero.
    """
    total = len(predict)
    if total == 0:
        # nothing to score
        return 0.0
    correct = 0
    for position in range(total):
        if actual[position] == predict[position]:
            correct += 1
    # float() keeps this a true division on Python 2 as well
    return float(correct) / total * 100

In [19]:
def decision_tree(train_data,test_data, max_depth, min_number):
    """Fit a tree on train_data, print it, and score it on both row sets.

    Both arguments are lists of rows whose last element is the class label.
    Returns (accuracy_train, accuracy_test, tree) with accuracies given as
    percentages.
    """
    tree = build_tree(train_data, max_depth, min_number)
    print_tree(tree)

    scores = []
    for rows in (train_data, test_data):
        expected = [row[-1] for row in rows]
        scores.append(accuracy(expected, prediction(tree, rows)))

    accuracy_train, accuracy_test = scores
    return accuracy_train, accuracy_test, tree

In [20]:
# Build the row lists for the hand-written tree straight from the DataFrame.
# The tree code reads the class label from the LAST element of every row, so
# 'Rings' is placed last explicitly. The binary helper column 'newRings' (if
# an earlier cell added it) is left out. Doing this here, rather than reusing
# Xtrain/Xtest, makes the cell independent of which cell last rebound them.
# `train_data` is deliberately not overwritten, so it stays a DataFrame and
# earlier cells remain re-runnable.
label_column = 'Rings'
feature_columns = [name for name in train_data.columns
                   if name not in (label_column, 'newRings')]
tree_rows = train_data[feature_columns + [label_column]].values.tolist()
tree_train_rows = tree_rows[0:3133]
tree_test_rows = tree_rows[3133:4177]

# Only a small sample (rows 1..49 of each part) is used, as in the recorded run.
# NOTE(review): the slice starts at 1, so row 0 is skipped; confirm that this
# is intended.
accuracy_train, accuracy_test, tree = decision_tree(tree_train_rows[1:50], tree_test_rows[1:50], 3, 1)
accuracy_train, accuracy_test

[X7 < 0.049]
 [X2 < 0.330]
  [X2 < 0.325]
   [5.0]
   [6.0]
  [X1 < 2.000]
   [7.0]
   [7.0]
 [X8 < 0.260]
  [X2 < 0.450]
   [10.0]
   [9.0]
  [X2 < 0.550]
   [16.0]
   [11.0]


In [30]:
# Small nested dict used as demo input for the graph drawing below:
# inner dicts are sub-categories, string values are ratings.
menu = {
    'dinner': {
        'chicken': 'good',
        'beef': 'average',
        'vegetarian': {
            'tofu': 'good',
            'salad': {
                'caeser': 'bad',
                'italian': 'average',
            },
        },
        'pork': 'bad',
    },
}
def draw(parent_name, child_name):
    """Add an edge from parent_name to child_name to the module-level `graph`."""
    graph.add_edge(pydot.Edge(parent_name, child_name))

def visit(node, parent=None):
    """Walk a nested dict and draw it as a graph via draw().

    node:   dict whose values are either nested dicts (sub-trees) or
            strings (leaf values).
    parent: key under which `node` was found; None for the root call.

    For every key an edge parent -> key is drawn, except at the root where
    there is no parent to connect to. A string value additionally gets an
    edge key -> 'key_value', so that equal values under different keys stay
    distinct nodes.
    """
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for key, value in node.items():
        # The root call has no parent, and the None node must not be drawn.
        if parent:
            draw(parent, key)
        if isinstance(value, dict):
            visit(value, key)
        else:
            # draw the label under a distinct name
            draw(key, key + '_' + value)

# Create the module-level graph that draw() adds edges to, populate it from
# the demo dict, and render it to a PNG file.
# NOTE: write_png needs the Graphviz `dot` executable on PATH; the recorded
# run of this cell failed with: OSError: "dot" not found in path.
graph = pydot.Dot(graph_type='graph')
visit(menu)
graph.write_png('example1_graph.png')

OSError: [Errno 2] "dot" not found in path.

AttributeError: 'Dot' object has no attribute 'info'