In [170]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from sklearn import tree
import pydot
from sklearn.metrics import accuracy_score

In [171]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [172]:
# Load the comma-separated abalone file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists. Later cells index the 'Sex' and 'Rings'
# columns by name, so the file is presumably expected to carry a header row
# with those names - confirm.
train_data = pd.read_csv(
    "/Users/qinghongxu/Documents/MATH895/project/abalone.data.txt",
    sep=',',
)

In [173]:
# Overwrite the 'Sex' column with the output of a freshly fitted LabelEncoder
# (the column is first converted to a plain Python list).
sex_labels = train_data['Sex'].tolist()
train_data['Sex'] = LabelEncoder().fit_transform(sex_labels)

In [174]:
# A bare expression is only rendered when it is the last line of a cell, so
# this head() call does not display anything here.
train_data.head()
# Positional split: rows 0..3132 for training, rows 3133..4176 for testing.
# Every column is kept, and each row becomes a plain list of values.
Xtrain = train_data.iloc[0:3133].values.tolist()
Xtest = train_data.iloc[3133:4177].values.tolist()

In [175]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: sequence of groups, each a list of rows; the class label is
            read from the last element of every row.
        classes: class labels whose proportions are measured in each group.

    Returns:
        The sum, over non-empty groups, of
        (1 - sum_c p_c**2) * len(group) / total_rows, as a float.
        Empty groups contribute nothing.
    """
    total_rows = float(sum(len(group) for group in groups))
    weighted_impurity = 0.0
    for group in groups:
        group_size = float(len(group))
        if group_size == 0:
            # Nothing to measure, and it would divide by zero below.
            continue
        labels = [row[-1] for row in group]
        sum_of_squares = 0.0
        for label in classes:
            share = labels.count(label) / group_size
            sum_of_squares += share * share
        weighted_impurity += (1 - sum_of_squares) * group_size / total_rows
    return weighted_impurity

In [176]:
def entropy_index(groups, classes):
    """Return the size-weighted entropy (natural log) of a candidate split.

    Args:
        groups: sequence of groups, each a list of rows; the class label is
            read from the last element of every row.
        classes: class labels whose proportions are measured in each group.

    Returns:
        The sum, over non-empty groups, of
        -(sum_c p_c * ln(p_c)) * len(group) / total_rows.
        Empty groups contribute nothing, and a group containing a single
        class contributes 0.
    """
    all_samples = float(sum(len(group) for group in groups))
    entropy = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            # Nothing to measure, and it would divide by zero below.
            continue
        labels = [row[-1] for row in group]
        group_sum = 0.0
        for class_index in classes:
            p = labels.count(class_index) / size
            # A class absent from this group has p == 0. Evaluating
            # 0 * log(0) gives nan, which would poison the whole score, so
            # apply the usual convention 0 * log(0) = 0 and skip the term.
            if p > 0:
                group_sum += p * np.log(p)
        entropy -= group_sum * size / all_samples
    return entropy

In [177]:
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right            

In [178]:
def get_split(dataset, measurement):
    """Exhaustively search for the lowest-scoring binary split of `dataset`.

    Every value found in every column except the last is tried as a
    threshold (the last column is read as the class label).

    Args:
        dataset: non-empty list of rows.
        measurement: 1 scores candidates with gini_index; any other value
            scores them with entropy_index.

    Returns:
        A dict with keys 'gini' (the best score, stored under this key for
        either measure), 'index' (column), 'value' (threshold) and 'groups'
        (the (left, right) partition). Only a strictly lower score replaces
        the current best, so ties keep the first split found. If no
        candidate scores below the initial 999, 'groups' stays None.
    """
    score_fn = gini_index if measurement == 1 else entropy_index
    labels = list(set(row[-1] for row in dataset))
    best = {'gini': 999, 'index': 999, 'value': 999, 'groups': None}
    feature_count = len(dataset[0]) - 1
    for column in range(feature_count):
        for candidate in dataset:
            threshold = candidate[column]
            partition = test_split(column, threshold, dataset)
            score = score_fn(partition, labels)
            if score < best['gini']:
                best = {'gini': score, 'index': column,
                        'value': threshold, 'groups': partition}
    return best

In [179]:
def terminal_outcome(node):
    """Return the most frequent class label among a group of rows.

    Args:
        node: non-empty list of rows; the label is each row's last element.

    Returns:
        The label with the highest count. Raises ValueError (from max) when
        `node` is empty.
    """
    labels = [sample[-1] for sample in node]
    distinct_labels = set(labels)
    return max(distinct_labels, key=labels.count)

In [180]:
def split(max_depth, min_size, node, depth, measurement):
    """Recursively grow the tree below `node`, mutating it in place.

    Args:
        max_depth: once `depth` reaches this value both children become leaves.
        min_size: a child with at most this many rows becomes a leaf.
        node: dict produced by get_split; its 'groups' entry is consumed and
            replaced by 'left' and 'right' entries.
        depth: depth of `node` (build_tree starts at 1).
        measurement: impurity selector passed through to get_split.

    Returns:
        None. Each of node['left'] / node['right'] ends up as either a class
        label (leaf) or a nested split dict.
    """
    left, right = node['groups']
    node.pop('groups')

    # One side is empty: the split separated nothing, so both children share
    # the majority label of all rows.
    if not (left and right):
        majority = terminal_outcome(right + left)
        node['left'] = node['right'] = majority
        return

    # Depth budget exhausted: close both sides as leaves.
    if depth >= max_depth:
        node['left'], node['right'] = terminal_outcome(left), terminal_outcome(right)
        return

    # Left is handled completely (including its subtree) before right.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = terminal_outcome(rows)
        else:
            node[side] = get_split(rows, measurement)
            split(max_depth, min_size, node[side], depth + 1, measurement)

In [181]:
def build_tree(max_depth, min_size, measurement, dataset):
    """Build a decision tree from `dataset` and return its root node.

    The root split comes from get_split; split() then grows the tree in
    place starting at depth 1.

    Args:
        max_depth: depth limit forwarded to split().
        min_size: minimum group size forwarded to split().
        measurement: impurity selector forwarded to get_split() and split().
        dataset: list of rows used to grow the tree.

    Returns:
        The root node dict.
    """
    tree_root = get_split(dataset, measurement)
    split(max_depth, min_size, tree_root, depth=1, measurement=measurement)
    return tree_root

In [182]:
def print_tree(node, depth = 0):
    """Print the tree rooted at `node`, one line per node, to stdout.

    Internal nodes (dicts) are printed as 'X<index> < <value>' followed by
    their left and then right subtree; anything else is printed as an
    integer leaf. Each line is indented by one space per depth level.

    Args:
        node: a split dict with 'index', 'value', 'left', 'right', or a leaf.
        depth: current depth, used only for indentation.
    """
    indent = depth * ' '
    if not isinstance(node, dict):
        print('%s%d' % (indent, node))
        return
    print('%sX%d < %.3f' % (indent, node['index'], node['value']))
    for side in ('left', 'right'):
        print_tree(node[side], depth + 1)

In [183]:
def predict(node, row):
    """Walk the tree from `node` and return the leaf value reached by `row`.

    At each split dict the row goes left when row[index] < value and right
    otherwise; the walk stops at the first child that is not a dict.

    Args:
        node: a split dict with 'index', 'value', 'left' and 'right'.
        row: indexable sequence of feature values.

    Returns:
        The leaf value stored at the end of the path.
    """
    current = node
    while True:
        side = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[side]
        if not isinstance(child, dict):
            return child
        current = child

In [184]:
def prediction(tree,test_data):
    """Return the predicted leaf value for every row, in input order.

    Args:
        tree: root split dict, as accepted by predict().
        test_data: iterable of rows.

    Returns:
        A list holding predict(tree, row) for each row.
    """
    return [predict(tree, sample) for sample in test_data]

In [185]:
def accuracy(actual, predict):
    """Return the percentage of positions where the two sequences agree.

    Args:
        actual: true labels, indexable by position.
        predict: predicted labels; its length decides how many positions are
            compared.

    Returns:
        A float between 0 and 100. Raises ZeroDivisionError when `predict`
        is empty.
    """
    total = len(predict)
    matching_positions = [i for i in range(total) if actual[i] == predict[i]]
    return float(len(matching_positions)) / total * 100

In [186]:
def decision_tree(train_data,test_data, max_depth, min_size,measurement):
    """Fit a tree on the training rows and score it on both row sets.

    The fitted tree is also printed via print_tree().

    Args:
        train_data: list of rows used to grow the tree; the class label is
            the last element of each row.
        test_data: list of held-out rows in the same layout.
        max_depth: depth limit forwarded to build_tree().
        min_size: minimum group size forwarded to build_tree().
        measurement: impurity selector forwarded to build_tree().

    Returns:
        A tuple (accuracy_train, accuracy_test, tree): the two accuracies
        are percentages, and `tree` is the root node dict.
    """
    # Grow the tree from the training rows only. Fitting on test_data would
    # make the "test" accuracy a measure of memorisation rather than of
    # generalisation.
    tree = build_tree(max_depth, min_size, measurement, train_data)
    print_tree(tree)

    predictions_train = prediction(tree, train_data)
    predictions_test = prediction(tree, test_data)

    actual_train = [row[-1] for row in train_data]
    actual_test = [row[-1] for row in test_data]

    accuracy_train = accuracy(actual_train, predictions_train)
    accuracy_test = accuracy(actual_test, predictions_test)
    return accuracy_train, accuracy_test, tree

In [169]:
# scikit-learn baseline: a depth-3 decision tree classifier.
# NOTE(review): Ytrain and Ytest are only assigned in the cell that follows
# this one in the file (execution count 168), and that cell also rebinds
# Xtrain / Xtest - so this cell depends on that one having been run first.
model = tree.DecisionTreeClassifier(min_samples_split=2, max_depth=3).fit(Xtrain, Ytrain)
y_predict = model.predict(Xtest)
# Last expression of the cell, so the score is displayed as the cell output.
accuracy_score(y_true=Ytest, y_pred=y_predict)

0.14

In [168]:
# Feature / label frames for the scikit-learn model: every column except
# 'Rings' is a feature, and 'Rings' is the target.
# This rebinds Xtrain / Xtest, replacing any earlier objects with those names.
# NOTE(review): the training slice starts at position 1, so row 0 is in
# neither split - confirm that is intentional.
Xtrain = train_data.drop(columns=['Rings']).iloc[1:50]
Xtest = train_data.drop(columns=['Rings']).iloc[50:100]
Ytrain = train_data['Rings'].iloc[1:50]
Ytest = train_data['Rings'].iloc[50:100]