In [15]:
import numpy as np
import pprint
import pandas as pd
from scipy.spatial import distance
from collections import OrderedDict 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import  confusion_matrix,accuracy_score,classification_report
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [16]:
def gini_equation(class_0, class_1):  
    return (1 - (class_0/(class_0+class_1))**2 - (class_1/(class_0+class_1))**2)

In [17]:
def gini_ind(train_features, train_label, current_feature, current_index,indices):
    gini_value = 1.0
    left_0 , left_1, right_0,right_1 = 0,0,0,0
    
    for i in range(len(train_features)):
        if i not in indices:
            continue
        if train_features[current_index,current_feature]>train_features[i,current_feature] and train_label[i]==0:
            left_0 +=1
        elif train_features[current_index,current_feature]>train_features[i,current_feature] and train_label[i]==1:
            left_1 +=1
        elif train_features[current_index,current_feature]<=train_features[i,current_feature] and train_label[i]==0:
            right_0 +=1
        else:
            right_1 +=1
   
            
    D = left_0+left_1+right_0+right_1
    D1 = (left_0+left_1) / D
    D2 = (right_0+right_1)/D
    
    if ((left_0+left_1) != 0 and (right_0+right_1) != 0): 
        Gini_D1,Gini_D2 = gini_equation(left_0, left_1),gini_equation(right_0, right_1)
        gini_value = D1*Gini_D1 + D2*Gini_D2
    
    return gini_value

In [18]:
def divide_values(train_inp,train_lab,r,f,ind):
    left_node,left_class,right_node, right_class= [],[],[],[]
    
    for i in range(train_inp.shape[0]):
        if (train_inp[i,f]<train_inp[r,f] and i in ind):
            left_node.append(i)
            left_class.append(train_lab[i])
        elif i in ind:
            right_node.append(i)
            right_class.append(train_lab[i])
      
    return (left_node,left_class,right_node, right_class)

In [19]:
def best_split(train_input,train_output,index):
    gini = 1
    m,n = 0,0
    
    for i in range(train_input.shape[1]):
        for j in range(train_input.shape[0]):
            if j in index:
                g = gini_ind(train_input, train_output,i, j ,index) 
                if g <gini:
                    gini = g
                    m,n = j,i
    
    return m,n

In [20]:
def recursive_tree(input_list,class_list,feature_train,label_train,row_ind,tree_depth,ver):
    
    if len(input_list)==0 or len(class_list)>1:
        if len(input_list)==0:
            return label_train[row_ind]
        elif tree_depth==0:
            return Counter(class_list).most_common(1)[0][0] 
        else:
            return partition(feature_train, label_train, input_list, tree_depth-1)        
    else:
        return list(class_list)[0]
    
    return 

In [21]:
def partition(train_feature, train_label, indices, max_depth):
    
     
    if len(indices)==0:
        return
    indices = Counter(indices)
    row_index, feature_index = best_split(train_feature,train_label,indices)
    left,left_labels,right,right_labels = divide_values(train_feature,train_label,row_index,feature_index,indices)
             

    vertice = {}
    vertice['value'] = train_feature[row_index,feature_index]
    vertice['feature'] = feature_index
    
    left_values = set(left_labels)
    right_values = set(right_labels)
 
    vertice['left'] = recursive_tree(left,left_values,train_feature,train_label,row_index,max_depth, vertice)
    vertice['right'] = recursive_tree(right,right_values,train_feature,train_label,row_index,max_depth, vertice)
    

    return vertice
    

In [22]:
def find_test_value(test_input, head_tree):
    entry = test_input
    if head_tree==None:
        return -1
    f = head_tree['feature']
        
    
    if (entry[f]<head_tree['value'])  and (head_tree['left']==0 or head_tree['left']==1):
        return head_tree['left']
    elif (entry[f]>=head_tree['value'] ) and (head_tree['right']==0 or head_tree['right']==1):
        return head_tree['right']
    elif entry[f]<head_tree['value']:
        return find_test_value(test_input, head_tree['left'])
    else:
        return find_test_value(test_input, head_tree['right'])
                      
    return -1

In [23]:
def test_output(test_input, head, nums, best_var):
    test_label = np.zeros(test_input.shape[0])
    iteration = np.zeros(nums)
    
    for i in range(len(test_input)):     
        for t in range(nums):
            output = find_test_value(test_input[i],head[t])
            
            if output==1:
                iteration[t] = best_var[t]
            else:
                iteration[t] = -1*best_var[t]
        if sum(iteration)>0:
            test_label[i] = 1
        else:
            test_label[i] = 0
    return test_label
    

In [24]:
def bootstrap_data(train_x,train_y,we):
    we =  we/sum(we)
    #print(pro)
    #print(sum(pro))
    bootstrap = np.random.choice(np.array((range(train_x.shape[0]))), size = train_x.shape[0],p =  we)
    bootstrap_train_data = []
    bootstrap_train_label = []
    for index in bootstrap:
        bootstrap_train_data.append(train_x[index,:])
        bootstrap_train_label.append(train_y[index])
        
    return np.array(bootstrap_train_data),np.array( bootstrap_train_label)

In [25]:
def  boost(train_x, train_y, test_x, dep ,number_of_trees):
    
    tree_head = []
    best_boost  = []
    w= np.ones(X_train.shape[0])/X_train.shape[0]
    number = 0
    
    while number<number_of_trees:
        inp_data, inp_label = bootstrap_data(train_x,train_y,w)
        b = partition(inp_data,inp_label,range(inp_data.shape[0]),dep)
        train_pred = np.zeros(inp_data.shape[0])
        
        train_prediction = np.logical_xor(train_pred,train_y)
        error = sum(w*train_prediction)/sum(w)
        var = 1/2 *(np.log((1-error)/ error))
        different = np.exp(var) / sum(w)
        same = np.exp(-var)/sum(w)
        
        for index,value in enumerate(train_prediction):
            if value:
                w[index] *= different
            else:
                w[index] *= same
                
        if error<0.5:
            pp = pprint.PrettyPrinter(indent=4)
            pp.pprint(b) 
            tree_head.append(b)
            best_boost.append(var)
            number +=1
    test_class =  test_output(test_x,tree_head,number_of_trees,best_boost)
              
    
    return test_class
    

In [28]:
# # This is with only one data
data  = pd.read_csv('project3_dataset1.txt',sep='\t',header=None)
# lc = LabelEncoder()
# data.iloc[:,4]= lc.fit_transform(data.iloc[:,4])
labels = data.iloc[:,-1].values
features = data.iloc[:,:-1].values
d = 4
tree = 4
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2,random_state=21)
y_pred = boost(X_train, Y_train, X_test,d, tree)
print(y_pred)
print('accuracy' , accuracy_score(Y_test,y_pred))
p,r,f,s =  score(Y_test,y_pred)
print('precision',p.mean())
print('recall',r.mean())
print('f1measure',f.mean())


{   'feature': 22,
    'left': {   'feature': 27,
                'left': {   'feature': 23,
                            'left': 0,
                            'right': {   'feature': 1,
                                         'left': 0,
                                         'right': {   'feature': 1,
                                                      'left': 1,
                                                      'right': 0,
                                                      'value': 27.85},
                                         'value': 21.58},
                            'value': 758.2},
                'right': 1,
                'value': 0.1607},
    'right': {   'feature': 6,
                 'left': 0,
                 'right': {   'feature': 21,
                              'left': {   'feature': 6,
                                          'left': 0,
                                          'right': 1,
                                          'value': 0.1684},

In [27]:
# # This is for k fold cross validation

# from sklearn.model_selection import KFold
# data  = pd.read_csv('project3_dataset1.txt',sep='\t',header=None)
# # lc = LabelEncoder()
# # data.iloc[:,4]= lc.fit_transform(data.iloc[:,4])
# labels = data.iloc[:,-1].values
# features = data.iloc[:,:-1].values
# d = 4
# tree = 4

# kf = KFold(n_splits=10)
# precision = np.zeros(10)
# recall =  np.zeros(10)
# fscore =  np.zeros(10)
# accuracy =  np.zeros(10)
# i = 0
# for train_index, test_index in kf.split(features):
#     X_train,X_test = features[train_index],features[test_index]
#     y_train , y_test = labels[train_index], labels[test_index]
#     y_pred = boost(X_train, y_train, X_test,d, tree)
#     print(y_pred)
#     p,r,f,support = score(y_test,y_pred)
#     accuracy[i] = accuracy_score(y_test,y_pred)
#     precision[i] , recall[i], fscore[i] = p.mean(),r.mean(),f.mean()
#     i +=1
    
# print(accuracy)
# print('precision:',precision.mean())
# print('recall:',recall.mean())
# print('f1measure',fscore.mean())
# print('accuracy',accuracy.mean())
