# Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into

- Don't forget to shuffle the input before assigning to the splits

- You can use the fit
- Test the results with the sklearn cross_val_score
- In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation

define the cross_validation function(model, target, attributes):

  -take the length of attributes array
  
  -divide length of attribute array by 5
  
  -shuffle the data based on the index
  
  -split attribute array into five subsets
  
  -for each subset of the attribute array:
  
    *assign four portions of that subset to training
    *fit model to training 
    *assign one portion of that subset to test
    *predict on test data
    *score model = compare predicted y to actual y
    

In [92]:
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
from sklearn import metrics
import matplotlib.pyplot as plt
from random import shuffle
import numpy as np


In [93]:
iris = datasets.load_iris()

In [94]:
iris

 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
      

In [95]:
iris.keys()

dict_keys(['DESCR', 'target_names', 'target', 'feature_names', 'data'])

In [96]:
#extract our x and y
x = iris.data[:,2:] 
y = iris.target

In [97]:
#we merge them
my_data = np.column_stack([x,y])    

In [98]:
my_data

array([[ 1.4,  0.2,  0. ],
       [ 1.4,  0.2,  0. ],
       [ 1.3,  0.2,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 1.4,  0.2,  0. ],
       [ 1.7,  0.4,  0. ],
       [ 1.4,  0.3,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 1.4,  0.2,  0. ],
       [ 1.5,  0.1,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 1.6,  0.2,  0. ],
       [ 1.4,  0.1,  0. ],
       [ 1.1,  0.1,  0. ],
       [ 1.2,  0.2,  0. ],
       [ 1.5,  0.4,  0. ],
       [ 1.3,  0.4,  0. ],
       [ 1.4,  0.3,  0. ],
       [ 1.7,  0.3,  0. ],
       [ 1.5,  0.3,  0. ],
       [ 1.7,  0.2,  0. ],
       [ 1.5,  0.4,  0. ],
       [ 1. ,  0.2,  0. ],
       [ 1.7,  0.5,  0. ],
       [ 1.9,  0.2,  0. ],
       [ 1.6,  0.2,  0. ],
       [ 1.6,  0.4,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 1.4,  0.2,  0. ],
       [ 1.6,  0.2,  0. ],
       [ 1.6,  0.2,  0. ],
       [ 1.5,  0.4,  0. ],
       [ 1.5,  0.1,  0. ],
       [ 1.4,  0.2,  0. ],
       [ 1.5,  0.1,  0. ],
       [ 1.2,  0.2,  0. ],
       [ 1.3,  0.2,  0. ],
 

In [99]:
#we shuffle the dataset
np.random.shuffle(my_data)

In [100]:
#let's check if the data looks different now..
my_data

array([[ 1.5,  0.4,  0. ],
       [ 4.9,  2. ,  2. ],
       [ 4.5,  1.5,  1. ],
       [ 1.4,  0.2,  0. ],
       [ 5.9,  2.1,  2. ],
       [ 5.8,  1.6,  2. ],
       [ 1.2,  0.2,  0. ],
       [ 3.9,  1.1,  1. ],
       [ 3.6,  1.3,  1. ],
       [ 4.1,  1.3,  1. ],
       [ 1.6,  0.2,  0. ],
       [ 6.7,  2. ,  2. ],
       [ 5.2,  2.3,  2. ],
       [ 5.3,  2.3,  2. ],
       [ 3. ,  1.1,  1. ],
       [ 5. ,  1.9,  2. ],
       [ 1.3,  0.4,  0. ],
       [ 5.6,  2.4,  2. ],
       [ 5.5,  1.8,  2. ],
       [ 4.3,  1.3,  1. ],
       [ 5.4,  2.3,  2. ],
       [ 1.5,  0.1,  0. ],
       [ 1.9,  0.4,  0. ],
       [ 6. ,  2.5,  2. ],
       [ 1.4,  0.2,  0. ],
       [ 5.6,  2.4,  2. ],
       [ 1.5,  0.1,  0. ],
       [ 3.5,  1. ,  1. ],
       [ 4.3,  1.3,  1. ],
       [ 1.3,  0.3,  0. ],
       [ 1.6,  0.2,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 4.2,  1.3,  1. ],
       [ 1. ,  0.2,  0. ],
       [ 5.6,  2.1,  2. ],
       [ 1.4,  0.3,  0. ],
       [ 4.7,  1.4,  1. ],
 

In [101]:
#this is my attributes array
my_data[:,:2]

array([[ 1.5,  0.4],
       [ 4.9,  2. ],
       [ 4.5,  1.5],
       [ 1.4,  0.2],
       [ 5.9,  2.1],
       [ 5.8,  1.6],
       [ 1.2,  0.2],
       [ 3.9,  1.1],
       [ 3.6,  1.3],
       [ 4.1,  1.3],
       [ 1.6,  0.2],
       [ 6.7,  2. ],
       [ 5.2,  2.3],
       [ 5.3,  2.3],
       [ 3. ,  1.1],
       [ 5. ,  1.9],
       [ 1.3,  0.4],
       [ 5.6,  2.4],
       [ 5.5,  1.8],
       [ 4.3,  1.3],
       [ 5.4,  2.3],
       [ 1.5,  0.1],
       [ 1.9,  0.4],
       [ 6. ,  2.5],
       [ 1.4,  0.2],
       [ 5.6,  2.4],
       [ 1.5,  0.1],
       [ 3.5,  1. ],
       [ 4.3,  1.3],
       [ 1.3,  0.3],
       [ 1.6,  0.2],
       [ 1.5,  0.2],
       [ 4.2,  1.3],
       [ 1. ,  0.2],
       [ 5.6,  2.1],
       [ 1.4,  0.3],
       [ 4.7,  1.4],
       [ 1.6,  0.2],
       [ 3.9,  1.4],
       [ 1.7,  0.5],
       [ 6.6,  2.1],
       [ 4.6,  1.5],
       [ 4.2,  1.3],
       [ 6.1,  2.5],
       [ 4.8,  1.8],
       [ 4. ,  1.3],
       [ 5. ,  2. ],
       [ 4. ,

In [102]:
my_data

array([[ 1.5,  0.4,  0. ],
       [ 4.9,  2. ,  2. ],
       [ 4.5,  1.5,  1. ],
       [ 1.4,  0.2,  0. ],
       [ 5.9,  2.1,  2. ],
       [ 5.8,  1.6,  2. ],
       [ 1.2,  0.2,  0. ],
       [ 3.9,  1.1,  1. ],
       [ 3.6,  1.3,  1. ],
       [ 4.1,  1.3,  1. ],
       [ 1.6,  0.2,  0. ],
       [ 6.7,  2. ,  2. ],
       [ 5.2,  2.3,  2. ],
       [ 5.3,  2.3,  2. ],
       [ 3. ,  1.1,  1. ],
       [ 5. ,  1.9,  2. ],
       [ 1.3,  0.4,  0. ],
       [ 5.6,  2.4,  2. ],
       [ 5.5,  1.8,  2. ],
       [ 4.3,  1.3,  1. ],
       [ 5.4,  2.3,  2. ],
       [ 1.5,  0.1,  0. ],
       [ 1.9,  0.4,  0. ],
       [ 6. ,  2.5,  2. ],
       [ 1.4,  0.2,  0. ],
       [ 5.6,  2.4,  2. ],
       [ 1.5,  0.1,  0. ],
       [ 3.5,  1. ,  1. ],
       [ 4.3,  1.3,  1. ],
       [ 1.3,  0.3,  0. ],
       [ 1.6,  0.2,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 4.2,  1.3,  1. ],
       [ 1. ,  0.2,  0. ],
       [ 5.6,  2.1,  2. ],
       [ 1.4,  0.3,  0. ],
       [ 4.7,  1.4,  1. ],
 

In [103]:
#we figure out how long is our array
array_length= int(len(my_data) / 5)
n= array_length

In [104]:
#then we can divide it in 5 subsets, we found this online, still note sure what each line of code does.
def chunks(l, n):
    n = max(1, n)
    return [l[i:i + n] for i in range(0, len(l), n)]

In [105]:
#now im checking to see if I have 5 subsets by calling the first one.
subsets= chunks(my_data,n)
subsets[1]

array([[ 1.6,  0.2,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 4.2,  1.3,  1. ],
       [ 1. ,  0.2,  0. ],
       [ 5.6,  2.1,  2. ],
       [ 1.4,  0.3,  0. ],
       [ 4.7,  1.4,  1. ],
       [ 1.6,  0.2,  0. ],
       [ 3.9,  1.4,  1. ],
       [ 1.7,  0.5,  0. ],
       [ 6.6,  2.1,  2. ],
       [ 4.6,  1.5,  1. ],
       [ 4.2,  1.3,  1. ],
       [ 6.1,  2.5,  2. ],
       [ 4.8,  1.8,  1. ],
       [ 4. ,  1.3,  1. ],
       [ 5. ,  2. ,  2. ],
       [ 4. ,  1.3,  1. ],
       [ 4.1,  1. ,  1. ],
       [ 6. ,  1.8,  2. ],
       [ 1.5,  0.2,  0. ],
       [ 1.4,  0.1,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 4.1,  1.3,  1. ],
       [ 4.7,  1.5,  1. ],
       [ 1.5,  0.4,  0. ],
       [ 1.5,  0.1,  0. ],
       [ 1.6,  0.2,  0. ],
       [ 1.5,  0.2,  0. ],
       [ 1.7,  0.2,  0. ]])

In [106]:
subsets[0][:,2:]

array([[ 0.],
       [ 2.],
       [ 1.],
       [ 0.],
       [ 2.],
       [ 2.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 2.],
       [ 2.],
       [ 2.],
       [ 1.],
       [ 2.],
       [ 0.],
       [ 2.],
       [ 2.],
       [ 1.],
       [ 2.],
       [ 0.],
       [ 0.],
       [ 2.],
       [ 0.],
       [ 2.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.]])

In [107]:
dt = tree.DecisionTreeClassifier()

In [108]:
# for the following part I must give credit to Barney! He was the one who figured it out.

In [111]:
average_list = []
for item in subsets:
    x_chunks= item[:,:2]
    y_chunks= item[:,2:]
    for x, y in zip(x_chunks, y_chunks):
        x_test = list(x_chunks).pop(0)
        x_train = x_chunks[:]   #sum(x_chunks, [])
        list(x_chunks).append(x_test)
        
        y_test = list(y_chunks).pop(0)
        y_train = y_chunks[:]#sum(y_chunks, [])
        list(y_chunks).append(y_test)
        #print("trainers: ", x_train, y_train)
        dt = dt.fit(x_train,y_train)
        y_pred= dt.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
average_list.append(accuracy)   
print(average_list)

[1.0]


