In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same helpers live in sklearn.model_selection, which this notebook
# already relies on for KFold.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn import metrics

# Short alias used when scoring the final predictions.
accuracy = metrics.accuracy_score



In [2]:
# The raw file has no header row, so the column names are supplied at read time.
column_names = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width", "Class"]
data = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    header=None,
    names=column_names,
)
data.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Apply Label Encoder

In [3]:
# Replace the species names in "Class" with integer codes. The fitted encoder
# is kept in `le` so the codes can be mapped back to names if needed.
le = preprocessing.LabelEncoder()
data["Class"] = le.fit_transform(data["Class"])
data.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Split Features And Predicted Class

In [4]:
# Features are the first four columns (the measurements); the label is "Class".
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0, so the feature
# columns are selected by position with .iloc instead.
X = data.iloc[:, 0:4]
target = data["Class"]

### Note: Will Focus Only On 100 Training Samples For KFolds

In [5]:
# Fraction of rows to hold out so that 100 rows remain for training; displayed
# here for reference (the same expression is passed as test_size below).
round(1-(100/len(data)), 4)

0.3333

In [6]:
# Hold out everything beyond 100 training rows. random_state=None means the
# partition is drawn afresh on every run.
holdout_fraction = round(1 - (100 / len(data)), 4)
train, test, targ_train, targ_test = train_test_split(
    X,
    target,
    test_size=holdout_fraction,
    random_state=None,
)

In [7]:
# Report the dimensions of each piece of the split.
for caption, part in (
    ("Train:       ", train),
    ("Train Class: ", targ_train),
    ("Test:        ", test),
    ("Test Class:  ", targ_test),
):
    print(caption, part.shape)

Train:        (100, 4)
Train Class:  (100,)
Test:         (50, 4)
Test Class:   (50,)


In [8]:
"""Recombine Training Set With Class"""
# Attach the labels back onto the training features; join() aligns the two on
# their shared row index.
train = train.join(targ_train)

In [9]:
# Move each frame's shuffled row labels into an "index" column, leaving a fresh
# 0..n-1 index behind.
train, test = train.reset_index(), test.reset_index()

In [10]:
# Discard the "index" column that reset_index() added to each frame.
for frame in (train, test):
    del frame["index"]

### Part 1: 

Partition Training Data into 5 Folds:

- Create train_meta with same rows and IDs as training data
- Create Empty Columns: M1 and M2 (NaNs)

- Create test_meta variable with same rows and IDs as test dataset 
- Create Empty Columns: M1 and M2 (NaNs)

In [11]:
"""Create Train and Test Meta"""
# train_meta starts as a copy of the training frame plus two NaN-filled
# placeholder columns that will receive the base-model predictions.
train_meta = train.copy()
for placeholder in ("M1", "M2"):
    train_meta[placeholder] = np.nan
print(train.shape)
train_meta.head()

(100, 5)


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class,M1,M2
0,5.0,3.5,1.3,0.3,0,,
1,4.9,3.1,1.5,0.1,0,,
2,5.3,3.7,1.5,0.2,0,,
3,6.4,3.2,5.3,2.3,2,,
4,6.1,2.6,5.6,1.4,2,,


In [12]:
# test_meta starts as a plain copy of the test features; no M1/M2 placeholder
# columns are created here.
test_meta = test.copy()
print(test.shape)
test_meta.head()

(50, 4)


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
0,7.7,3.0,6.1,2.3
1,5.5,2.5,4.0,1.3
2,5.4,3.4,1.7,0.2
3,7.7,2.8,6.7,2.0
4,4.4,2.9,1.4,0.2


### Part 2

For each train fold: combine the other 4 into a training set and predict on each fold i 
- train fold 1 = 2,3,4,5 (Predict on fold 1)
- train fold 2 = 1,3,4,5 (Predict on Fold 2)
- train fold 3 = 1,2,4,5 (Predict on Fold 3)
- train fold 4 = 1,2,3,5 (Predict on Fold 4)
- train fold 5 = 1,2,3,4 (Predict on Fold 5)

In [13]:
from sklearn.model_selection import KFold

# Five unshuffled folds over the training rows; print the row positions of each
# split to sanity-check the partition.
kf = KFold(n_splits=5, shuffle=False)
for fit_rows, held_out_rows in kf.split(train):
    print("Train:", fit_rows, "Test:", held_out_rows, "\n")

Train: [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98 99] Test: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] 

Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 40 41 42 43 44
 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98 99] Test: [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] 

Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 60 61 62 63 64 65 66 67 68 69
 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98 99] Test: [40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59] 

Train: [ 0  1  2  3  4  5  6  7  8

In [14]:
"""Copy and Paste Output: Change Target Index In Folds If Necessary"""
# Emit one group of fold-building statements per KFold split, numbered from 1.
for j, (train_index, test_index) in enumerate(kf.split(train), start=1):
    lo = min(test_index)       # first row position of the held-out fold
    hi = max(test_index) + 1   # one past its last row position
    print ('train_fold{} = train[{}:]'.format(j, 0))
    print ('train_fold{} = train_fold{}.drop(train_fold{}.index[{}:{}])'.format(j, j, j, lo, hi))
    print ('target_fold{} = train_fold{}.ix[:,4]'.format(j,j))
    print ('test_fold{} = train[{}:{}]'.format(j, lo, hi), "\n")

train_fold1 = train[0:]
train_fold1 = train_fold1.drop(train_fold1.index[0:20])
target_fold1 = train_fold1.ix[:,4]
test_fold1 = train[0:20] 

train_fold2 = train[0:]
train_fold2 = train_fold2.drop(train_fold2.index[20:40])
target_fold2 = train_fold2.ix[:,4]
test_fold2 = train[20:40] 

train_fold3 = train[0:]
train_fold3 = train_fold3.drop(train_fold3.index[40:60])
target_fold3 = train_fold3.ix[:,4]
test_fold3 = train[40:60] 

train_fold4 = train[0:]
train_fold4 = train_fold4.drop(train_fold4.index[60:80])
target_fold4 = train_fold4.ix[:,4]
test_fold4 = train[60:80] 

train_fold5 = train[0:]
train_fold5 = train_fold5.drop(train_fold5.index[80:100])
target_fold5 = train_fold5.ix[:,4]
test_fold5 = train[80:100] 



In [15]:
def _split_fold(frame, start, stop):
    """Split ``frame`` into (fit rows, fit target, held-out rows).

    Rows at positions ``start:stop`` form the held-out fold; every other row
    is kept for fitting. The target is the column at position 4, which is
    "Class" in ``train``.
    """
    held_out = frame.iloc[start:stop]
    kept = frame.drop(frame.index[start:stop])
    # DataFrame.ix was removed in pandas 1.0; .iloc makes the positional
    # column lookup explicit.
    return kept, kept.iloc[:, 4], held_out


# Five contiguous 20-row folds, matching the KFold(n_splits=5, shuffle=False)
# partition printed earlier.
train_fold1, target_fold1, test_fold1 = _split_fold(train, 0, 20)
train_fold2, target_fold2, test_fold2 = _split_fold(train, 20, 40)
train_fold3, target_fold3, test_fold3 = _split_fold(train, 40, 60)
train_fold4, target_fold4, test_fold4 = _split_fold(train, 60, 80)
train_fold5, target_fold5, test_fold5 = _split_fold(train, 80, 100)

In [16]:
# for j in [1,2,3,4,5]:
#     print("print(train_fold{}.shape)".format(j))
#     print("print(target_fold{}.shape)".format(j))
#     print("print(test_fold{}.shape)".format(j), "\n")

In [17]:
# Print, fold by fold, the shape of the fit frame, its target and the held-out
# frame.
fold_parts = (
    (train_fold1, target_fold1, test_fold1),
    (train_fold2, target_fold2, test_fold2),
    (train_fold3, target_fold3, test_fold3),
    (train_fold4, target_fold4, test_fold4),
    (train_fold5, target_fold5, test_fold5),
)
for parts in fold_parts:
    for part in parts:
        print(part.shape)

(80, 5)
(80,)
(20, 5)
(80, 5)
(80,)
(20, 5)
(80, 5)
(80,)
(20, 5)
(80, 5)
(80,)
(20, 5)
(80, 5)
(80,)
(20, 5)


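### Part 3: Train Base Models and Store Predictions in train_meta (M1 and M2)
- For each fold i, fit model M1 on the other four folds, predict the excluded fold, and store those predictions in the M1 column of train_meta for that fold's rows
- Do the same with model M2, storing its predictions in the M2 column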

In [18]:
"""Instantiate Models"""
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# The two base learners whose predictions fill the M1 and M2 columns.
m1 = KNeighborsClassifier(n_neighbors=12)
m2 = LogisticRegression()


In [19]:
"""Train Data On First Model"""
train_folds = [train_fold1, train_fold2, train_fold3, train_fold4, train_fold5]
target_folds = [target_fold1, target_fold2, target_fold3, target_fold4, target_fold5]
test_folds = [test_fold1, test_fold2, test_fold3, test_fold4, test_fold5]

# Out-of-fold predictions for M1.
#  * The model is refit for every fold and predicts that fold's held-out rows
#    straight away. Fitting all five folds first and predicting afterwards
#    would score every fold with the last fit only, which has already seen
#    folds 1-4.
#  * The fold frames still carry the "Class" column, so it is dropped from the
#    inputs; otherwise the label itself would be used as a feature.
y_pred1 = []
for fold_train, fold_target, fold_test in zip(train_folds, target_folds, test_folds):
    m1.fit(fold_train.drop(columns="Class"), fold_target)
    y_pred1.append(m1.predict(fold_test.drop(columns="Class")))

# The folds are contiguous and in row order, so concatenating them lines the
# predictions up with the rows of train_meta.
y_pred1 = np.concatenate(y_pred1, axis=0)
train_meta["M1"] = y_pred1

In [20]:
"""Train Data On Second Model"""
# Out-of-fold predictions for M2. The model is refit per fold and predicts only
# the rows it was not fitted on; "Class" is dropped from the inputs because the
# fold frames still contain the label column.
y_pred2 = []
for fold_train, fold_target, fold_test in zip(train_folds, target_folds, test_folds):
    m2.fit(fold_train.drop(columns="Class"), fold_target)
    y_pred2.append(m2.predict(fold_test.drop(columns="Class")))

# Contiguous, ordered folds: concatenation matches the row order of train_meta.
y_pred2 = np.concatenate(y_pred2, axis=0)
train_meta["M2"] = y_pred2

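### Part 4: Predict On Test_Meta
- Predict with m1 on test_meta and store the result in its M1 column
- Predict with m2 on test_meta and store the result in its M2 column

### Part 5
Fit another model (S) on train_meta and use it to make the final predictions on test_meta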

In [21]:
# test_meta

### Separate Targets (M1, M2, Class) From Train Set and Predict On Test

In [22]:
# Keep the two base-model prediction columns and the label as standalone Series.
train_meta_M1 = train_meta["M1"]
train_meta_M2 = train_meta["M2"]
train_meta_Class = train_meta.pop("Class")

# `training` keeps features + M1 + M2; `train_meta` is trimmed back to the raw
# feature columns only.
training = train_meta.copy()
train_meta = train_meta.drop(columns=["M1", "M2"])

In [23]:
"""Train Data On Third Model"""
from sklearn.linear_model import LogisticRegression

# Test-set meta-features: each base model is refit on the full training
# features against the TRUE labels and then predicts the test rows.
# m1 is deliberately left as the model that produced train_meta's M1 column;
# replacing it with a different estimator, or fitting against the earlier
# out-of-fold predictions instead of the labels, would make the test M1/M2
# columns mean something different from the training ones.
m1.fit(train_meta, train_meta_Class)
target_pred_M1 = m1.predict(test_meta)

m2.fit(train_meta, train_meta_Class)
target_pred_M2 = m2.predict(test_meta)

test_meta["M1"] = target_pred_M1
test_meta["M2"] = target_pred_M2

In [24]:
# Preview the frame the final model is fit on (features plus M1 and M2).
training.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,M1,M2
0,5.0,3.5,1.3,0.3,0,0
1,4.9,3.1,1.5,0.1,0,0
2,5.3,3.7,1.5,0.2,0,0
3,6.4,3.2,5.3,2.3,2,2
4,6.1,2.6,5.6,1.4,2,2


In [25]:
# Preview the test features with their M1 and M2 prediction columns attached.
test_meta.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,M1,M2
0,7.7,3.0,6.1,2.3,2,2
1,5.5,2.5,4.0,1.3,1,1
2,5.4,3.4,1.7,0.2,0,0
3,7.7,2.8,6.7,2.0,2,2
4,4.4,2.9,1.4,0.2,0,0


### Part 6 Predict On Actual Testing Set

In [26]:
# Fit the stacking model on features + M1 + M2, label the test rows, and store
# the labels alongside the test features.
s = GradientBoostingClassifier()
target_pred_Class = s.fit(training, train_meta_Class).predict(test_meta)
test_meta["Class"] = target_pred_Class

In [27]:
# Share of held-out rows the stacked model labelled correctly, as a percentage.
pct_correct = round(accuracy(targ_test, target_pred_Class) * 100, 2)
print(pct_correct, "% Accuracy")

96.0 % Accuracy


In [28]:
# Display the test frame with its M1, M2 and final predicted Class columns.
test_meta

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,M1,M2,Class
0,7.7,3.0,6.1,2.3,2,2,2
1,5.5,2.5,4.0,1.3,1,1,1
2,5.4,3.4,1.7,0.2,0,0,0
3,7.7,2.8,6.7,2.0,2,2,2
4,4.4,2.9,1.4,0.2,0,0,0
5,5.6,3.0,4.1,1.3,1,1,1
6,4.6,3.4,1.4,0.3,0,0,0
7,5.5,3.5,1.3,0.2,0,0,0
8,6.9,3.1,4.9,1.5,1,1,1
9,6.0,3.0,4.8,1.8,1,2,2
