In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cross_validation import train_test_split 
from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn import metrics 
accuracy = metrics.accuracy_score



In [2]:
data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header = None) 
data.columns = ["Sepal Length", "Sepal Width","Petal Length","Petal Width","Class"]
data.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
le = preprocessing.LabelEncoder()
le.fit(np.array(data.Class))
data["Class"] = le.transform(data.Class)
data.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
X = data.ix[:,0:4]
target = data["Class"]

In [5]:
round(1-(100/len(data)), 4)

0.3333

In [6]:
train, test, targ_train, targ_test = train_test_split(X, target, test_size = round(1-(100/len(data)), 4), random_state = 0)

In [7]:
"""Note: We Will Focus Only On 100 Training Samples For KFolds"""
print ("Train:       ", train.shape)
print ("Train Class: ", targ_train.shape)
print ("Test:        ", test.shape)
print ("Test Class:  ", targ_test.shape)

Train:        (100, 4)
Train Class:  (100,)
Test:         (50, 4)
Test Class:   (50,)


In [8]:
"""Recombine Training Set With Class"""
train = pd.DataFrame.join(train, targ_train)

In [9]:
train = train.reset_index()
test = test.reset_index()

In [10]:
del train["index"]
del test["index"]

### Part 1: 

Partition Training Data into 5 Folds:

- Create train_meta with same rows and IDs as training data
- Create Empty Columns: M1 and M2 (NaNs)

- Create test_meta variable with same rows and IDs as test dataset 
- Create Empty Columns: M1 and M2 (NaNs)

In [11]:
"""Create Train and Test Meta"""
train_meta = pd.DataFrame.copy(train)
train_meta["M1"] = np.nan
train_meta["M2"] = np.nan
print(train.shape)
train_meta.head()

(100, 5)


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class,M1,M2
0,5.6,2.5,3.9,1.1,1,,
1,7.7,3.0,6.1,2.3,2,,
2,6.3,3.3,4.7,1.6,1,,
3,5.5,2.4,3.8,1.1,1,,
4,6.3,2.7,4.9,1.8,2,,


In [12]:
test_meta = pd.DataFrame.copy(test)
test_meta["M1"] = np.nan
test_meta["M2"] = np.nan
print(test.shape)
test_meta.head()

(50, 4)


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,M1,M2
0,5.8,2.8,5.1,2.4,,
1,6.0,2.2,4.0,1.0,,
2,5.5,4.2,1.4,0.2,,
3,7.3,2.9,6.3,1.8,,
4,5.0,3.4,1.5,0.2,,


### Part 2

For each train fold: combine the other 4 into a training set and predict on each fold i 
- train fold 1 = 2,3,4,5 (Predict on fold 1)
- train fold 2 = 1,3,4,5 (Predict on Fold 2)
- train fold 3 = 1,2,4,5 (Predict on Fold 3)
- train fold 4 = 1,2,3,5 (Predict on Fold 4)
- train fold 5 = 1,2,3,4 (Predict on Fold 5)

In [13]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle=False)
for train_index, test_index in kf.split(train):
    print("TRAIN:", train_index, "TEST:", test_index, "\n")

TRAIN: [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98 99] TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] 

TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 40 41 42 43 44
 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98 99] TEST: [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] 

TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 60 61 62 63 64 65 66 67 68 69
 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98 99] TEST: [40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59] 

TRAIN: [ 0  1  2  3  4  5  6  7  8

In [14]:
"""Self Created Cross Validation"""
"""Copy and Paste Output"""
j = 1
for train_index, test_index in kf.split(train):
    print ('train_fold{} = train[{}:{}]'.format(j, 0, max(train_index) + 1))
    print ('train_fold{} = train_fold{}.drop(train_fold{}.index[{}:{}])'.format(j, j, j, min(test_index), max(test_index) + 1))
    print ('target_fold{} = train_fold{}.ix[:,4]'.format(j,j))
    print ('test_fold{} = train[{}:{}]'.format(j, min(test_index), max(test_index) + 1), "\n")
    j += 1

train_fold1 = train[0:100]
train_fold1 = train_fold1.drop(train_fold1.index[0:20])
target_fold1 = train_fold1.ix[:,4]
test_fold1 = train[0:20] 

train_fold2 = train[0:100]
train_fold2 = train_fold2.drop(train_fold2.index[20:40])
target_fold2 = train_fold2.ix[:,4]
test_fold2 = train[20:40] 

train_fold3 = train[0:100]
train_fold3 = train_fold3.drop(train_fold3.index[40:60])
target_fold3 = train_fold3.ix[:,4]
test_fold3 = train[40:60] 

train_fold4 = train[0:100]
train_fold4 = train_fold4.drop(train_fold4.index[60:80])
target_fold4 = train_fold4.ix[:,4]
test_fold4 = train[60:80] 

train_fold5 = train[0:80]
train_fold5 = train_fold5.drop(train_fold5.index[80:100])
target_fold5 = train_fold5.ix[:,4]
test_fold5 = train[80:100] 



In [15]:
train_fold1 = train[0:100]
train_fold1 = train_fold1.drop(train_fold1.index[0:20])
target_fold1 = train_fold1.ix[:,4]
test_fold1 = train[0:20] 

train_fold2 = train[0:100]
train_fold2 = train_fold2.drop(train_fold2.index[20:40])
target_fold2 = train_fold2.ix[:,4]
test_fold2 = train[20:40] 

train_fold3 = train[0:100]
train_fold3 = train_fold3.drop(train_fold3.index[40:60])
target_fold3 = train_fold3.ix[:,4]
test_fold3 = train[40:60] 

train_fold4 = train[0:100]
train_fold4 = train_fold4.drop(train_fold4.index[60:80])
target_fold4 = train_fold4.ix[:,4]
test_fold4 = train[60:80] 

train_fold5 = train[0:80]
train_fold5 = train_fold5.drop(train_fold5.index[80:100])
target_fold5 = train_fold5.ix[:,4]
test_fold5 = train[80:100]

In [16]:
# for j in [1,2,3,4,5]:
#     print("print(train_fold{}.shape)".format(j))
#     print("print(target_fold{}.shape)".format(j))
#     print("print(test_fold{}.shape)".format(j), "\n")

In [17]:
# print(train_fold1.shape)
# print(target_fold1.shape)
# print(test_fold1.shape) 

# print(train_fold2.shape)
# print(target_fold2.shape)
# print(test_fold2.shape) 

# print(train_fold3.shape)
# print(target_fold3.shape)
# print(test_fold3.shape) 

# print(train_fold4.shape)
# print(target_fold4.shape)
# print(test_fold4.shape) 

# print(train_fold5.shape)
# print(target_fold5.shape)
# print(test_fold5.shape) 

### Part 3
- Use Model M1 to train on each train fold i and predict the excluded fold and store the values back into M1 for its fold ID
- Do the same with M2

In [18]:
"""Instantiate Models"""
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

m1 = KNeighborsClassifier()
m2 = GradientBoostingClassifier()
s = LogisticRegression()

In [19]:
# j = 1
# while j < 6:
#     print('m1.fit(train_fold{}, target_fold{})'.format(j,j))
#     print('target_pred{} = pd.DataFrame(m1.predict(test_fold{}))'.format(j,j))
#     print('print(target_pred{})'.format(j), "\n")
#     j += 1

In [20]:
# m1.fit(train_fold1, target_fold1)
# target_pred1 = pd.DataFrame(m1.predict(test_fold1))
# print(target_pred1) 

# m1.fit(train_fold2, target_fold2)
# target_pred2 = pd.DataFrame(m1.predict(test_fold2))
# print(target_pred2) 

# m1.fit(train_fold3, target_fold3)
# target_pred3 = pd.DataFrame(m1.predict(test_fold3))
# print(target_pred3) 

# m1.fit(train_fold4, target_fold4)
# target_pred4 = pd.DataFrame(m1.predict(test_fold4))
# print(target_pred4) 

# m1.fit(train_fold5, target_fold5)
# target_pred5 = pd.DataFrame(m1.predict(test_fold5))
# print(target_pred5) 

In [21]:
train_folds = [train_fold1, train_fold2, train_fold3, train_fold4, train_fold5]
target_folds = [target_fold1, target_fold2, target_fold3, target_fold4, target_fold5]
test_folds = [test_fold1, test_fold2, test_fold3, test_fold4, test_fold5]

y_pred = []
for i, j in zip(train_folds, target_folds):
    m1.fit(i, j)
for k in test_folds:
    target_pred = m1.predict(k)
    y_pred.append(target_pred) 

In [22]:
y_pred = np.concatenate(y_pred, axis=0)

In [30]:
y_pred = pd.DataFrame(y_pred)

In [32]:
train_meta["M1"] = y_pred

In [33]:
train_meta

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class,M1,M2
0,5.6,2.5,3.9,1.1,1,1,
1,7.7,3.0,6.1,2.3,2,2,
2,6.3,3.3,4.7,1.6,1,1,
3,5.5,2.4,3.8,1.1,1,1,
4,6.3,2.7,4.9,1.8,2,2,
5,6.3,2.8,5.1,1.5,2,2,
6,4.9,2.5,4.5,1.7,2,2,
7,6.3,2.5,5.0,1.9,2,2,
8,7.0,3.2,4.7,1.4,1,1,
9,6.5,3.0,5.2,2.0,2,2,


### Part 4

Fit m1 Model to train_meta
- Put new predictions into train_meta m1 column
Fit m2 Model to train_meta
- Put new predictions into train_meta m2 column

### Part 5
- Predict m1 on test_meta
- Predict m2 on test_meta

Part 4
Use another model (S) and use train_meta to make final predictions on test_meta 

### Part 6 Predict On Actual Testing Set

In [25]:
# actual = pd.DataFrame(target_test)
# actual = actual.reset_index(drop=True)
# actual.head()

In [26]:
# predictions = pd.DataFrame(target_pred)
# predictions.columns = ["Predictions"]
# predictions.head()

In [27]:
# pd.DataFrame.join(actual, predictions)