In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import numpy as np
import os
%matplotlib inline
data = "admission.csv"
admissions = pd.read_csv(os.path.relpath("Data\\"+data))
admissions['actual_label'] = admissions['admit']
admissions.drop('admit', axis=1, inplace=True)
admissions.head()

Unnamed: 0,gpa,gre,actual_label
0,3.177277,594.102992,0
1,3.412655,631.528607,0
2,2.728097,553.714399,0
3,3.093559,551.089985,0
4,3.141923,537.184894,0


<h3>K-Fold</h3>
<p><a href="https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation">K-fold cross-validation</a> consists of partitioning a dataset into $k-$equally sized partitions and then training a model on each of $k-1$ of the partitions, retaining the remaining partition for testing</p>
<p>Ultimately, this means repeating the model training process k-1 times until each of the folds has been used as a test set. At each iteration some (or several) accuracy measurements are documented, allowing for a mean to be calculated for each of the k error measurements</p>
<table>
<caption>Example of k=5 k-fold validation</caption>
<tr>
    <th>Iteration</th><th>Partition 1</th><th>Partition 2</th><th>Partition 3</th><th>Partition 4</th><th>Partition 5</th>
</tr>
<tr><td>1</td><td><b>Train</b></td><td>Test</td><td>Test</td><td>Test</td><td>Test</td></tr>
<tr><td>2</td><td>Test</td><td><b>Train</b></td><td>Test</td><td>Test</td><td>Test</td></tr>
<tr><td>3</td><td>Test</td><td>Test</td><td><b>Train</b></td><td>Test</td><td>Test</td></tr>
<tr><td>4</td><td>Test</td><td>Test</td><td>Test</td><td><b>Train</b></td><td>Test</td></tr>
<tr><td>5</td><td>Test</td><td>Test</td><td>Test</td><td>Test</td><td><b>Train</b></td></tr>
</table>


In [3]:

# Randomize index
shuffled_index = np.random.permutation(admissions.index)

# Randomize Data
shuffled_admissions = admissions.loc[shuffled_index]

# Reset Index
admissions = shuffled_admissions.reset_index()



In [4]:
# Assign Folds
print(admissions.shape[0]/5)
# folds will be index 0:128, 129:257, 258:386, 387:514, 515:644
k_indices_starts = [0,129,258,387,515]
k_indices_stops = [128,257,386,514,644]

for i in range(5):
    admissions.ix[k_indices_starts[i]:k_indices_stops[i],'fold'] = i+1

admissions.head()
admissions['fold'] = admissions['fold'].astype(int)

128.8


In [5]:
def accuracy(matches, total):
    return matches/total

# Recall we have a column 'fold' that contains the labels for each fold

def k_fold_testing(fold_ids: list, x: str, y: str, df):
    accuracies = dict(accuracies_list=[])
    for fold in fold_ids:
        train = df[df['fold'] != fold]
        test = df[df['fold'] == fold]
        model = LogisticRegression()
        model.fit(train[[x]],train[y])
        predictions = model.predict(test[[x]])
        test['predicted_labels'] = predictions
        matches = test['predicted_labels']==test[y]
        correct_predictions = test[matches]
        accuracy = correct_predictions.shape[0]/test.shape[0]
        accuracies[fold] = accuracy
        accuracies['accuracies_list'].append(accuracy)
    accuracies['Mean Accuracy'] = np.mean(accuracies['accuracies_list'])
    return accuracies

test_iteration = k_fold_testing([1,2,3,4,5], 'gpa','actual_label',admissions)

for key, value in test_iteration.items():
    print("{}: {}".format(key, value))

    
    

accuracies_list: [0.6976744186046512, 0.6434108527131783, 0.6124031007751938, 0.6484375, 0.6046511627906976]
1: 0.6976744186046512
2: 0.6434108527131783
3: 0.6124031007751938
4: 0.6484375
5: 0.6046511627906976
Mean Accuracy: 0.6413154069767442


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


<h3>What just happened?</h3>
<p>Above we defined a function that runs through testing validation for each of the k fold ids</p>
<p>In pseudo code:</p>
<p>````For each fold:
       train a model
       predict values in test set
       return accuracy````</p>
<p>Fortunately we have a scikitlearn class, <a href="http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html">KFold</a> that can take care of this for us</p>
<p>The KFold class is instantiated with ````KFold(n ,n_folds, shuffle=False, random_state=None)````</p>
<p>Where: n is the number of observations, n_folds is the number of folds, shuffle (duh), random_state is the seed value for shuffle if it is used</p>

<p>The KFold class only returns an iterator object for running through the k-iterations of testing. To cross validate a model using the KFold class, we use the <a href="http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html">````sklearn.cross_validation.cross_validation_score(estimator, X, Y, scoring=None, cv=None)````</a> function. Where:</p>
<ul>
<li>estimator is the instance of sklearn predictive model to be applied</li>
<li>X is the array of the independent variable</li>
<li>Y is the dependent variable column</li>
<li>scoring is the <a href="http://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values">type of scoring</a> to perform</li>
<li>cv is the number of folds, it will accept an integer representing the no. of folds or an instance of the KFold class</li>
</ul>

In [6]:
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

# Create instance of KFold class
kf = KFold(admissions.shape[0], 5, shuffle=True, random_state=8)

# Create instance of LogisticRegression
lr = LogisticRegression()

# Use cross_val_score function to get accuracies (note: cv=kf)
accuracies = cross_val_score(lr, admissions[['gpa']], admissions['actual_label'], scoring='accuracy', cv=kf)
average_accuracy = np.mean(accuracies)

print(accuracies)
print(average_accuracy)



[ 0.60465116  0.6744186   0.60465116  0.65116279  0.6484375 ]
0.636664244186
