In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

In [2]:
# --- Occupancy detection data -------------------------------------------
# The first line of each occupancy file holds the column names (header=0).
occupancy_df_training = pd.read_csv("occupancy_data/datatraining.txt", header=0)

# Quick look at the first rows to check that the columns parsed as expected.
print(occupancy_df_training.head())

# The two test files are loaded with the same settings as the training file.
occupancy_df_test1, occupancy_df_test2 = (
    pd.read_csv(path, header=0)
    for path in ("occupancy_data/datatest.txt", "occupancy_data/datatest2.txt")
)

# --- Spambase data --------------------------------------------------------
# Read without a header row (header=None), so columns get integer labels.
spambase_df = pd.read_csv("spambase/spambase.data", header=None)

print(spambase_df.head())

                  date  Temperature  Humidity  Light     CO2  HumidityRatio  \
1  2015-02-04 17:51:00        23.18   27.2720  426.0  721.25       0.004793   
2  2015-02-04 17:51:59        23.15   27.2675  429.5  714.00       0.004783   
3  2015-02-04 17:53:00        23.15   27.2450  426.0  713.50       0.004779   
4  2015-02-04 17:54:00        23.15   27.2000  426.0  708.25       0.004772   
5  2015-02-04 17:55:00        23.10   27.2000  426.0  704.50       0.004757   

   Occupancy  
1          1  
2          1  
3          1  
4          1  
5          1  
     0     1     2    3     4     5     6     7     8     9  ...    48     49  \
0  0.00  0.64  0.64  0.0  0.32  0.00  0.00  0.00  0.00  0.00 ...  0.00  0.000   
1  0.21  0.28  0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94 ...  0.00  0.132   
2  0.06  0.00  0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25 ...  0.01  0.143   
3  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63 ...  0.00  0.137   
4  0.00  0.00  0.00  0.0  0.63

In [3]:
# Separate the label column from the feature columns for each dataset.
# NOTE: the feature frames are rebound to the same names, so re-running this
# cell without re-running the loading cell raises a KeyError (the label
# columns are already gone).

# Occupancy: "Occupancy" is the label; "date" is dropped from the features.
occupancy_y = occupancy_df_training["Occupancy"]
occupancy_df_training = occupancy_df_training.drop(columns=["Occupancy", "date"])

# Spambase: column 57 is used as the label, the remaining columns as features.
spambase_y = spambase_df[57]
spambase_df = spambase_df.drop(columns=[57])

In [4]:
# 10-fold cross-validation splitter shared by both experiments below.
# Rows are shuffled before splitting; the fixed random_state makes the fold
# assignment repeatable across runs.
# All arguments are passed by keyword: `shuffle` and `random_state` are
# keyword-only in current scikit-learn, so the positional form
# KFold(10, True, 1) raises a TypeError there.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

In [5]:
# Create a k-nearest-neighbours classifier.
# `neighbors` is the k passed as n_neighbors; the same `clf` object is re-fit
# on every fold of both cross-validation loops below.
neighbors = 3
clf = KNeighborsClassifier(n_neighbors=neighbors)

In [6]:
# Cross-validate the classifier on the occupancy data and report the mean
# accuracy and F1 score over all folds.
overall_accuracy = 0
overall_f1_score = 0

# Ask the splitter how many folds it produces instead of hard-coding 10, so
# the averages stay correct if `kf` is configured with a different n_splits.
n_folds = kf.get_n_splits(occupancy_df_training)

for train, test in kf.split(occupancy_df_training):
    # `train` / `test` are positional row indices, hence .iloc.
    X_train, X_test = occupancy_df_training.iloc[train], occupancy_df_training.iloc[test]
    y_train, y_test = occupancy_y.iloc[train], occupancy_y.iloc[test]

    # Fit on this fold's training rows, then predict the held-out rows.
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Model Accuracy: how often is the classifier correct?
    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # Accumulate per-fold scores; they are averaged after the loop.
    overall_accuracy += accuracy
    overall_f1_score += f1

print("accuracy:   %0.3f, f1_score:   %0.3f" % (overall_accuracy/n_folds, overall_f1_score/n_folds))

accuracy:   0.989, f1_score:   0.975


In [7]:
# Cross-validate the classifier on the spambase data and report the mean
# accuracy and F1 score over all folds.
overall_accuracy = 0
overall_f1_score = 0

# Ask the splitter how many folds it produces instead of hard-coding 10, so
# the averages stay correct if `kf` is configured with a different n_splits.
n_folds = kf.get_n_splits(spambase_df)

for train, test in kf.split(spambase_df):
    # `train` / `test` are positional row indices, hence .iloc.
    X_train, X_test = spambase_df.iloc[train], spambase_df.iloc[test]
    y_train, y_test = spambase_y.iloc[train], spambase_y.iloc[test]

    # Fit on this fold's training rows, then predict the held-out rows.
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Model Accuracy: how often is the classifier correct?
    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # Accumulate per-fold scores; they are averaged after the loop.
    overall_accuracy += accuracy
    overall_f1_score += f1

print("accuracy:   %0.3f, f1_score:   %0.3f" % (overall_accuracy/n_folds, overall_f1_score/n_folds))

accuracy:   0.810, f1_score:   0.758
