# Problem 1: Data with R

In [1]:
%load_ext rpy2.ipython

In [5]:
%%R

library(tidyverse)
library(ggplot2)

In [23]:
%%R
# make sure to get your data in the same folder, next to the notebook file
# so that this will run!


dat <- read_csv('classification_data.csv')

In [73]:
%%R

# Your data consists of an "X" column and a "label" column
# Use ggplot to plot a kernel density estimate of each label (overlayed on the same plot)
# hint: Don't forget to make sure the labels are factors!
# hint hint: Read the errors carefully! 
# Read the documentation for parse_factor (readr) if confused!

#dat %>% group_by (label)
#mutate("label" = parse_factor(label, NULL))

dat %>% mutate(label = parse_factor(label, na = c("", "NA"))) %>%
        group_by (label)
print(dat)

ggplot(dat, aes(x=X, y=label, color=label)) +
      geom_point()


Error in parse_vector(x, col_factor(levels, ordered, include_na), na = na,  : 
  is.character(x) is not TRUE


# Problem 2: Using Classes in Python

In this problem, you will use the classes in the following cell. **You do not need to modify the classes or touch the code in the following cell in any way!**

In [17]:
from abc import ABC, abstractmethod
from math import sqrt

def smart_zip(a,b):
    try:
        return zip(a, b)
    except TypeError:
        return zip([a], [b])

class AbstractKNeighbors(ABC):
    def __init__(self, K):
        self.K = K

    def fit(self, X, y):
        """ Train the model!

        X should be a list of data points
        y should be a list of labels
        """
        self.X = X
        self.y = y
        return self

    @abstractmethod
    def _make_prediction(self, labels):
        pass

    def predict(self, new_x):
        """Find the nearest K neighbors
        
        new_x should be a single data point
        """

        dists = [sqrt(sum(([(i-j)**2 for i,j in smart_zip(x,new_x)])))
                 for x in self.X]
        sorted_neighbors = sorted(enumerate(dists),
                                  key=lambda t: t[1])
        labels = [self.y[i] for i,_ in sorted_neighbors]
        return self._make_prediction(labels)


class KNearestNeighbors(AbstractKNeighbors):
    def _make_prediction(self, labels):
        avg = sum(labels[:self.K])/self.K
        return round(avg)

class KFurthestNeighbors(AbstractKNeighbors):
    def _make_prediction(self, labels):
        avg = sum(labels[-self.K:])/self.K
        return round(1 - avg)

In [8]:
from csv import reader

with open('classification_data.csv', 'r') as f:
    dat = list(reader(f))[1:]
    dat = [[float(x), int(label)] for x,label in dat]

## Problem 2.1: Shuffling!

In [20]:
# In your data, "X" is a data point that is nothing more than
# a single number. 
# Shuffle your data into a random order (use random.shuffle!)


import pandas
import random
random.shuffle(dat, random=None)
#rounding = dat.round()
#?random.shuffle


In [11]:
# If you shuffled your data, this test should pass
# (i.e. not throw an error)

assert(sum([label for x,label in dat[:50]]) != 0)

## Problem 2.2: Splitting!

In [14]:
len(dat)

1000

In [16]:
# Split your data, which is now a list, into 2 sublists:
# "train" and "test"
# The "train" group should have 700 elements
# The test group should have 300 elements
# Each group should have the same format as the original data


train = dat[:700]
test = dat[:-300]


[[4.009249660621473, 0],
 [1.07327318225092, 1],
 [0.5402481667590708, 1],
 [-0.17296267405218624, 1],
 [3.2592395528793725, 1],
 [1.6647431750633035, 0],
 [1.9248657131609075, 0],
 [0.8631917755200313, 0],
 [-1.3751297761001913, 1],
 [2.600149966272811, 0],
 [1.7792134190591424, 1],
 [-1.0854010197731676, 0],
 [2.843853142881426, 0],
 [2.358544270246075, 0],
 [0.6902944720927202, 1],
 [-2.335763569062687, 1],
 [2.7309005563942503, 0],
 [2.2091176130847043, 0],
 [1.706844050018611, 0],
 [-0.9960850825498907, 1],
 [4.879477484514744, 0],
 [3.7633867025348344, 0],
 [-2.6841924558683483, 1],
 [-2.26244226120225, 1],
 [0.9540999708420179, 1],
 [1.9152521415403263, 0],
 [-2.2467346239793784, 1],
 [1.9483552867803047, 1],
 [-2.730135384654908, 1],
 [2.756167282688504, 0],
 [1.856923683627106, 0],
 [0.4043996002098069, 1],
 [-2.9355940305255954, 1],
 [3.4293127687000737, 1],
 [-0.44710138374942243, 0],
 [0.535204043331087, 1],
 [3.7898745590995095, 0],
 [2.460710942935767, 0],
 [-4.7946411176

In [26]:
# Now you will need to make another split, within the groups!
# For each group ("train" and "test") split the X's from the labels.
train_a = [pair[0] for pair in train]
train_b = [pair[1] for pair in train]

test_a = [pair[0] for pair in test]
test_b = [pair[1] for pair in test]

## Problem 2.3: Testing models!

In [32]:
# For each model: 
# 1. Create an instance the class, with constructor parameters: K=5
# 2. Train the instance on the "train" groups X's and labels (y's)
# 3. Test how well the instance does: 
#    A. Use the trained instance to predict the label of each "X" in the "test" group
#    B. Use your "test" labels to see if the predicted label is equal the true label

knn = KNearestNeighbors(5)
kfn = KFurthestNeighbors(5)

knn.fit(train_a,train_b)
kfn.fit(train_a,train_b)

knn_a_pred = [knn.predict(x) for x in test_a]
kfn_b_pred = [kfn.predict(x) for x in test_b]

knn_a_pred

[0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,


In [None]:
# Compare the two classes of models!