In [14]:
# Run some setup code for this notebook.
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import torch
import pickle
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
# from cs231n.data_utils import load_CIFAR10
# import matplotlib.pyplot as plt

# # This is a bit of magic to make matplotlib figures appear inline in the notebook
# # rather than in a new window.
# %matplotlib inline
# plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# plt.rcParams['image.interpolation'] = 'nearest'
# plt.rcParams['image.cmap'] = 'gray'

# # Some more magic so that the notebook will reload external python modules;
# # see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# %load_ext autoreload
# %autoreload 2

In [2]:
# Load the raw CIFAR-10 data.
# data_dir = '/home/ubuntu/CS231N/data/split_datasets/'
data_dir = "../../data/split-datasets/"

X_train = pd.read_pickle(data_dir + "train_data.pkl").to_numpy()
y_train = pd.read_pickle(data_dir + "train_labels.pkl").to_numpy()
X_valid = pd.read_pickle(data_dir + "valid_data.pkl").to_numpy()
y_valid = pd.read_pickle(data_dir + "valid_labels.pkl").to_numpy()
X_test = pd.read_pickle(data_dir + "test_data.pkl").to_numpy()
y_test = pd.read_pickle(data_dir + "test_labels.pkl").to_numpy()

y_train = y_train.flatten().astype(np.int64)
y_valid = y_valid.flatten().astype(np.int64)
y_test = y_test.flatten().astype(np.int64)

# As a sanity check, we print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Training data shape:  (64000, 49152)
Training labels shape:  (64000,)
Test data shape:  (20000, 49152)
Test labels shape:  (20000,)


In [3]:
X_train = X_train.reshape(-1, 3, 128, 128)

In [2]:
channel_means = pd.read_csv("../data_preprocessing/X_train_channel_means.csv", index_col=0)
channel_sds = pd.read_csv("../data_preprocessing/X_train_channel_stds.csv", index_col=0)

In [3]:
class YourDataset(Dataset):

    def __init__(self, X_Train, Y_Train, transform=None):
        self.X_Train = X_Train
        self.Y_Train = Y_Train
        self.transform = transform

    def __len__(self):
        return len(self.X_Train)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        x = self.X_Train[idx]
        y = self.Y_Train[idx]

        if self.transform:
            x = self.transform(x)
            y = self.transform(y)

        return x, y

In [34]:
class MyDataset(Dataset):
    def __init__(self, data, targets, transform=None):
        self.data = data
        self.targets = torch.LongTensor(targets)
        self.transform = transform
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.targets[index]
        
        if self.transform:
            x = Image.fromarray(self.data[index].astype(np.uint8).transpose(1,2,0))
            x = self.transform(x)
        
        return x, y
    
    def __len__(self):
        return len(self.data)

# Let's create 10 RGB images of size 128x128 and 10 labels {0, 1}
# data = list(np.random.randint(0, 255, size=(10, 3, 128, 128)))
# targets = list(np.random.randint(2, size=(10)))

data_dir = "../../data/split-datasets/"
file = open(data_dir + 'train_data.pkl', 'rb')
X_train = pickle.load(file).to_numpy()
X_train = X_train.reshape(-1, 3, 128, 128)
file.close()

file = open(data_dir + 'train_labels.pkl', 'rb')
y_train = pickle.load(file).to_numpy()
y_train = y_train.flatten().astype(np.int64)
file.close()

transform = transforms.Compose(
    [
        transforms.Resize(128), 
        transforms.ToTensor(),
        transforms.Normalize(channel_means.to_numpy().reshape(-1), channel_sds.to_numpy().reshape(-1))
        ])
dataset = MyDataset(X_train, y_train, transform=transform)
dataloader = DataLoader(dataset, shuffle=False, batch_size=5)

In [38]:
inputs, classes = next(iter(dataloader))  

In [39]:
inputs[0]

tensor([[[0.7961, 0.8078, 0.8235,  ..., 0.5216, 0.5098, 0.4980],
         [0.8078, 0.8196, 0.8353,  ..., 0.5333, 0.5216, 0.5098],
         [0.8235, 0.8353, 0.8510,  ..., 0.5569, 0.5333, 0.5255],
         ...,
         [0.0471, 0.0588, 0.0667,  ..., 0.0745, 0.0902, 0.0863],
         [0.0667, 0.0706, 0.0784,  ..., 0.0667, 0.0784, 0.0745],
         [0.0824, 0.0863, 0.0902,  ..., 0.0588, 0.0706, 0.0667]],

        [[0.7725, 0.7843, 0.8000,  ..., 0.4588, 0.4471, 0.4353],
         [0.7843, 0.7961, 0.8118,  ..., 0.4706, 0.4588, 0.4471],
         [0.8039, 0.8157, 0.8314,  ..., 0.4902, 0.4745, 0.4667],
         ...,
         [0.0549, 0.0667, 0.0745,  ..., 0.0824, 0.0980, 0.0941],
         [0.0745, 0.0784, 0.0863,  ..., 0.0745, 0.0863, 0.0824],
         [0.0902, 0.0941, 0.0980,  ..., 0.0667, 0.0784, 0.0745]],

        [[0.7765, 0.7882, 0.8039,  ..., 0.4314, 0.4196, 0.4078],
         [0.7882, 0.8000, 0.8157,  ..., 0.4431, 0.4314, 0.4196],
         [0.8078, 0.8196, 0.8353,  ..., 0.4549, 0.4392, 0.

In [40]:
classes[0]

tensor(44)

In [19]:
classes

tensor([44, 37, 46, 88, 51])

In [4]:
data_dir = "../../data/split-datasets/"
file = open(data_dir + 'train_data.pkl', 'rb')
X_train = pickle.load(file).to_numpy()
# X_train = X_train.reshape(-1, 128, 128, 3)
file.close()

In [5]:
X_train = torch.tensor(X_train, dtype=torch.float)

In [6]:
file = open(data_dir + 'train_labels.pkl', 'rb')
y_train = pickle.load(file).to_numpy()
y_train = y_train.flatten().astype(np.int64)
file.close()

In [7]:
X_train.shape

torch.Size([64000, 49152])

In [10]:
your_dataset = YourDataset(X_train, y_train, transform=transforms.Compose(
    [
        # transforms.ToPILImage(),
        transforms.Resize((128, 128)),
        # transforms.Normalize(channel_means.values, channel_sds.values)
        ]
    ))

your_data_loader = DataLoader(your_dataset, batch_size=1, shuffle=True, num_workers=0)

In [11]:
inputs, classes = next(iter(your_data_loader))  

TypeError: Tensor is not a torch image.

In [7]:
X_train = torch.tensor(X_train, dtype=torch.float)

In [8]:
train_normalize = T.Normalize(channel_means.values, channel_sds.values)

In [2]:
data_dir = "../../data/split-datasets/"

X_train = pd.read_pickle(data_dir + "train_data.pkl").to_numpy()
X_train = X_train.reshape(-1, 3, 128, 128)
X_train_0 = X_train[:, 0, : :]
X_train_1 = X_train[:, 1, : :]
X_train_2 = X_train[:, 2, : :]
del X_train

In [3]:
# Subsample the data for more efficient code execution in this exercise
num_training = 5000
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]

num_valid = 5000
mask = list(range(num_valid))
X_valid = X_valid[mask]
y_valid = y_valid[mask]

num_test = 5000
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_valid = np.reshape(X_valid, (X_valid.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_valid.shape, X_test.shape)

(5000, 49152) (5000, 49152) (5000, 49152)


In [22]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

In [24]:
# k_s = range(1, 11)
# k_scores = {}
# for k in k_s:
#     print(k)
#     knn = KNeighborsClassifier(n_neighbors=k)
#     scores = cross_val_score(knn, X_valid, y_valid, cv=5, scoring='accuracy')
#     k_scores[k] = scores.mean()

1
2
3
4
5
6
7
8
9
10


In [25]:
# k_scores

{1: 0.027200000000000002,
 2: 0.0308,
 3: 0.034,
 4: 0.032,
 5: 0.0306,
 6: 0.0288,
 7: 0.029199999999999997,
 8: 0.0306,
 9: 0.0316,
 10: 0.0332}

In [None]:
# knn = KNeighborsClassifier(n_neighbors=3)
# neigh.fit(X_train, y_train)

In [4]:
# Full training set, ~55 mins
# neigh.score(X_test, y_test)

0.03775

In [5]:
from joblib import dump, load
dump(neigh, 'knn_model.joblib') 

In [7]:
neigh_load = load("knn_model.joblib")

In [None]:
neigh_load.predict(X_test[0:2])