# ML Challenge (Optional)

Train, test, optimize, and analyze the performance of a classification model using a methodology of your choice for the randomly generated moons dataset.

You are not being evaluated for the performance of your model. Instead, we are interested in whether you can implement a simple but rigorous ML workflow.

Show all of your work in this notebook.

In [None]:
# you are free to use any package you deem fit

import torch
from torch import nn, float64
from torch.utils.data import Dataset, DataLoader
import numpy as np
from random import randint
from math import floor, ceil


## Dataset

In [None]:
# DO NOT MODIFY
from sklearn.datasets import make_moons

# Generate the two-moons toy dataset: 500 points in total, requested as 50 in
# one moon and 450 in the other (a deliberately imbalanced binary problem).
# Per the scikit-learn docs, `noise` is the standard deviation of the Gaussian
# noise added to the points and `random_state` fixes the draw, so X and Y are
# the same on every run.
X, Y = make_moons(random_state=42, n_samples=(50, 450), noise=0.25)


In [None]:
# print(X)
# print(Y)


In [None]:
# print(len(X))
# print(len(Y))
# print(type(X))
# print(type(Y))
# print(X.dtype)
# print(Y.dtype)


In [None]:
# Fraction of the samples held out for testing.
TEST_RATIO = 0.2
# Fixed seed so the train/test partition is identical on every run; without it
# the reported accuracy mixes split variance with model variance.
SPLIT_SEED = 42

num = len(X)

# Hold out ceil(TEST_RATIO * num) distinct sample indices, drawn uniformly at
# random without replacement. `test` stays a plain list of ints, in ascending
# order, so any later cell that inspects it keeps working.
num_test = ceil(TEST_RATIO * num)
split_rng = np.random.default_rng(SPLIT_SEED)
test = sorted(split_rng.choice(num, size=num_test, replace=False).tolist())

# Boolean mask over all samples: True marks a held-out (test) sample. Mask
# indexing keeps the original sample order inside each partition and avoids
# the O(n^2) `index in list` membership checks.
is_test = np.zeros(num, dtype=bool)
is_test[test] = True

# Labels are cast to float64 as well because the model is trained with
# BCELoss, which expects floating-point targets.
train_X = np.asarray(X, dtype=np.float64)[~is_test]
train_Y = np.asarray(Y, dtype=np.float64)[~is_test]
test_X = np.asarray(X, dtype=np.float64)[is_test]
test_Y = np.asarray(Y, dtype=np.float64)[is_test]


## Training

In [None]:
class DataSet(Dataset):
    """Map-style dataset over paired feature and label arrays.

    Args:
        num: Length reported by ``__len__``.
        _X: Feature array; each row is reshaped to ``(1, 1, 2)`` on access.
        _Y: Label array; each entry is reshaped to ``(1,)`` on access.
    """

    def __init__(self, num: int, _X, _Y):
        self.num, self.X, self.Y = num, _X, _Y

    def __len__(self):
        """Return the sample count supplied at construction time."""
        return self.num

    def __getitem__(self, index):
        """Return the ``(features, label)`` tensor pair for one sample."""
        # Lay the two features out as (channel, height, width) = (1, 1, 2).
        sample = self.X[index].reshape(1, 1, 2)
        # One-element label vector, so a batch of labels has shape (B, 1).
        target = self.Y[index].reshape(1)
        return torch.from_numpy(sample), torch.tensor(target)


In [None]:
# Size each dataset from the array it actually wraps. Re-deriving the sizes
# from TEST_RATIO * num disagrees with the real split whenever that product is
# not an integer, which makes the loader index past the end of the array.
train_set = DataSet(len(train_X), train_X, train_Y)
test_set = DataSet(len(test_X), test_X, test_Y)
# Small shuffled mini-batches for SGD.
train_loader = DataLoader(train_set, batch_size=5, shuffle=True, num_workers=0)
# Evaluation visits every sample exactly once, so shuffling adds nothing.
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0)


In [None]:
class Reshape(nn.Module):
    """Swap the last two dimensions of the input tensor.

    Despite the name, this layer performs a transpose rather than a general
    reshape: an input of shape ``(..., A, B)`` becomes ``(..., B, A)``. It has
    no parameters and no configuration.
    """

    def __init__(self):
        super(Reshape, self).__init__()

    def forward(self, x):
        """Return ``x`` with its last two dimensions exchanged."""
        return x.transpose(-1, -2)

    def extra_repr(self) -> str:
        """Describe the layer for ``repr``/``print`` of the enclosing model."""
        # The layer stores no shape attribute, so report the fixed pair of
        # dimensions that forward() swaps.
        return 'dims=(-1, -2)'


class Model(nn.Module):
    """Binary classifier for 2-feature samples.

    Input is 1 x 2 per sample (with a leading channel dimension of 1); the
    output is a single number per sample, squashed into (0, 1) by a sigmoid.
    All parameters are converted to float64.
    """

    def __init__(self):
        super().__init__()
        layers = [
            # (..., 1, 2) -> (..., 2, 1) -> (..., 2, 10): expand each feature.
            Reshape(),
            nn.Linear(1, 10),
            nn.ReLU(),
            # (..., 2, 10) -> (..., 10, 2) -> (..., 10, 15): mix both features.
            Reshape(),
            nn.Linear(2, 15),
            nn.ReLU(),
            # Treat the 10 x 15 map as a one-channel image; the padding keeps
            # the spatial size unchanged through both convolutions.
            nn.Conv2d(1, 3, (3, 3), padding=1, padding_mode='zeros'),
            nn.ReLU(),
            nn.Conv2d(3, 3, (5, 5), padding=2, padding_mode='zeros'),
            nn.ReLU(),
            # 3 channels * 10 * 15 = 450 features feed the dense head.
            nn.Flatten(),
            nn.Linear(450, 30),
            nn.ReLU(),
            nn.Linear(30, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)
        # Cast every parameter to float64.
        self.double()

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid output."""
        prediction = self.model(x)
        return prediction


In [None]:
# Number of full passes over the training set.
NUM_EPOCHS = 200

model = Model()
# Binary cross-entropy on the sigmoid output; targets are float 0.0 / 1.0.
loss_fn = nn.BCELoss()
optim = torch.optim.SGD(model.parameters(), lr=0.01)

# Summed mini-batch loss of every epoch, kept so the loss curve can be
# inspected or plotted after training.
epoch_losses = []

model.train()
for epoch in range(NUM_EPOCHS):
    all_loss = 0.0
    for x, y in train_loader:
        outputs = model(x)
        loss = loss_fn(outputs, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        # .item() yields a plain Python float; adding the tensor itself would
        # keep a reference to every batch's autograd history.
        all_loss += loss.item()
    epoch_losses.append(all_loss)


## Testing / Optimization

In [None]:
# Accuracy on the held-out test set.
correct_num = 0
total_num = 0

model.eval()
# No gradients are needed for evaluation; skipping them avoids building an
# autograd graph for every forward pass.
with torch.no_grad():
    for x, y in test_loader:
        output = model(x)
        # Threshold the sigmoid output at 0.5 to obtain a hard 0/1 label for
        # every sample in the batch, whatever the batch size is.
        predicted = (output > 0.5).to(y.dtype)
        correct_num += (predicted == y).sum().item()
        total_num += y.numel()

# Divide by the number of samples actually evaluated rather than a size
# re-derived from TEST_RATIO.
print(correct_num / total_num)


## Performance Analysis

I ran the above code 10 consecutive times. The test accuracy of each run was: 0.97, 0.95, 0.96, 0.94, 0.98, 0.97, 0.98, 0.98, 0.98, 0.96. The average is 96.7%, which is higher than the accuracy of the trivial baseline that always predicts class 1 (0.9). Therefore, I think this model performs well at classifying this data.