In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
!ls /content/gdrive/MyDrive/'machine learning'

In [None]:
# Drive folder used as the prefix for the checkpoint, the ONNX export and the CSV outputs.
PATH = '/content/gdrive/MyDrive/machine learning/'

In [None]:
! pip install -q kaggle
! mkdir ~/.kaggle
! rm ~/.kaggle/kaggle.json
! cp /content/gdrive/MyDrive/'machine learning'/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c galaxy-zoo-the-galaxy-challenge --force

In [None]:
import numpy as np
import pandas as pd

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
import random
from PIL import Image
from cv2 import imread
import matplotlib.pyplot as plt

In [None]:
%%capture
!unzip images_training_rev1.zip
!unzip training_solutions_rev1.zip

In [None]:
# File names of the extracted training images, and the solutions table.
files = os.listdir('./images_training_rev1')
train_set = pd.read_csv(filepath_or_buffer='training_solutions_rev1.csv')

train_set.head()

In [None]:
# Preview a 4x4 grid of randomly chosen training images.
fig, axes = plt.subplots(4, 4, figsize=(9, 9))
# random.sample draws distinct files, so the grid never shows the same image twice.
for ax, name in zip(axes.flat, random.sample(files, min(16, len(files)))):
    bgr = imread('./images_training_rev1/' + name)
    # OpenCV loads images as BGR; reverse the channel axis so matplotlib
    # (which expects RGB) shows the true colours.
    ax.imshow(bgr[..., ::-1])
# Hide the axes on every panel, including any left empty.
for ax in axes.flat:
    ax.axis('off')

plt.show()

In [None]:
%%capture
!pip install torchsummary

In [None]:
import torch
import torchvision
from torchsummary import summary
from tqdm import tqdm
import torchvision.models as models
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(f'Device: {device}')

In [None]:
# Directories holding the extracted training and test images.
train_folder, test_folder = './images_training_rev1', './images_test_rev1'

In [None]:
# Preprocessing pipeline: resize and centre-crop to 224x224, then random
# flip / rotation / grayscale / small translation, and finally tensor conversion.
transform = transforms.Compose(transforms=[
    transforms.Resize(size=255),
    transforms.CenterCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=10),
    transforms.RandomGrayscale(),
    transforms.RandomAffine(degrees=0, translate=(0.05, 0.05)),
    transforms.ToTensor(),
])

In [None]:
class GalaxyDataSet(torch.utils.data.Dataset):
    """Images paired with the target columns of a solutions table.

    Args:
        df: DataFrame whose 'GalaxyID' column names the image file and whose
            columns after the first hold the numeric targets.
        image_folder: Directory containing '<GalaxyID>.jpg' files.
        transform: Optional callable applied to each loaded PIL image.
    """

    def __init__(self, df, image_folder, transform = None):
        self.df = df
        self.image_folder = image_folder
        self.transform = transform
        # Extract ids and targets once. Converting the whole frame to a list on
        # every __getitem__ call costs O(len(df)) per sample, and positional
        # arrays also stay correct when df does not have a default RangeIndex.
        self._ids = df['GalaxyID'].to_numpy()
        self._labels = torch.as_tensor(df.iloc[:, 1:].to_numpy(dtype='float32'))

    def __len__(self):
        """Number of rows in the table."""
        return len(self.df)

    def __getitem__(self, index):
        """Return (image, label) for the row at position `index`.

        `label` is a float32 tensor with one value per target column.
        """
        filename = self._ids[index]
        # Clone so callers cannot modify the cached label matrix in place.
        label = self._labels[index].clone()
        path = os.path.join(self.image_folder, str(filename)+'.jpg')
        # The context manager closes the file handle; convert() forces the
        # pixel data to load and guarantees three channels.
        with Image.open(path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

In [None]:
# Dataset over the full solutions table; train/validation subsets are selected later by samplers.
TrainDataSet = GalaxyDataSet(df=train_set, image_folder=train_folder, transform=transform)

In [None]:
# Loader and split settings.
batch_size, validation_split, shuffle_dataset = 128, .3, True

# Positions of all samples and the number of them reserved for validation.
dataset_size = len(TrainDataSet)
indices = [*range(dataset_size)]
split = int(np.floor(dataset_size * validation_split))

def randomsampler(seed=None):
    """Draw a random train/validation split of the dataset indices.

    A local NumPy generator shuffles a copy of the module-level `indices`, so
    the call neither reseeds the process-wide NumPy RNG nor reorders the
    shared `indices` list.

    Args:
        seed: Optional seed for a reproducible split. None (the default) draws
            a fresh random split on every call.

    Returns:
        (train_sampler, valid_sampler): SubsetRandomSampler objects; the first
        `split` shuffled indices go to validation, the rest to training.
    """
    rng = np.random.default_rng(seed)
    shuffled = rng.permutation(indices).tolist()
    train_indices, val_indices = shuffled[split:], shuffled[:split]
    return SubsetRandomSampler(train_indices), SubsetRandomSampler(val_indices)

In [None]:
# Build the architecture only. load_state_dict below (strict by default)
# overwrites every parameter and buffer, so downloading the ImageNet weights
# with pretrained=True would be wasted work.
model = models.resnet18(pretrained = False)
# NOTE(review): Softmax is applied to the 512 backbone features *before* the
# linear layer, which looks unintended. The order is kept because the
# checkpoint loaded below stores the linear layer under the fc.1.* keys.
model.fc = nn.Sequential(
    nn.Softmax(dim=1),
    nn.Linear(512, 37))
model.load_state_dict(torch.load(PATH+'galaxy_classifier_model.pt', map_location=device))
model = model.to(device)
summary(model, input_size = (3, 224, 224))

In [None]:
# Training hyperparameters: number of epochs and the Adam learning rate.
epochs, alpha = 1, 0.001
criterion = nn.MultiLabelSoftMarginLoss()
optimizer = optim.Adam(params=model.parameters(), lr=alpha)

In [None]:
def eval_loop(model, valloader, criterion, optimizer):
    """Evaluate `model` on `valloader` without updating its weights.

    Args:
        model: Network to evaluate; switched to eval mode.
        valloader: Iterable yielding (data, label) batches.
        criterion: Loss called as criterion(pred, label).
        optimizer: Unused; kept so the signature matches training_loop.

    Returns:
        (loss, accuracy): mean loss per sample seen, and the percentage of
        individual outputs lying within 0.2 of their target. Both are 0.0
        when the loader yields no batches.
    """
    model.eval()
    total_loss, hits = 0.0, 0
    seen_samples, seen_values = 0, 0
    with torch.no_grad():
        for data, label in tqdm(valloader):
            data, label = data.to(device), label.to(device)
            pred = model(data)
            loss = criterion(pred, label)

            batch = label.size(0)
            # Assumes criterion returns a batch mean (the default reduction);
            # weighting by batch size gives a per-sample average even when the
            # last batch is smaller.
            total_loss += loss.item() * batch
            seen_samples += batch
            # Compare predictions with the targets, not with the scalar loss.
            # NOTE(review): pred is the raw model output; confirm it is on the
            # same scale as the labels.
            hits += ((pred - label).abs() <= 0.2).sum().item()
            seen_values += label.numel()
    if seen_samples == 0:
        return 0.0, 0.0
    # Normalise by the samples actually visited: the sampler selects only a
    # subset of valloader.dataset.
    return total_loss/seen_samples, 100*hits/seen_values

In [None]:
def training_loop(model, trainloader, criterion, optimizer):
    """Run one optimisation pass over `trainloader`.

    Args:
        model: Network to train; switched to train mode.
        trainloader: Iterable yielding (data, label) batches.
        criterion: Loss called as criterion(pred, label).
        optimizer: Optimizer stepped once per batch.

    Returns:
        (loss, accuracy): mean loss per sample seen, and the percentage of
        individual outputs lying within 0.2 of their target. Both are 0.0
        when the loader yields no batches.
    """
    model.train()
    total_loss, hits = 0.0, 0
    seen_samples, seen_values = 0, 0
    for data, label in tqdm(trainloader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        pred = model(data)
        loss = criterion(pred, label)
        loss.backward()
        optimizer.step()

        batch = label.size(0)
        # Assumes criterion returns a batch mean (the default reduction);
        # weighting by batch size gives a per-sample average even when the
        # last batch is smaller.
        total_loss += loss.item() * batch
        seen_samples += batch
        # Compare predictions with the targets, not with the scalar loss;
        # detach so the metric does not extend the autograd graph.
        # NOTE(review): pred is the raw model output; confirm it is on the
        # same scale as the labels.
        hits += ((pred.detach() - label).abs() <= 0.2).sum().item()
        seen_values += label.numel()
    if seen_samples == 0:
        return 0.0, 0.0
    # Normalise by the samples actually visited: the sampler selects only a
    # subset of trainloader.dataset.
    return total_loss/seen_samples, 100*hits/seen_values

In [None]:
def train_model(model, criterion, optimizer, epoch):
    """Train `model` for `epoch` epochs, validating and checkpointing after each.

    Args:
        model: Network to train.
        criterion: Loss function passed to the training and evaluation loops.
        optimizer: Optimizer passed to the training and evaluation loops.
        epoch: Number of epochs to run.

    Returns:
        Dict with the per-epoch lists 't_loss', 'v_loss', 't_acc' and 'v_acc'.
    """
    print('-'*5+'Training'+'-'*5)
    stats = {
        't_loss' : [],
        'v_loss' : [],
        't_acc'  : [],
        'v_acc'  : []
    }
    # Split once, before the epoch loop. Redrawing the split every epoch lets
    # samples validated in one epoch be trained on in the next, which makes
    # the validation metrics meaningless.
    train_sampler, val_sampler = randomsampler()
    trainloader = torch.utils.data.DataLoader(TrainDataSet, batch_size=batch_size, sampler=train_sampler)
    valloader = torch.utils.data.DataLoader(TrainDataSet, batch_size=batch_size, sampler=val_sampler)
    for ep in range(epoch):
        torch.cuda.empty_cache()
        print(f'Training epoch: {ep+1}')
        t_loss, t_acc = training_loop(model, trainloader,criterion, optimizer)
        v_loss, v_acc = eval_loop(model, valloader, criterion, optimizer)
        print(f'Training loss:{t_loss} Training accuracy: {t_acc}')
        print(f'Validation loss: {v_loss} Validation accuracy: {v_acc}')
        stats['t_loss'].append(t_loss)
        stats['v_loss'].append(v_loss)
        stats['v_acc'].append(v_acc)
        stats['t_acc'].append(t_acc)
        # Overwrite the checkpoint after every epoch so an interrupted run
        # keeps its latest weights.
        torch.save(model.state_dict(), PATH+'galaxy_classifier_model_1.pt')
        print(f'Epoch {ep+1}: Model saved')
    print('Finished Training')
    return stats

In [None]:
# Run training and keep the per-epoch loss/accuracy history.
hist = train_model(model=model, criterion=criterion, optimizer=optimizer, epoch=epochs)

In [None]:
model.eval()
# One sample with a leading batch dimension is enough to trace the graph.
# Indexing the dataset directly replaces unpacking a DataLoader built over a
# single (image, label) tuple, which produced the same tensor by accident.
dummy_input = TrainDataSet[0][0].unsqueeze(0).to(device)
torch.onnx.export(model,
         dummy_input,
         PATH+"galaxy_classification_1.onnx",
         export_params=True,
         opset_version=10,
         do_constant_folding=True,
         input_names = ['modelInput'],
         output_names = ['modelOutput'],
         # Mark axis 0 as dynamic so the exported model accepts any batch size.
         dynamic_axes={'modelInput' : {0 : 'batch_size'},
                                'modelOutput' : {0 : 'batch_size'}})
print('Model has been converted to ONNX')

In [None]:
%%capture
!unzip images_test_rev1.zip

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the prediction
# order (and therefore the written CSVs) reproducible between runs.
testfiles = sorted(os.listdir(test_folder))

In [None]:
class GalaxyDataTestSet(torch.utils.data.Dataset):
    """Unlabelled images addressed by file name.

    Args:
        files: Sequence of image file names inside `image_folder`.
        image_folder: Directory containing the files.
        transform: Optional callable applied to each loaded PIL image.
    """

    def __init__(self,files, image_folder, transform = None):
        self.files = files
        self.image_folder = image_folder
        self.transform = transform

    def __len__(self):
        """Number of image files."""
        return len(self.files)

    def __getitem__(self, index):
        """Return the (optionally transformed) image at position `index`."""
        filename = self.files[index]
        path = os.path.join(self.image_folder, str(filename))
        # The context manager closes the file handle; convert() forces the
        # pixel data to load and guarantees three channels.
        with Image.open(path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image

In [None]:
# Deterministic preprocessing for inference: the same resize and crop as the
# training pipeline, but none of its random augmentations, so each test image
# yields a reproducible prediction.
test_transform = transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
testset = GalaxyDataTestSet(testfiles, test_folder, test_transform)
# batch_size stays at 1: the post-processing cell flattens each batch into a
# single prediction row.
test_dl = DataLoader(testset, batch_size=1)

In [None]:
@torch.no_grad()
def test_loop(model, testdata, loss_fn, t_gpu):
    """Run `model` over `testdata` and collect the raw outputs.

    Args:
        model: Network to run; switched to eval mode.
        testdata: Iterable yielding input batches.
        loss_fn: Unused; kept for interface compatibility.
        t_gpu: When true, each batch is moved to the GPU before the forward pass.

    Returns:
        List with one CPU output tensor per batch, in iteration order.
    """
    print('*'*5+'Testing Started'+'*'*5)
    # eval() is train(False); a single call is enough.
    model.eval()

    full_pred = []

    for data in tqdm(testdata):
        if t_gpu:
            data = data.cuda()

        # Move each result to the CPU so the predictions for the whole test
        # set do not accumulate in GPU memory.
        full_pred.append(model(data).cpu())
    return full_pred

In [None]:
# Raw model outputs for every test image, one entry per batch.
pred = test_loop(model=model, testdata=test_dl, loss_fn=criterion, t_gpu=(device == "cuda"))

In [None]:
# Flatten each batch output into a 1-D NumPy vector.
predictions = [batch_out.cpu().numpy().reshape(-1) for batch_out in pred]

def NormalizeData(data):
    """Min-max scale `data` using its global minimum and maximum.

    Args:
        data: Array-like of numbers.

    Returns:
        Array of the same shape with values in [0, 1]. Constant input maps to
        all zeros instead of dividing by zero; empty input is returned empty.
    """
    data = np.asarray(data)
    if data.size == 0:
        # np.min / np.max raise on empty arrays; there is nothing to scale.
        return data / 1
    lowest = np.min(data)
    span = np.max(data) - lowest
    # When every value is equal, data - lowest is already all zeros; dividing
    # by 1 avoids the 0/0 that would otherwise fill the result with NaN.
    return (data - lowest) / (span if span != 0 else 1)

# float64 replaces 'float128': extended precision is not available in NumPy on
# every platform and adds no information to float32 model outputs.
predictions_nm = NormalizeData(np.array(predictions)).astype(np.float64)

In [None]:
# Galaxy id of each test file: the file name without its last four characters.
gids = list(map(lambda name: int(name[:-4]), testfiles))

In [None]:
# Table of the min-max normalised predictions, with the galaxy ids as first column.
df = pd.DataFrame(data=predictions_nm, columns=train_set.columns[1:])
df.insert(loc=0, column='GalaxyID', value=gids)
df.head()

In [None]:
# Write the normalised predictions to Drive without the row index.
df.to_csv(path_or_buf=PATH + 'normalize_file.csv', index=False)

In [None]:
# Softmax across the outputs of each prediction row.
sft_pred = torch.from_numpy(np.array(predictions)).softmax(dim=1).cpu().numpy()

In [None]:
# Table of the softmax predictions, with the galaxy ids as first column.
df = pd.DataFrame(data=sft_pred, columns=train_set.columns[1:])
df.insert(loc=0, column='GalaxyID', value=gids)
df.head()

In [None]:
# Write the softmax predictions to Drive without the row index.
df.to_csv(path_or_buf=PATH + 'soft_max_file.csv', index=False)

In [None]:
# Table of the raw model outputs, with the galaxy ids as first column.
df = pd.DataFrame(data=predictions, columns=train_set.columns[1:])
df.insert(loc=0, column='GalaxyID', value=gids)
df.head()

In [None]:
# Write the raw predictions to Drive without the row index.
df.to_csv(path_or_buf=PATH + 'unchanged_file.csv', index=False)