# Exploring the Data

In [None]:
import os
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt

import PIL
from PIL import Image
from io import BytesIO

import numpy as np
import torch
from torchvision import transforms

from ipynb.fs.defs.utils import *

In [None]:
# Load the H1_O1.csv table into a DataFrame.
# NOTE(review): the absolute path is machine-specific; adjust it before running elsewhere.
df=pd.read_csv("/Users/jianggh/Desktop/Gravity Spy Dataset/Data/H1_O1.csv")
# Last expression of the cell, so the notebook displays the first five rows.
df.head()

In [None]:
# Summarise the table layout: column labels, number of columns, then the row index,
# one item per output line.
print(df.columns, len(df.columns), df.index, sep="\n")

In [None]:
# Take the 'url1' entry of the row labelled 1000 as a sample to inspect, and show it.
example_url=df.loc[1000,'url1']
print(example_url)

In [None]:
# Run the sample URL through the preprocessing chain, wrap the result in a PIL
# image and open it in the default viewer.
# NOTE(review): load_html_sample, img_cut and gray_scale are not defined in this
# file (presumably they come from the utils notebook); their exact behaviour
# should be confirmed there.
example = example_url
for _step in (load_html_sample, img_cut, gray_scale, Image.fromarray):
    example = _step(example)

example.show()

# Models and Training Utilities

In [None]:
import os
import numpy as np
from pathlib import Path

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F

from ipynb.fs.defs.utils import *

In [None]:
class ImageClassificationBase(nn.Module):
    """Base class bundling per-batch and per-epoch helpers for a classifier.

    Subclasses provide ``forward``; the helpers here call the model on a batch,
    compute cross-entropy loss, and aggregate validation results.

    NOTE(review): ``accuracy`` is not defined in this file; it is presumably
    supplied by the star-import from the utils notebook. ``validation_epoch_end``
    stacks its results with ``torch.stack``, so it must return a tensor.
    """

    def training_step(self, batch):
        """Return the cross-entropy loss of the model on one ``(images, labels)`` batch."""
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch):
        """Return detached loss and accuracy for one ``(images, labels)`` batch."""
        inputs, targets = batch
        predictions = self(inputs)
        batch_loss = F.cross_entropy(predictions, targets)
        batch_acc = accuracy(predictions, targets)
        return {'val_loss': batch_loss.detach(), 'val_acc': batch_acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch results of ``validation_step`` into plain floats."""
        mean_loss = torch.stack([step['val_loss'] for step in outputs]).mean()
        mean_acc = torch.stack([step['val_acc'] for step in outputs]).mean()
        return {'val_loss': mean_loss.item(), 'val_acc': mean_acc.item()}

    def epoch_end(self, epoch, result):
        """Print a one-line summary of the epoch's train/validation metrics."""
        template = "Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}"
        values = (result[key] for key in ('train_loss', 'val_loss', 'val_acc'))
        print(template.format(epoch, *values))


class Baseline1(ImageClassificationBase):
    """VGG-style CNN: three (conv, conv, 2x2 max-pool) stages and a 3-layer MLP head.

    The first convolution takes 3 input channels and the final linear layer
    produces 196 outputs. The head's first linear layer expects 64*32*32
    features, i.e. a 32x32 map after three halvings (256x256 input according
    to the per-stage sizes noted below).
    """

    def __init__(self):
        super().__init__()

        def conv_relu(c_in, c_out):
            # 3x3 convolution that preserves spatial size, followed by ReLU.
            return [nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1), nn.ReLU()]

        # (input channels, intermediate channels, output channels) per stage.
        # Resulting maps for a 256x256 input: 16x128x128, 32x64x64, 64x32x32.
        stage_channels = ((3, 8, 16), (16, 32, 32), (32, 64, 64))

        modules = []
        for c_in, c_mid, c_out in stage_channels:
            modules += conv_relu(c_in, c_mid)
            modules += conv_relu(c_mid, c_out)
            modules.append(nn.MaxPool2d(2, 2))

        # Classifier head on the flattened 64 x 32 x 32 feature map.
        modules += [
            nn.Flatten(),
            nn.Linear(64*32*32, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 196),
        ]

        # Same module order as a literal nn.Sequential, so state_dict keys
        # ("network.0.weight", ...) are unchanged.
        self.network = nn.Sequential(*modules)

    def forward(self, xb):
        """Apply the full network to a batch of images."""
        return self.network(xb)

class Baseline2(nn.Module):
    """Eight-stage CNN with (1, k) kernels followed by a two-layer classifier.

    Each stage is Conv2d -> ELU(0.01) -> BatchNorm2d; stages 0, 4 and 7 are
    followed by a max-pool along the last axis. The head flattens to 20224
    features and produces 2 outputs. All modules live in ``self.layers`` in
    application order.
    """

    def __init__(self):
        super().__init__()

        # (output channels, kernel width) of each convolution; kernels are (1, width).
        conv_specs = [
            (8, 32),
            (16, 16), (16, 16), (32, 16),
            (64, 8), (64, 8),
            (128, 4), (128, 4),
        ]
        # Stage index -> pooling window (also used as the stride) applied after that stage.
        pool_windows = {0: [1, 8], 4: [1, 6], 7: [1, 4]}

        modules = []
        in_channels = 1  # the first convolution expects a single input channel
        for stage, (out_channels, width) in enumerate(conv_specs):
            # Unit stride/dilation, no padding, bias enabled: the Conv2d defaults.
            modules.append(nn.Conv2d(in_channels, out_channels, kernel_size=(1, width)))
            # ELU activation with alpha = 0.01.
            modules.append(nn.ELU(0.01))
            # Batch normalisation over the stage's output channels.
            modules.append(nn.BatchNorm2d(out_channels))
            if stage in pool_windows:
                window = pool_windows[stage]
                modules.append(nn.MaxPool2d(kernel_size=window, stride=list(window), padding=0))
            in_channels = out_channels

        # Classifier head: flatten, 20224 -> 64, ELU, dropout (p=0.5), 64 -> 2.
        modules += [
            nn.Flatten(),
            nn.Linear(20224, 64),
            nn.ELU(0.01),
            nn.Dropout(0.5),
            nn.Linear(64, 2),
        ]

        self.layers = nn.ModuleList(modules)

    def forward(self, x):
        """Feed ``x`` through every module of ``self.layers`` in order."""
        out = x
        for module in self.layers:
            out = module(out)
        return out

class Baseline3(nn.Module):
    """Six-stage CNN for single-channel images with a 24-way softmax output.

    Every feature stage is Conv2d(5x5, padding 2) -> ReLU -> MaxPool2d ->
    Dropout2d(0.5). The classifier flattens a 64 x 2 x 3 map, which is what a
    960 x 1150 input reduces to after the six poolings.

    NOTE(review): the classifier already ends in Softmax; a loss that applies
    log-softmax itself (e.g. ``nn.CrossEntropyLoss``) would normalise twice.
    Confirm which loss this model is meant to be paired with.
    """

    def __init__(self):
        super().__init__()

        # (input channels, output channels, pooling window) for each stage.
        # Expected input: 960 x 1150.
        stage_specs = (
            (1, 16, 3),
            (16, 32, 3),
            (32, 64, 3),
            (64, 64, 3),
            (64, 64, 2),
            (64, 64, 2),
        )

        feature_modules = []
        for c_in, c_out, pool in stage_specs:
            feature_modules.extend([
                nn.Conv2d(c_in, c_out, kernel_size=5, padding=2),
                nn.ReLU(),
                nn.MaxPool2d(pool),
                nn.Dropout2d(0.5),
            ])
        # Feature map is 2 x 3 (with 64 channels) at this point.
        self.features = nn.Sequential(*feature_modules)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 2 * 3, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 24),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return class probabilities for a batch of images."""
        return self.classifier(self.features(x))

In [None]:
class Accumulator:
    """Keep running float sums for ``n`` quantities that are updated together."""

    def __init__(self, n):
        # One zero-initialised slot per tracked quantity.
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each argument (converted with ``float``) to the slot at the same position.

        Slots and arguments are paired with ``zip``, so the stored list is
        truncated to the shorter of the two.
        """
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every current slot back to zero."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running sum (or slice of sums) at ``idx``."""
        return self.data[idx]


def evaluate_accuracy_gpu(net, data_iter, loss_func, device=None):
    """Evaluate ``net`` on every batch of ``data_iter`` without tracking gradients.

    Args:
        net: model to evaluate; it is switched to eval mode and left there.
        data_iter: iterable of ``(X, y)`` batches, ``y`` holding class indices.
        loss_func: callable ``loss_func(y_hat, y)``. If it exposes a
            ``reduction`` attribute (as ``torch.nn`` loss modules do) that
            attribute decides how batch losses are combined; a callable
            without it is assumed to return a batch mean.
        device: device the batches are moved to. When ``None`` the device of
            the model's first parameter is used; if the model has no
            parameters the batches stay where they are.

    Returns:
        ``(accuracy, mean_loss, all_preds, all_labels)`` where ``accuracy`` is
        the fraction of correct predictions, ``mean_loss`` is the loss averaged
        per sample, and the two lists hold the predicted / true class of every
        sample in iteration order.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    all_preds = []
    all_labels = []

    if device is None:
        first_param = next(iter(net.parameters()), None)
        if first_param is not None:
            device = first_param.device

    # A mean-reduced loss must be weighted by the batch size before summing;
    # otherwise the final division by the sample count under-reports the loss
    # by roughly a factor of the batch size.
    mean_reduced = getattr(loss_func, 'reduction', 'mean') == 'mean'

    num_correct = 0
    num_samples = 0
    loss_total = 0.0

    net.eval()
    with torch.no_grad():
        for X, y in data_iter:
            if device is not None:
                X = X.to(device)
                y = y.to(device)
            X = X.to(torch.float)
            y = y.to(torch.long)

            y_hat = net(X)
            loss = loss_func(y_hat, y)

            batch_size = y.numel()
            if mean_reduced and loss.dim() == 0:
                loss_total += float(loss) * batch_size
            else:
                # reduction='sum' (scalar) or reduction='none' (per-sample values)
                loss_total += float(loss.sum())

            # Compute the predictions once and reuse them for both outputs.
            preds = torch.argmax(y_hat, dim=1)
            num_correct += (preds == y).sum().item()
            num_samples += batch_size

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    return num_correct / num_samples, loss_total / num_samples, all_preds, all_labels

def num_accurate(outputs, labels):
    """Count the rows of ``outputs`` whose largest entry sits at the index in ``labels``.

    Returns a plain Python number obtained with ``.item()``.
    """
    predicted = torch.max(outputs, dim=1).indices
    matches = predicted == labels
    return matches.sum().item()


def save_model(epoch, model, optimizer, scheduler, checkpoint_dir, train_loss_history, filename):
    """Write a training checkpoint and keep it as the only ``.pt`` file in the folder.

    The checkpoint stores the model and optimizer state, the optimizer class
    name, the epoch, the loss history and, when a scheduler is given, its
    state and class name. The loss history is also written to
    ``train_loss_history_cnn.npy`` in the same folder.

    Args:
        epoch: epoch number recorded in the checkpoint.
        model: module whose ``state_dict`` is saved.
        optimizer: optimizer whose ``state_dict`` is saved.
        scheduler: learning-rate scheduler, or ``None``.
        checkpoint_dir: target folder; created if missing.
        train_loss_history: loss history to persist.
        filename: checkpoint file name; must end in ``.pt``.

    Raises:
        ValueError: if ``filename`` does not end in ``.pt``.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not str(filename).endswith('.pt'):
        raise ValueError(f"checkpoint filename must end with '.pt', got {filename!r}")

    p = Path(checkpoint_dir)
    p.mkdir(parents=True, exist_ok=True)

    np.save(p / 'train_loss_history_cnn', train_loss_history)

    output = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'optimizer_type': type(optimizer).__name__,
        'epoch': epoch,
        # load_model reads this key back from the checkpoint.
        'train_loss_history': train_loss_history,
    }

    if scheduler is not None:
        output['scheduler_state_dict'] = scheduler.state_dict()
        output['scheduler_type'] = type(scheduler).__name__

    target = p / filename
    torch.save(output, target)

    # Remove older checkpoints only after the new one is safely on disk, so a
    # failed save never leaves the folder without a checkpoint.
    for stale in p.glob('*.pt'):
        if stale != target and stale.is_file():
            stale.unlink()

def load_model(checkpointdir):
    """Create a ``MyNet`` and, if a checkpoint folder is given, restore training state.

    Args:
        checkpointdir: folder containing a ``.pt`` checkpoint written by
            ``save_model``, or ``None`` to start from scratch.

    Returns:
        * ``(net, 0, [])`` when ``checkpointdir`` is ``None``;
        * ``None`` when the folder does not exist or holds no ``.pt`` file
          (a message is printed);
        * ``(net, optimizer, scheduler, epoch, train_loss_history)`` otherwise,
          where ``optimizer`` / ``scheduler`` are ``None`` if the checkpoint
          did not record their type.

    NOTE(review): the three return shapes differ in length; callers must
    distinguish them. ``MyNet`` is not defined in this file — confirm where it
    comes from.
    """
    import inspect

    net = MyNet()

    if checkpointdir is None:
        return net, 0, []

    p = Path(checkpointdir)
    if not p.is_dir():
        print('Checkpoint Error')
        return None

    candidates = [f for f in p.iterdir() if f.name.endswith('.pt')]
    if not candidates:
        print('No model file found')
        return None

    # Directory listing order is arbitrary; prefer the most recently written file.
    checkpoint_path = max(candidates, key=lambda f: f.stat().st_mtime)
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    net.load_state_dict(checkpoint['model_state_dict'])

    optimizer = None
    scheduler = None

    optimizer_type = checkpoint.get('optimizer_type')
    if optimizer_type:
        optimizer = getattr(torch.optim, optimizer_type)(net.parameters())

        scheduler_type = checkpoint.get('scheduler_type')
        if scheduler_type:
            scheduler_cls = getattr(torch.optim.lr_scheduler, scheduler_type)
            scheduler_state = checkpoint['scheduler_state_dict']
            # Many schedulers have required constructor arguments (e.g. T_max
            # for CosineAnnealingLR). Recover them from the saved state, where
            # they are stored under the same names.
            skipped = {'self', 'optimizer', 'last_epoch', 'verbose'}
            init_kwargs = {
                name: scheduler_state[name]
                for name in inspect.signature(scheduler_cls.__init__).parameters
                if name in scheduler_state and name not in skipped
            }
            # Build the scheduler BEFORE loading the optimizer state: its
            # constructor resets the optimizer's learning rates, which would
            # otherwise overwrite the restored values.
            scheduler = scheduler_cls(optimizer, **init_kwargs)

        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if scheduler is not None:
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch = checkpoint['epoch']

    # Checkpoints may lack the history; fall back to the .npy file that
    # save_model writes next to the checkpoint.
    train_loss_history = checkpoint.get('train_loss_history')
    if train_loss_history is None:
        history_file = p / 'train_loss_history_cnn.npy'
        train_loss_history = np.load(history_file).tolist() if history_file.is_file() else []

    print('Load Successful')

    return net, optimizer, scheduler, epoch, train_loss_history

def train(net, lr, epoch, total_epochs, dataset_train, data_loader, test_iter, train_loss_history, checkpoint_dir, device):
    '''
    Train ``net`` with Adam and cosine-annealed learning rate, evaluating after every epoch.

    net: model instance
    lr: learning rate
    epoch: number of epochs that have already been trained
    total_epochs: number of epochs to train in this call
    dataset_train: training dataset (not used inside this function)
    data_loader: DataLoader instance that yields the training batches
    test_iter: DataLoader instance that yields the test batches
    train_loss_history: list; one row
        [epoch, train loss, test loss, train accuracy, test accuracy]
        is appended per epoch
    checkpoint_dir: folder where checkpoints and the loss history are written
    device: device used for training

    Returns the updated train_loss_history.

    Raises ValueError if data_loader yields no samples.

    NOTE(review): labels are cast to torch.long class indices, so the loss is
    nn.CrossEntropyLoss (MSELoss cannot be applied to integer class targets).
    CrossEntropyLoss expects raw logits; a network that already ends in
    Softmax (as Baseline3 in this file does) is normalised twice — confirm
    whether that final Softmax should stay.
    '''

    net.to(device)

    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                        optimizer,
                        T_max=total_epochs,
                    )

    start_epoch = epoch
    # Exactly total_epochs passes, continuing the numbering after start_epoch.
    for epoch in range(start_epoch, start_epoch + total_epochs):

        torch.cuda.empty_cache()

        # Slots: summed per-sample loss, number of correct predictions, number of samples.
        metric = Accumulator(3)

        net.train()

        for x, y in data_loader:

            optimizer.zero_grad()

            data = x.to(device, non_blocking=True).to(torch.float)
            label = y.to(device, non_blocking=True).to(torch.long)

            pred = net(data)

            loss = loss_func(pred, label)

            loss.backward()

            optimizer.step()

            # The loss is a batch mean: weight it by the batch size so that
            # dividing by the sample count below gives a per-sample average.
            batch_size = x.shape[0]
            metric.add(loss.item() * batch_size, num_accurate(pred, label), batch_size)

        if metric[2] == 0:
            raise ValueError('data_loader yielded no samples')

        train_l = metric[0] / metric[2]
        train_acc = metric[1] / metric[2]

        scheduler.step()

        # evaluate_accuracy_gpu also returns predictions and labels; they are not needed here.
        test_acc, test_l, _, _ = evaluate_accuracy_gpu(net, test_iter, loss_func, device)

        train_loss_history.append([epoch+1, train_l, test_l, train_acc, test_acc])

        # Checkpoint whenever the test loss (column 2) is the best seen so far.
        if test_l <= min(row[2] for row in train_loss_history):
            save_model(epoch, net, optimizer, scheduler,
                       checkpoint_dir=checkpoint_dir,
                       train_loss_history=train_loss_history,
                       filename=f'model_e{epoch}.pt',)

        print('Epoch: ' + str(epoch))
        print(f'loss {train_l:.4f}, train acc {train_acc:.3f}, '
              f'test acc {test_acc:.3f}')

    return train_loss_history

# Training

In [None]:
import os
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt

import PIL
from PIL import Image
from io import BytesIO

import numpy as np
import torch
from torchvision import transforms

from ipynb.fs.defs.utils import *

import os
import numpy as np
from pathlib import Path

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F

from ipynb.fs.full.utils import *

In [None]:
'''
from google.colab import drive
drive.mount('/content/drive')
'''

def concat_csv_in_folder(folder_path):
    """Read every ``.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted name order so the row order does not depend on
    the file system's listing order, which keeps later seeded shuffles and
    splits reproducible. The result gets a fresh 0..n-1 index.

    Raises:
        ValueError: if the folder contains no ``.csv`` file.
    """
    csv_files = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith('.csv')
    )
    if not csv_files:
        raise ValueError(f"No .csv files found in {folder_path!r}")
    return pd.concat((pd.read_csv(path) for path in csv_files), ignore_index=True)

## directory
data = concat_csv_in_folder("/Users/jianggh/Desktop/Gravity Spy Dataset/Data")

# Keep only the columns used downstream and drop incomplete rows. dropna()
# returns a new frame, so the result has to be assigned back; it is also an
# independent copy, which makes the column assignment below safe.
data = data[['ml_label','ml_confidence','url1','url2','url3','url4']].dropna()


#Change ml_label column into one hot expression.
# (The one-hot conversion itself is disabled in the string literal below;
# only the list of distinct labels is computed.)
unique_values = data['ml_label'].unique()

'''
one_hot_vectors = {val: np.zeros(len(unique_values)) for val in unique_values}
for i, val in enumerate(unique_values):
    one_hot_vectors[val][i] = 1

data['ml_label'] = data['ml_label'].apply(lambda x: one_hot_vectors[x])

#one hot into tensor. The loss function MSELoss in PyTorch accepts two tensor type.


data['ml_label'] = data['ml_label'].apply(lambda x: torch.tensor(x, dtype=torch.float32))
'''
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
data.info()

data['ml_confidence'] = data['ml_confidence'].astype(float)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable for the same input frame.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Folder passed to train() as checkpoint_dir.
# NOTE(review): this is the same folder the CSV files are read from, and
# save_model removes files whose name contains '.pt' there — confirm that
# sharing the folder is intended.
checkpoint_dir = '/Users/jianggh/Desktop/Gravity Spy Dataset/Data'

# Model instance to train.
net = Baseline3()

# Learning rate and number of epochs handed to train().
lr = 0.003
total_epochs = 30

In [None]:
# Wrap the train / test frames in dataset objects for the DataLoaders.
# NOTE(review): GS_Simple_Dataset3 is not defined in this file (presumably it
# comes from the utils notebook); what each item contains should be confirmed there.
train_dataset = GS_Simple_Dataset3(train_data)
test_dataset = GS_Simple_Dataset3(test_data)

In [None]:
# Batch both datasets in groups of 16 with shuffling enabled.
# NOTE(review): the test loader is shuffled too, so per-sample prediction order
# differs between evaluation runs — confirm this is acceptable.
traindl = DataLoader(train_dataset, batch_size = 16, shuffle = True)
testdl = DataLoader(test_dataset, batch_size = 16, shuffle = True)

import os
# CUDA allocator option, set through the environment.
# NOTE(review): presumably this has to be set before the first CUDA allocation
# to take effect — confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Start training from epoch 0 with an empty history; keep whatever train() returns.
history = train(net, lr, epoch = 0, total_epochs = total_epochs, dataset_train = train_dataset, data_loader = traindl, test_iter = testdl, train_loss_history = [],
                checkpoint_dir = checkpoint_dir, device = device)

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
# Fall back to None instead of raising IndexError when no GPU is present.
gpu = GPUs[0] if GPUs else None
def printm():
    """Print free system RAM, this process's resident size and, if a GPU was found, its memory use."""
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    if gpu is None:
        print("GPU: none detected")
        return
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

### Analysis and Visualization

In [None]:
# Evaluate the trained network on both loaders. The test-set evaluation is
# done here as well because test_preds / test_labels are needed below and are
# not produced anywhere else in this notebook.
eval_loss = nn.CrossEntropyLoss()
trainacc, trainloss, train_preds, train_labels = evaluate_accuracy_gpu(net, traindl, eval_loss, device=device)
testacc, testloss, test_preds, test_labels = evaluate_accuracy_gpu(net, testdl, eval_loss, device=device)

# Pool predictions and labels of both splits (test first, then train).
all_preds = test_preds + train_preds
all_labels = test_labels + train_labels

# Number and fraction of samples whose prediction matches the label.
n = sum(1 for pred, true in zip(all_preds, all_labels) if pred == true)
print(n, n / len(all_labels))

In [None]:
num_classes = 24
class_ids = list(range(num_classes))

# Raw confusion matrix: called as metric(preds, target).
# NOTE(review): ConfusionMatrix and the *_score helpers are not imported in
# this file; presumably torchmetrics / scikit-learn names provided by the
# utils notebook — confirm.
confusion_metric = ConfusionMatrix(task='multiclass', num_classes=num_classes)
confusion_matrix = confusion_metric(torch.tensor(all_preds), torch.tensor(all_labels))

# Normalise each row to sum to 1. Rows that are entirely zero (classes with no
# samples) are divided by 1 instead of 0 so they stay 0 rather than becoming NaN.
confusion_matrix_row_normalized = confusion_matrix.float()
row_sums = confusion_matrix_row_normalized.sum(dim=1, keepdim=True)
confusion_matrix_row_normalized = confusion_matrix_row_normalized / row_sums.clamp(min=1)
confusion_matrix_row_normalized = confusion_matrix_row_normalized.numpy()

# Per-class scores. Passing labels= fixes the output length to num_classes
# even if some class never occurs, and zero_division=0 reports such classes
# as 0 instead of emitting warnings.
acc = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
recall = recall_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
f1 = f1_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)

In [None]:
# Tick labels: the distinct ml_label values, plus two extra row names for the
# appended recall / precision rows.
# NOTE(review): unique() returns labels in order of first appearance; confirm
# that this order matches the integer class ids used for the confusion matrix.
labels = data['ml_label'].unique()
ylabels = list(labels)
ylabels.extend(['Recall', 'Precision'])
sns.set(style="white")

plt.figure(figsize=(15, 12))

# Turn the per-class score vectors into single rows and stack them under the
# row-normalised confusion matrix.
recall_matrix = recall.reshape(1,-1)
precision_matrix = precision.reshape(1,-1)
extended_matrix = np.vstack((confusion_matrix_row_normalized, recall_matrix, precision_matrix))


# Annotated heatmap of the stacked matrix, values shown with two decimals.
ax = sns.heatmap(extended_matrix, annot=True, cmap='viridis', fmt=".2f",
                 xticklabels=labels, yticklabels=ylabels,
                 cbar_kws={'label': 'Value'},annot_kws={'size': 10, 'color': 'black'})

ax.set_xlabel('Predicted Class')
ax.set_ylabel('True Class')

plt.show()

In [None]:
data = pd.read_csv("/share/GS Testing Data/0.samples.csv")

grouped = data.groupby('ml_label')

# For every label draw 100 rows (with replacement) with confidence <= 0.90 and
# 0 rows with confidence > 0.90. The pieces are collected first and
# concatenated once instead of growing a DataFrame inside the loop.
sampled_parts = []
for label, group in grouped:
    part1 = group[group['ml_confidence'] <= 0.90].sample(n=100, random_state=42,replace=True)
    part2 = group[group['ml_confidence'] > 0.90].sample(n=0, random_state=42)
    sampled_parts.extend([part1, part2])
result_df = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

from tqdm import tqdm

# SECURITY NOTE(review): this loads a fully pickled model object, which
# executes code contained in the file — only load files from a trusted source.
# Recent PyTorch versions may also require weights_only=False for such files;
# confirm against the installed version.
# map_location places the weights on the same device the batches are moved to.
model = torch.load('/share/GS Testing Data/model_6.19_3epochs0.01.pt', map_location=device)
# NOTE(review): GS_Simple_Dataset and dic are not defined in this file — confirm their origin.
umapdataset = GS_Simple_Dataset(result_df,dic)
dataloader = DataLoader(umapdataset, batch_size=32, shuffle=False)

last_conv_outputs = []
labels = []

def hook_fn(module, inputs, output):
    """Forward hook: store the hooked layer's output as a NumPy array."""
    last_conv_outputs.append(output.detach().cpu().numpy())

# Third module from the end of model.features.
last_conv_layer = model.features[-3]

hook = last_conv_layer.register_forward_hook(hook_fn)

model.eval()

# Distinct loop names so the DataFrame bound to `data` above is not overwritten
# by a batch tensor; try/finally removes the hook even if a batch fails.
try:
    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(dataloader):
            batch_inputs = batch_inputs.to(device).to(torch.float)
            batch_labels = batch_labels.to(torch.long)
            _ = model(batch_inputs)
            labels.extend(batch_labels.cpu().numpy())
finally:
    hook.remove()

# One row per sample: flatten each stored activation and stack all batches.
last_conv_outputs = np.concatenate([arr.reshape(arr.shape[0], -1) for arr in last_conv_outputs], axis=0)

In [None]:
from sklearn.manifold import TSNE

# Project the extracted activations to two dimensions.
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter — confirm
# against the installed version.
tsne = TSNE(n_components=2, perplexity=49.0, n_iter=1000)
data_tsne = tsne.fit_transform(last_conv_outputs)

plt.figure(figsize=(10, 5))

# One colour per distinct label, taken evenly from the tab20b colour map.
unique_labels = np.unique(labels)
colors = plt.cm.tab20b(np.linspace(0, 1, len(unique_labels)))

for label, color in zip(unique_labels, colors):
    indices = np.where(labels == label)[0]
    class_points = data_tsne[indices]

    # Scatter the class, then write the label on top of each of its points.
    plt.scatter(class_points[:, 0], class_points[:, 1], s=100, color=color, label=str(label), alpha=0.9)
    for x, y in class_points:
        plt.text(x, y, str(label), fontsize=6, ha='center', va='center')

# Hide the axis ticks.
plt.xticks([])
plt.yticks([])
plt.title('TSNE')

In [None]:
from umap import UMAP

# Load previously saved activations and their labels.
data = np.load('/Users/jianggh/Desktop/lastconvoutputs.npy')
labels = np.load('/Users/jianggh/Desktop/labels.npy')

# Reduce to two dimensions with UMAP.
reducer = UMAP(n_components=2, n_neighbors=40, min_dist=0.1)
data_umap = reducer.fit_transform(data)
data_umap.shape

import matplotlib.pyplot as plt

# Row indices of every distinct label.
unique_labels = np.unique(labels)
label_indices = {}
for label in unique_labels:
    label_indices[label] = np.where(labels == label)[0]

# Plot the points of each class.
for label, indices in label_indices.items():
    xs, ys = data_umap[indices, 0], data_umap[indices, 1]
    plt.scatter(xs, ys, label=str(label))

# Add the legend.
plt.legend()

# Set the title and axis labels.
plt.title('UMAP Visualization')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')

# Show the figure.
plt.show()