In [2]:
import os
import numpy as np
import re
from glob import glob
import matplotlib.pyplot as plt
import pandas as pd
import gc

import PIL
import torch
import torchvision

from torch.utils.data import Dataset, DataLoader
import cv2
from PIL import Image

import torchvision.models as models
import torch.nn as nn
from torch.nn import Linear, ReLU, CrossEntropyLoss, Conv2d, MaxPool2d, Module, Flatten
from torch.optim import Adam
from tqdm.notebook import tqdm as tqdm
from ipywidgets import IntProgress

import json

In [12]:
class Dataset(Dataset):
    def __init__(self, x, y, transform):
        self.x = x
        self.y = y
        self.transform = transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        img = cv2.imread(self.x[idx], cv2.IMREAD_COLOR)
        if img is None:
            print('Not found img : ', self.x[idx])
        img = Image.fromarray(img)
        img = self.transform(img)
        return img, self.y[idx]

In [48]:
def load_nail_csv(folder='./ML_hw2/學生的training_data/'):
    """
    intro:
        先存入原圖位置，壓縮照片後再存入./ML_hw2/學生的training_data/resize/
        回傳 path , label
    aug:
        folder = 讀入資料之目的資料夾
        batch_size = batch_size
    output:
        path: 照片路徑 : ./ML_hw2/resize/id
        label: 標籤  : float number
    """
    path = []
    label = []
    slice_csv = re.sub('學生的', "" ,folder.split('/')[-2] ) #提取training_data或test_data
    csv_path = f'{folder}{slice_csv}.csv'
    resize_folder = f'{folder}resize/'
    if not os.path.isdir(resize_folder):
        os.makedirs(resize_folder)
    with open(csv_path, 'r', encoding='utf8') as f:        
        f.readline()
        for line in tqdm(f):
            clean_line = line.replace('\n', '').replace('\ufeff', '').split(',')
            # [id, light, ground_truth, grade]
            curr_img_path = f'{folder}{clean_line[1]}/{clean_line[0]}'
            new_img_path = f'{resize_folder}{clean_line[0]}'
            if not os.path.isfile(curr_img_path):
                print(f'No file for path : {curr_img_path}')
                continue
            #將未處理照片存入新資料夾位置：./ML_hw2/resize/
            if not os.path.isfile(new_img_path):
                img = cv2.imread(curr_img_path, cv2.IMREAD_COLOR)
                img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA)
                cv2.imwrite(new_img_path, img)
            path.append(new_img_path)
            label.append(float(clean_line[2]))

    print('data size: ')
    print(len(path), len(label))
    print(path[:3])
    print(label[:3])
    print(type(path),type(label))
    print()
    return path, label

In [14]:
def dataloader_prepare(folder='./ML_hw2/學生的training_data/', batch_size=8):
    """
    intro:
        使用load_nail_csv準備照片
        Dataset轉為dataset型式
        切 train, validation set , DataLoader存入
    aug:
        folder = 讀入資料之目的資料夾
        batch_size = batch_size
    output:
        train_dataloader, valid_dataloader
    """

    transform = torchvision.transforms.Compose([
        #torchvision.transforms.Resize((224,224)),
        torchvision.transforms.RandomHorizontalFlip(p = 0.5),
        torchvision.transforms.RandomRotation(15, resample=PIL.Image.BILINEAR),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    path, label = load_nail_csv(folder)
    augment_dataset = Dataset(path, label, transform)
    
    #切分70%當作訓練集、30%當作驗證集
    train_size = int(0.7 * len(augment_dataset))
    valid_size = len(augment_dataset) - train_size
    train_data, valid_data = torch.utils.data.random_split(augment_dataset, [train_size, valid_size])
    
    train_dataloader = DataLoader( train_data , batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader( valid_data , batch_size=batch_size, shuffle=True)
    
    return train_dataloader, valid_dataloader

In [15]:
def train(model,name,n_epochs,train_loader,valid_loader,optimizer,criterion,batch_size,patience):
    """
    intro:
        每次epoch都 train the model , validate the model
        印出 train_loss , val_loss 
        回傳 model
    aug:
        model,n_epochs,train_loader,valid_loader,optimizer,criterion,batch_size
    output:
        model
    """
    print(f'Start to run {name}')
#     best_train_loss = 100
#     best_train_acc = 0
#     best_val_loss = 100
#     best_val_acc = 0
#     best_F1 = 0
#     last_epoch = 0

    history = {
        'train_loss':[],
        'valid_loss':[],
    }
    
    if torch.cuda.is_available():
        model.cuda()
    else:
        print('no gpu use')
    for epoch in range(1, n_epochs+1):
        # keep track of training and validation loss
        train_loss,valid_loss = 0.0,0.0
        train_losses,valid_losses=[],[]

        print(f'running epoch: {epoch}/{n_epochs}')
        #############################################################################################################
        #                                              train the model                                              #
        #############################################################################################################
        model.train()
        for num, (data, target) in enumerate(train_loader):
            # move tensors to GPU if CUDA is available
            if torch.cuda.is_available():#train_on_gpu
                data, target = data.cuda(), target.cuda()
            else:
                print('1')
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output.flatten(), target.float())
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update training loss
            train_losses.append(loss.item())
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            
            if num%10 == 0 :
                print(f'train stage：{num}/{len(train_loader)}', end='\r')
        #############################################################################################################
        #                                            validate the model                                             #
        #############################################################################################################
        model.eval()
        for num, (data, target) in enumerate(valid_loader):
            # move tensors to GPU if CUDA is available
            if torch.cuda.is_available():#train_on_gpu
                data, target = data.cuda(), target.cuda()
                
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output.flatten(), target.float())
            # update validation loss
            valid_losses.append(loss.item())
                        
            if num%10 == 0 :
                print(f'Valid stage：{num}/{len(valid_loader)}', end='\r')
        #############################################################################################################
        #                                     print train/val/cmt epoch result                                      #
        #############################################################################################################
        # calculate average losses
        train_loss=np.average(train_losses)
        valid_loss=np.average(valid_losses)
        print(f'Training Loss: {train_loss:.3f} \tValidation Loss: {valid_loss:.3f}\n')
        
        history['train_loss'].append(train_loss)
        history['valid_loss'].append(valid_loss)
        
    
        #############################################################################################################
        #                                                Draw picture                                               #
        #############################################################################################################
    
    with open(f'./result/{name}/result.json', 'w') as json_file:
        json.dump(history, json_file)
    
    x = np.arange(1,n_epochs+1,1)
    train_loss = history['train_loss']
    valid_loss = history['valid_loss']
    
    fig = plt.figure(figsize=(12,4))
    fig.subplots_adjust(hspace=0.4, wspace=0.3)

    plt.subplot(1,1,1)
    plt.title(f"{name} Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.plot(x, train_loss, label='training')
    plt.plot(x, valid_loss, label='validation')
    plt.legend(loc='upper right')
    
    #save result
    plt.savefig(f'./result/{name}/plot.png')
    
    return model

In [16]:
def testing_result(model): 
    """
    intro:
        讀取'./ML_hw2/學生的testing_data/'
        並將照片處理，拿原本模型預測後輸出文件
    aug:
        model
    result:
        ./HW3_E24056954.csv
    """
    #############################################################################################################
    #                                   loading and resize testing picture                                      #
    #############################################################################################################
    testing_path = []
    testing_write = []
    folder = './ML_hw2/學生的testing_data/'
    slice_csv = 'testing_data'#提取testing_data
    csv_path = './HW2_E24056954.csv'
    resize_folder = f'{folder}resize/'
    if not os.path.isdir(resize_folder):
        os.makedirs(resize_folder)
    with open(csv_path, 'r', encoding='utf8') as f:   
        testing_write.append(f.readline())
        for line in f:
            clean_line = line.replace('\n', '').replace('\ufeff', '').split(',')
            # [id, light, ground_truth, grade]
            testing_write.append(clean_line)
            curr_img_path = f'{folder}{slice_csv}/{clean_line[0]}'
            new_img_path = f'{resize_folder}{clean_line[0]}'
            if not os.path.isfile(curr_img_path):
                print(curr_img_path)
                print('catch')
                continue
            if not os.path.isfile(new_img_path):
                img = cv2.imread(curr_img_path, cv2.IMREAD_COLOR)
                img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA)
                cv2.imwrite(new_img_path, img)
            testing_path.append(new_img_path)
    print('data size: ')
    print(f'testing數量：{len(testing_path)}')
    
    #############################################################################################################
    #                                   use hypothesis model predict testing set                                #
    #############################################################################################################
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
    model.eval()
    pred_regression=[]
    for path in testing_path:
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        img = transform(img).cuda()
        img = img.unsqueeze(0)
        with torch.no_grad(): 
            output=model(img)
        output = float(output.squeeze(0)[0])
        pred_regression.append(output)
        print(f'{path} / {output}')
    #############################################################################################################
    #                                             output require csv                                            #
    #############################################################################################################
    with open('HW3_E24056954.csv', 'w', encoding='utf8') as wp:
        wp.write(testing_write[0])
        for pred_regression_,testing_write_ in zip(pred_regression,testing_write[1:]):
            wp.write(f'{testing_write_[0]},{testing_write_[1]},{pred_regression_},{testing_write_[3]}\n')

# Try wide_resnet50_2 finetune

In [17]:
name = 'HW3_model_ft_wide_resnet50_2'
model_ft = models.wide_resnet50_2(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs,1)

# Try vgg19 finetune

In [None]:
# name = 'HW3_model_ft_vgg19'
# model_ft = models.vgg19(pretrained=True)
# num_ftrs = model_ft.classifier[6].in_features
# model_ft.classifier[6] = nn.Linear(num_ftrs,1)

In [18]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print('GPU State:', device)
model_ft=model_ft.to(device)# 放入裝置
model_ft = model_ft.float()
n_epochs = 12
batch_size = 32
train_dataloader, valid_dataloader = dataloader_prepare(folder='./ML_hw2/學生的training_data/',batch_size = batch_size)
optimizer = torch.optim.Adam([
    {'params':model_ft.parameters()}
], lr=0.0001)
criterion = nn.MSELoss()
patience = 3
if not os.path.isdir(f'./result/{name}/'):
    os.makedirs(f'./result/{name}/')
model_ft = train(model_ft,
      name,
      n_epochs,
      train_dataloader,
      valid_dataloader,
      optimizer,
      criterion,
      batch_size,
      patience)

GPU State: cuda:0


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>

Start to run HW3_model_ft_wide_resnet50_2
running epoch: 1/12
train stage：0/45

RuntimeError: CUDA out of memory. Tried to allocate 14.00 MiB (GPU 0; 10.76 GiB total capacity; 3.79 GiB already allocated; 8.25 MiB free; 3.87 GiB reserved in total by PyTorch)

# Print testing data result

In [None]:
testing_result(model_ft)

# test CNN + XGBoost

In [132]:
from sklearn.model_selection import train_test_split
import pandas as pd
import xgboost as xg 
from sklearn.metrics import mean_squared_error as MSE 

model_cnn = torch.load('./result/HW3_model_ft_wide_resnet50_2/model.pt')
model_cnn_extractor = nn.Sequential(*list(model_cnn.children())[:-1]) # strips off last linear layer



def trainsform_data( address_set , transform , model):
    data = []
    for add in tqdm(address_set):
        img = cv2.imread(add, cv2.IMREAD_COLOR)
        img = transform(img).cuda()
        img = img.unsqueeze(0)
        with torch.no_grad(): 
            output=model(img)
        output = output.squeeze(0).squeeze(1).squeeze(1).tolist()
        data.append(output)
    return pd.DataFrame(data)

for seed in range(10):
    seed = seed*10
    add_x , label_y = load_nail_csv(folder='./ML_hw2/學生的training_data/')
    train_add_x , test_add_x , train_y , test_y = train_test_split(add_x,label_y,test_size=0.2,random_state=seed)

    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    train_X = trainsform_data( train_add_x , transform , model_cnn_extractor)
    test_X = trainsform_data( test_add_x , transform , model_cnn_extractor)
    train_y = pd.DataFrame(train_y)
    test_y = pd.DataFrame(test_y)
#     print(f'train_X len:{len(train_X)}')
#     print(f'train_y len:{len(train_y)}')
#     print(f'test_X len:{len(test_X)}')
#     print(f'test_y len:{len(test_y)}')

#     print(f'train_X type:{type(train_X)}')
#     print(f'train_y type:{type(train_y)}')
#     print(f'test_X type:{type(test_X)}')
#     print(f'test_y type:{type(test_y)}')

    # Instantiation 
    xgb_r = xg.XGBRegressor(objective ='reg:linear', 
                      n_estimators = 100, seed = 50) 

    # Fitting the model 
    xgb_r.fit(train_X, train_y) 


    # Predict the model 
    pred = xgb_r.predict(test_X) 

    # for pred_ , test_ in zip(pred,test_y[0]):
    #     print(f'pred , test : {pred_:.3f}\t{test_}')

    # RMSE Computation 
    mse = MSE(test_y, pred)
    print(f"MSE : {mse}") 

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.5993374042116644


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.5566511014432637


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.5695938448414936


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.6484847049546669


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.6328475406123292


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.6771308866682039


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.6319547735996011


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.519745091898487


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.5568559526796997


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

No file for path : ./ML_hw2/學生的training_data/B/17871372_B_2514_20200703_094952_9.9.jpg
No file for path : ./ML_hw2/學生的training_data/B/17871372_B_7322_20200925_114302_8.8.jpg

data size: 
2026 2026
['./ML_hw2/學生的training_data/resize/00130747_A_3457_20200715_100727_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_3458_20200715_100736_7.5.jpg', './ML_hw2/學生的training_data/resize/00130747_A_4810_20200812_112534_7.9.jpg']
[7.5, 7.5, 7.9]
<class 'list'> <class 'list'>



HBox(children=(FloatProgress(value=0.0, max=1620.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))


MSE : 0.6048103722101078


In [141]:
def testing_result_cnn_xgb(XGR , model_extraction , folder = './testing_data_part2/' , csv_path = './testing_data_part2.csv'): 
    """
    intro:
        讀取'./ML_hw2/學生的testing_data/'
        並將照片處理，拿原本模型預測後輸出文件
    aug:
        model
    result:
        ./testing_data_part2.csv
    """
    #############################################################################################################
    #                                   loading and resize testing picture                                      #
    #############################################################################################################
    testing_path = []
    testing_write = []
    resize_folder = f'{folder}resize/'
    if not os.path.isdir(resize_folder):
        os.makedirs(resize_folder)
    with open(csv_path, 'r', encoding='utf8') as f:   
        testing_write.append(f.readline())
        for line in f:
            clean_line = line.replace('\n', '').split(',')
            # [id, light, ground_truth, grade]
            testing_write.append(clean_line)
            curr_img_path = f'{folder}/{clean_line[0]}'
            new_img_path = f'{resize_folder}{clean_line[0]}'
            if not os.path.isfile(curr_img_path):
                print(curr_img_path)
                print('catch')
                continue
            if not os.path.isfile(new_img_path):
                img = cv2.imread(curr_img_path, cv2.IMREAD_COLOR)
                img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA)
                cv2.imwrite(new_img_path, img)
            testing_path.append(new_img_path)
    print('data size: ')
    print(f'testing數量：{len(testing_path)}')
    print(testing_path[:5])
    #############################################################################################################
    #                                               cnn extraction                                              #
    #############################################################################################################
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
    test_X = trainsform_data( testing_path , transform , model_cnn_extractor)
    
    #############################################################################################################
    #                                             xgboost prediction                                            #
    #############################################################################################################
    pred = XGR.predict(test_X) 
    #############################################################################################################
    #                                             output require csv                                            #
    #############################################################################################################
    with open('HW3_CNN_XGB_E24056954.csv', 'w', encoding='utf8') as wp:
        wp.write(testing_write[0])
        for pred_regression_,testing_write_ in zip(pred,testing_write[1:]):
            wp.write(f'{testing_write_[0]},{pred_regression_}\n')

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import xgboost as xg 
from sklearn.metrics import mean_squared_error as MSE 

model_cnn = torch.load('./result/HW3_model_ft_wide_resnet50_2/model.pt')
model_cnn_extractor = nn.Sequential(*list(model_cnn.children())[:-1]) # strips off last linear layer



def trainsform_data( address_set , transform , model):
    data = []
    for add in tqdm(address_set):
        img = cv2.imread(add, cv2.IMREAD_COLOR)
        img = transform(img).cuda()
        img = img.unsqueeze(0)
        with torch.no_grad(): 
            output=model(img)
        output = output.squeeze(0).squeeze(1).squeeze(1).tolist()
        data.append(output)
    return pd.DataFrame(data)


add_x , label_y = load_nail_csv(folder='./ML_hw2/學生的training_data/')

transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

train_X = trainsform_data( add_x , transform , model_cnn_extractor)
train_y = pd.DataFrame(label_y)
#     print(f'train_X len:{len(train_X)}')
#     print(f'train_y len:{len(train_y)}')
#     print(f'test_X len:{len(test_X)}')
#     print(f'test_y len:{len(test_y)}')

#     print(f'train_X type:{type(train_X)}')
#     print(f'train_y type:{type(train_y)}')
#     print(f'test_X type:{type(test_X)}')
#     print(f'test_y type:{type(test_y)}')

# Instantiation 
xgb_r = xg.XGBRegressor(objective ='reg:linear', 
                  n_estimators = 100, seed = 50) 

# Fitting the model 
xgb_r.fit(train_X, train_y) 

In [142]:
testing_result_cnn_xgb(xgb_r , model_cnn_extractor)

data size: 
testing數量：534
['./testing_data_part2/resize/wslOYFyS.jpg', './testing_data_part2/resize/mjEZWfu2.jpg', './testing_data_part2/resize/0d2zTSnx.jpg', './testing_data_part2/resize/cvzGTNxF.jpg', './testing_data_part2/resize/DYqR4jzb.jpg']


HBox(children=(FloatProgress(value=0.0, max=534.0), HTML(value='')))


