<a href="https://colab.research.google.com/github/karlmaji/pytorch_learning/blob/master/%E4%BD%BF%E7%94%A8pytoch_lightning%E5%81%9Akaggle%E4%B8%AD%E7%9A%84%E6%89%8B%E5%86%99%E6%95%B0%E5%AD%97%E8%AF%86%E5%88%AB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install einops
!pip install pytorch-lightning

In [None]:
%config Completer.use_jedi = False # used on Kaggle so that function/tab-completion hints work

In [None]:
from einops import rearrange,reduce #爱因斯坦标识库 可以非常方便地对tensor做reshape
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms
from torch.utils.data import random_split
import pytorch_lightning as pl
# Random seeds must be fixed so that a training run can be reproduced
def set_seed(seed):
    """Seed the NumPy and PyTorch random number generators with ``seed``.

    When CUDA is available the CUDA generators are seeded as well.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    # GPU operations have their own seeds, separate from the CPU generator
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)
# cudnn.benchmark = True makes cuDNN time the candidate convolution kernels and keep the
# fastest one. When the model structure barely changes this speeds training up
# (slow at the start, faster afterwards).
#
# cudnn.deterministic = True makes cuDNN return the same (default) convolution algorithm
# every time. Together with fixed random seeds, the same input should then give the same
# output on every run.
#
# The attribute was previously misspelled "determinstic", which silently created an
# unused attribute and left deterministic mode switched off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Pick the accelerator: the first CUDA device when one is available, otherwise the CPU
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0")
print("Using device", device)

# pytorch训练第一步，构建dataset,dataloader。
> 1.继承 `torch.utils.data` 中的 `Dataset` 类：`__init__` 载入数据或数据路径，`__len__` 返回样本个数，`__getitem__` 根据索引返回一条实际数据，供后续的 `DataLoader` 使用  

> 2.可以使用torch.utils.data 中的 random_split函数对dataset进行分割  

> 3.使用torch.utils.data 中的DataLoader 构建DataLoader 使用x,y = next(iter(dataloader)) 查看一个batch的数据

In [None]:
class mydataset(Dataset):
    """In-memory dataset built from a digit-recognizer style CSV file.

    The CSV must contain a ``label`` column; all remaining columns are read as the
    pixels of one flattened square image per row.

    Args:
        data_dir: Path of the CSV file to load (a file path, despite the name).
        transpose: Optional callable applied to every image tensor returned by
            ``__getitem__``. ``None`` (the default) returns the stored tensor as is.
            NOTE(review): the name looks like a typo for "transform"; this argument
            used to be accepted and silently ignored - confirm the intended meaning.
    """
    def __init__(self, data_dir, transpose=None):
        data = pd.read_csv(data_dir)

        # Every column except the label is a pixel value
        x = data.drop(columns='label')
        x = torch.from_numpy(np.array(x)).to(torch.float)
        bs, n = x.shape
        # Rows are flattened square images, so the side length is sqrt(number of pixels)
        w = h = int(np.sqrt(n))
        # (bs, w*h) -> (bs, 1, w, h): add the single channel axis expected by Conv2d
        self.x = rearrange(x, 'bs (w h) -> bs 1 w h', w=w, h=h)

        y = data['label']
        self.y = torch.from_numpy(np.array(y))

        self.transpose = transpose

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at ``idx``."""
        image = self.x[idx]
        if self.transpose is not None:
            image = self.transpose(image)
        return image, self.y[idx]
    

# Load the labelled training CSV, then hold out 30% of it for validation
dataset = mydataset('/kaggle/input/digit-recognizer/train.csv', None)
n_samples = len(dataset)
split_size = int(n_samples * 0.7)
train_dataset, val_dataset = random_split(dataset, [split_size, n_samples - split_size])

# Only the training loader is shuffled; validation keeps a fixed order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)



# 第二步 使用pytorch-lightning 构建网络模型
> why? pytorch-lightning提供了非常丰富的API，可以简化复杂的网络调参及训练过程，例如使用callback方法在训练中保存最优模型，根据数据集的特征修正网络模型中的层数和每层神经元数量等超参数

# use pytorch-lightning to build model

In [None]:
# Unlike a plain PyTorch model, which subclasses nn.Module, a pytorch-lightning model subclasses pl.LightningModule
class ConvModel(pl.LightningModule):
    """Four unpadded conv layers followed by a three-layer MLP, for 28x28 digit images.

    Args:
        in_channels: Number of channels of the input images. Each conv layer doubles
            the channel count, so the last one emits ``16 * in_channels`` feature maps.
    """
    # __init__ and forward are written exactly as they would be for an nn.Module
    def __init__(self, in_channels):
        # Zero-argument super() does not look the class up by name, so this keeps
        # working even if the name ConvModel is rebound later in the notebook.
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=2 * in_channels, kernel_size=(7, 7))
        self.conv2 = nn.Conv2d(in_channels=2 * in_channels, out_channels=4 * in_channels, kernel_size=(3, 3))
        self.conv3 = nn.Conv2d(in_channels=4 * in_channels, out_channels=8 * in_channels, kernel_size=(3, 3))
        self.conv4 = nn.Conv2d(in_channels=8 * in_channels, out_channels=16 * in_channels, kernel_size=(3, 3))
        # The unpadded 7x7, 3x3, 3x3, 3x3 convolutions shrink a 28x28 input to
        # 22 -> 20 -> 18 -> 16 pixels per side, so the flattened feature vector holds
        # (16 * in_channels) * 16 * 16 values. That is 4096 for in_channels=1; the size
        # used to be hard-coded to 4096, which failed for any other channel count.
        self.L1 = nn.Linear(in_features=16 * in_channels * 16 * 16, out_features=1024)

        self.L2 = nn.Linear(in_features=1024, out_features=128)
        self.L3 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Map a ``(batch, in_channels, 28, 28)`` tensor to ``(batch, 10)`` class logits."""
        bs, c, h, w = x.shape
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        # Flatten everything except the batch axis before the fully connected layers
        x = x.reshape(bs, -1)
        x = F.relu(self.L1(x))
        x = F.relu(self.L2(x))
        x = self.L3(x)
        return x

    # The optimizer is built inside the module (several optimizers may be returned)
    def configure_optimizers(self):
        """Return the Adam optimizer used for training."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    # Training logic for one batch: receives batch and batch_idx and must return the loss
    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the cross-entropy loss of one batch."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("train_loss", loss)
        return loss

    # Validation logic for one batch: same shape as training_step, a return value is optional
    # Values to be monitored are reported with self.log(); val_loss is reported here
    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and return ``(correct predictions, batch size)`` for one batch."""
        x, y = batch
        num = x.shape[0]
        logits = self(x)
        self.log("val_loss", F.cross_entropy(logits, y))
        y_pre = logits.argmax(dim=-1)
        correct = torch.eq(y_pre, y).sum().item()
        return correct, num

    # Called at the end of every validation epoch; val_step_outputs is the list of the
    # values returned by validation_step, i.e. one (correct, num) tuple per batch.
    # NOTE: this is a pytorch-lightning 1.x hook; Lightning 2.0 removed it in favour of
    # on_validation_epoch_end.
    def validation_epoch_end(self, val_step_outputs):
        """Aggregate the per-batch counts into the epoch accuracy and log it as ``val_acc``."""
        correct_sum = 0
        num = 0
        for correct, n in val_step_outputs:
            correct_sum += correct
            num += n
        val_acc = correct_sum / num
        # Report val_acc so that callbacks can monitor it
        self.log("val_acc", val_acc)


In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint
# ModelCheckpoint (from callbacks) saves checkpoints while training runs.
# It monitors val_acc here and keeps the 4 checkpoints with the highest value.
# mode='max' is needed for that: the callback's default is mode='min', which would keep
# the 4 checkpoints with the LOWEST accuracy instead.
checkpointCallback = ModelCheckpoint(dirpath='./',
                                     filename='{epoch}-{val_acc:.2f}-{val_loss:.2f}',
                                     monitor='val_acc',
                                     mode='max',
                                     save_top_k=4)
# Instantiate the model
model = ConvModel(1)
# Instantiate the trainer; use one GPU when available and fall back to the CPU otherwise
# (a hard-coded gpus=1 fails on machines without a GPU)
trainer = pl.Trainer(accelerator="gpu" if torch.cuda.is_available() else "cpu",
                     devices=1,
                     callbacks=[checkpointCallback])
# Feed the data
trainer.fit(model, train_dataloader, val_dataloader)

---
# 以上为训练过程

In [None]:
# Training finished: load the checkpoint with the best val_acc for testing.
# load_from_checkpoint is a classmethod that builds and returns a NEW model, so the
# result has to be assigned; calling it and discarding the return value left `model`
# holding whatever weights it had in memory.
# NOTE: the file name below comes from one particular run - point it at a checkpoint
# that actually exists in ./ .
model = ConvModel.load_from_checkpoint("./epoch=31-val_acc=0.97-val_loss=0.22.ckpt", in_channels=1)



In [None]:
# Prediction helper (per the official docs this could also be a method of the model)
def predict(model, test_dir, output_dir, batch_size=256):
    """Classify every row of a test CSV and write a Kaggle submission file.

    Args:
        model: Module mapping a ``(batch, 1, w, h)`` float tensor to class logits.
        test_dir: Path of the test CSV (a file path, despite the name); every column
            is a pixel and every row one flattened square image.
        output_dir: Path of the CSV file to write (a file path, despite the name).
        batch_size: Number of images per forward pass.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    The output has the header ``ImageId,Label`` followed by one ``<1-based index>,<label>``
    line per input row. The model is moved to the notebook-level ``device``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    test_data = pd.read_csv(test_dir)
    x = torch.from_numpy(np.array(test_data)).to(torch.float)
    bs, n = x.shape
    w = h = int(np.sqrt(n))
    x = rearrange(x, 'bs (w h) -> bs 1 w h', w=w, h=h)

    model = model.to(device)
    was_training = model.training
    # Inference only: switch to eval mode and skip building autograd graphs
    model.eval()
    labels = []
    try:
        with torch.no_grad():
            # Run whole batches instead of one forward pass per image
            for start in range(0, bs, batch_size):
                batch = x[start:start + batch_size].to(device)
                logits = model(batch)
                labels.extend(logits.argmax(dim=-1).tolist())
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)

    with open(output_dir, 'w') as f:
        f.write('ImageId,Label\n')
        # Kaggle's ImageId column is 1-based
        for image_id, y in enumerate(labels, start=1):
            f.write(f'{image_id},{y}\n')
            
# Predict on the test set and write the result in the submission file format
predict(
    model,
    test_dir="../input/digit-recognizer/test.csv",
    output_dir="/kaggle/working/result.csv",
)

---
# 下面是使用传统的方法构建模型和训练过程

In [None]:
class ConvModel(nn.Module):
    """Four unpadded conv layers followed by a two-layer MLP, for 28x28 digit images.

    Args:
        in_channels: Number of channels of the input images. Each conv layer doubles
            the channel count, so the last one emits ``16 * in_channels`` feature maps.
    """
    def __init__(self, in_channels):
        # Zero-argument super() does not look the class up by name, so this keeps
        # working even if the name ConvModel is rebound elsewhere in the notebook.
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=2 * in_channels, kernel_size=(7, 7))
        self.conv2 = nn.Conv2d(in_channels=2 * in_channels, out_channels=4 * in_channels, kernel_size=(3, 3))
        self.conv3 = nn.Conv2d(in_channels=4 * in_channels, out_channels=8 * in_channels, kernel_size=(3, 3))
        self.conv4 = nn.Conv2d(in_channels=8 * in_channels, out_channels=16 * in_channels, kernel_size=(3, 3))
        # The unpadded 7x7, 3x3, 3x3, 3x3 convolutions shrink a 28x28 input to
        # 22 -> 20 -> 18 -> 16 pixels per side, so the flattened feature vector holds
        # (16 * in_channels) * 16 * 16 values. That is 4096 for in_channels=1; the size
        # used to be hard-coded to 4096, which failed for any other channel count.
        self.L1 = nn.Linear(in_features=16 * in_channels * 16 * 16, out_features=512)
        self.L2 = nn.Linear(in_features=512, out_features=10)

    def forward(self, x):
        """Map a ``(batch, in_channels, 28, 28)`` tensor to ``(batch, 10)`` class logits."""
        bs, c, h, w = x.shape
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        # Flatten everything except the batch axis before the fully connected layers
        x = x.reshape(bs, -1)
        x = F.relu(self.L1(x))
        x = self.L2(x)
        return x
    
        
model = ConvModel(1)

# Total number of scalar parameters (weights and biases) across all layers
parameters_num = sum(m.numel() for x, m in model.named_parameters())

print(parameters_num)

In [None]:
# Adam over all model parameters, learning rate 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [None]:
def train(model, train_dataloader, val_dataloader, epoch, optimizer, save_dir):
    """Train ``model`` with cross-entropy loss, validating and checkpointing every epoch.

    Args:
        model: Module mapping an input batch to class logits.
        train_dataloader: Yields ``(x, y)`` training batches.
        val_dataloader: Yields ``(x, y)`` validation batches; ``len(val_dataloader.dataset)``
            is used as the accuracy denominator.
        epoch: Number of epochs to run.
        optimizer: Optimizer already bound to the model's parameters.
        save_dir: Existing directory that receives one ``epoch-<i>.ckpt`` file per epoch.

    Each checkpoint stores the model weights, the optimizer state, every training loss
    recorded so far and the zero-based index of the epoch that just finished. The model
    is moved to the notebook-level ``device`` and is left in training mode.
    """
    model = model.to(device)
    loss_ls = []
    model.train()
    for i in range(epoch):

        for batch, (x, y) in enumerate(train_dataloader):
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = F.cross_entropy(logits, y)
            loss_value = loss.item()
            loss_ls.append(loss_value)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Report progress every 50 batches
            if batch % 50 == 0:
                print(f"------------train_loss: {loss_value:>7f}------------")

        # Validation: eval mode and no autograd graph, since nothing is back-propagated
        correct = 0
        model.eval()
        with torch.no_grad():
            for x, y in val_dataloader:
                x, y = x.to(device), y.to(device)
                logits = model(x)
                y_pre = logits.argmax(dim=-1)
                correct += torch.eq(y_pre, y).sum().item()
        # Back to training mode for the next epoch (and for the caller)
        model.train()

        correct = correct / len(val_dataloader.dataset)
        print(f"corect is {correct:2f}")

        torch.save({
            "model_weight": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "loss": loss_ls,
            # Index of the epoch just completed (this used to store the total epoch
            # count, making every checkpoint report the same value)
            "epoch": i
        }, f'{save_dir}/epoch-{i}.ckpt')
        
        
        
    

In [None]:
def predict(model, test_dir, output_dir, batch_size=256):
    """Classify every row of a test CSV and write a Kaggle submission file.

    Args:
        model: Module mapping a ``(batch, 1, w, h)`` float tensor to class logits.
        test_dir: Path of the test CSV (a file path, despite the name); every column
            is a pixel and every row one flattened square image.
        output_dir: Path of the CSV file to write (a file path, despite the name).
        batch_size: Number of images per forward pass.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    The output has the header ``ImageId,Label`` followed by one ``<1-based index>,<label>``
    line per input row. The model is moved to the notebook-level ``device``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    test_data = pd.read_csv(test_dir)
    x = torch.from_numpy(np.array(test_data)).to(torch.float)
    bs, n = x.shape
    w = h = int(np.sqrt(n))
    x = rearrange(x, 'bs (w h) -> bs 1 w h', w=w, h=h)

    model = model.to(device)
    was_training = model.training
    # Inference only: switch to eval mode and skip building autograd graphs
    model.eval()
    labels = []
    try:
        with torch.no_grad():
            # Run whole batches instead of one forward pass per image
            for start in range(0, bs, batch_size):
                batch = x[start:start + batch_size].to(device)
                logits = model(batch)
                labels.extend(logits.argmax(dim=-1).tolist())
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)

    with open(output_dir, 'w') as f:
        f.write('ImageId,Label\n')
        # Kaggle's ImageId column is 1-based
        for image_id, y in enumerate(labels, start=1):
            f.write(f'{image_id},{y}\n')

In [None]:
# Train the plain-PyTorch model for 30 epochs, writing one checkpoint per epoch to /kaggle/working
train(
    model,
    train_dataloader,
    val_dataloader,
    epoch=30,
    optimizer=optimizer,
    save_dir="/kaggle/working",
)

In [None]:
# Predict on the test set with the trained model and write the submission file
predict(
    model,
    test_dir="../input/digit-recognizer/test.csv",
    output_dir="/kaggle/working/result.csv",
)

In [None]:
# Reload the test CSV and reshape every row into a 1-channel square image for a visual check
test_data = pd.read_csv("../input/digit-recognizer/test.csv")
x = torch.from_numpy(np.array(test_data)).float()
bs, n = x.shape
w = h = int(np.sqrt(n))
x = rearrange(x, 'bs (w h) -> bs 1 w h', w=w, h=h)

# Display the first test image (sample 0, channel 0)
plt.imshow(x[0, 0].numpy())

In [None]:
# Display the test image at index 5 (channel 0)
plt.imshow(x[5, 0].numpy())

In [None]:
# Same visual check, this time keeping the raw integer pixel values (no float conversion)
test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

x = torch.from_numpy(np.array(test))
bs, n = x.shape
h = w = int(np.sqrt(n))
x = rearrange(x, 'bs (w h) -> bs 1 w h', w=w, h=h)

# Display the test image at index 5 (channel 0)
plt.imshow(x[5, 0].numpy())