### 结构化数据

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import torch 
import torchkeras 
from torch import nn 
from torch.utils.data import Dataset,DataLoader,TensorDataset


In [None]:
dl_train = DataLoader(TensorDataset(torch.tensor(x_train).float(),torch.tensor(y_train).float()),
                     shuffle = True, batch_size = 8)
dl_val = DataLoader(TensorDataset(torch.tensor(x_test).float(),torch.tensor(y_test).float()),
                     shuffle = False, batch_size = 8)


#### 训练数据

In [None]:
import os,sys,time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm  #时间进度条

import torch
import torch.nn as nn
import copy as deepcopy
from torchkeras.metrics import Accuracy

In [None]:
def printlog(info):
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('\n'+'========'*8 + '%s'%nowtime)
    print(str(info)+'\n')

In [None]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
metrics_dict = {'acc':Accuracy()}

In [None]:
epochs = 20
ckpt_path = 'checkpoint.pt'

In [None]:
#early stopping相关设置
monitor = 'val_acc'
patience = 5
mode = 'max'

history = {}

In [None]:
for epoch in range(epochs):
    printlog("Epoch{0}/{1}".format(epoch+1,epochs))
    
    #train
    model.train()
    
    total_loss,step = 0,0
    loop = tqdm(enumerate(dl_train), total=len(dl_train))
    train_metrics_dict = deepcopy(metrics_dict) #deepcopy深度复制一个对象
    
    for i,data in loop:
        inputs,labels = data
        #forward
        preds = model(inputs)
        loss = criterion(preds,labels)
        #backward
        optimizer.zero_grad()
        loss.backward()
        #update
        optimizer.step()
        
        #metrics
        step_metrics = {"train_"+name: metric_fn(preds,labels).item()
                       for name,metric_fn in train_metrics_dict.item()}
        
        step_log = dict({"train_loss":loss.item()}, **step_metrics) #**表示传入字典
        
        total_loss += loss.item()
        
        step += 1
        if i!=len(dl_train)-1:
            loop.set_postfix(**step_log)  #set_postfix动态进度条
        else:
            epoch_loss = total_loss/step
            epoch_metrics = {"train_"+name:metric_fn.compute().items()
                            for name,metric_fn in train_metrics_dict.items()}
            epoch_log = dict({"train_loss":epoch_loss},**epoch_metrics)
            loop.set_postfix(**epoch_log)
            
            for name,metric_fn in train_metrics_dict.items():
                metric_fn.reset() #用完恢复默认
                
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]
        
    #validation
    model.eval()
    
    total_loss,step = 0,0
    loop = tqdm(enumerate(dl_val), total = len(dl_val))
    
    val_metrics_dict = deepcopy(metrics_dict)
    
    with torch.no_grad():
        for i,data in loop:
            
            inputs,labels = data
            
            #forward
            preds = model(inputs)
            loss = criterion(preds,labels)
            
            #metrics
            step_metrics = {"val_"+name:metric_fn(preds,labels).items()
                           for name,metric_fn in val_metrics_dict.items()}
            step_log = dict({"val_loss":loss.items()},**step_metrics)
            
            total_loss += loss.item()
            step += 1
            if i!=len(dl_val)-1:
                loop.set_postfix(**step_log)
            else:
                epoch_loss = total_loss/step
                epoch_metrics = {"val_"+name:metric_fn.compute().item() 
                                 for name,metric_fn in val_metrics_dict.items()}
                epoch_log = dict({"val_loss":epoch_loss},**epoch_metrics)
                loop.set_postfix(**epoch_log)

                for name,metric_fn in val_metrics_dict.items():
                    metric_fn.reset()
    epoch_log["epoch"] = epoch
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]
        
    #early_stopping
    arr_scores = history[monitor]
    best_score_idx = np.argmax(arr_scores) if mod=='max' else np.argmin(arr_scores)
    if best_score_idx == len(arr_scores)-1:
        torch.save(model.state_dict(),ckpt_path)
        print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
             arr_scores[best_score_idx]),file=sys.stderr)
    if len(arr_scores)-best_score_idx > patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor,patience),file=sys.stderr)
        break
    model.load_state_dict(torch.load(ckpt_path))
    
dfhistory = pd.DataFrame(history)
            
            
            
            
            
            

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import matplotlib.pyplot as plt

def plot_metric(dfhistory, metric):
    train_metrics = dfhistory["train_"+metric]
    val_metrics = dfhistory['val_'+metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation '+ metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_"+metric, 'val_'+metric])
    plt.show()

In [None]:
plot_metric(dfhistory,"loss")

In [None]:
plot_metric(dfhistory,"acc")

### 使用模型
通过调用pickle序列化方法实现的

In [None]:
y_pred = model(torch.tensor(x_test[0:10]).float())
y_pred_probs = torch.sigmoid(y_pred)

In [None]:
y_preds = torch.where(y_pred_probs>0.5,  #可以修改阈值
                     torch.ones_like(y_pred_probs),torch.zeros_like(y_pred_probs))
y_preds

### 保存模型

#### 1、保存模型参数（推荐）

In [None]:
print(model.state_dict().keys())

In [None]:
#保存模型参数
torch.save(model.state_dict(),"model_parameter.pt")
model_clone = create_model()
model_loaded = torch.load("model_parameter.pt")
model_clone.load_state_dict(model_loaded)

torch.sigmoid(model_clone.forward(torch.tensor(x_test[0:10]).float()))


#### 2、保存完整模型（不推荐）

In [None]:
torch.save(model, "mymodel.pt")
model_loaded = torch.load("mymodel.pt")
torch.sigmoid(model_loaded(torch.tensor(x_test[0:10]).float())).data

### 图片数据

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms as T
from torchvision import datasets

In [None]:
transform_img = T.Compose([T.ToTensor()])

def transform_label(x):
    return torch.tensor([x]).float()

In [None]:
ds_train = datasets.ImageFolder("./eat_pytorch_datasets/cifar2/train/",
                               transform = transform_img,target_transform = transform_label)

ds_val = datasets.ImageFolder("./eat_pytorch_datasets/cifar2/test/",
                             transform = transform_img,target_transform = transform_label)
print(ds_train.class_to_idx)

In [None]:
dl_train = DataLoader(ds_train,
                      batch_size=50,
                     shuffle=True)
dl_val  = DataLoader(ds_val,
                    batch_size=50,
                    shuffle=False)

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
plt.figure(figsize=(8,8))
for i in range(9):
    img,label = ds_train[i]
    img = img.permute(1,2,0)
    ax = plt.subplot(3,3,i+1)
    ax.imshow(img.numpy())
    ax.set_title("label = %d"%label.item())
    ax.set_xticks([])
    ax.set_yticks([])
    
plt.show

In [None]:
for features,labels in dl_train:
    print(features.shape,label.shape)
    break

In [None]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3,out_channels=32,kernel_size=3)
        self.pool = nn.MaxPool2d(kernel_size=2,stride=2)
        self.conv2 = nn.Conv2d(in_channels=32,out_channels=64,kernel_size=5)
        self.dropout = nn.Dropout2d(p=0.2)
        self.adaptive_pool = nn.AdaptiveMaxPool2d((1,1))
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(64,32)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(32,1)
        
    def forward(self,x):
        x = self.conv1(x)
        x = self.pool(x)
        x = self.con2v(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = self.adaptive_pool(x)
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x
    
model = Net()

        

### 文本数据

文本数据预处理较为繁琐，包括文本切词，构建词典，编码转换，序列填充，构建数据管道等等。

In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset,DataLoader,TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


In [16]:
minfreq = 30 #仅考虑词频超过30的词
maxlen = 200  #每个样本保留200个词的长度
batch_size = 20

In [7]:
df_train = pd.read_csv("/Users/hwangsheep/PycharmProjects/torch_test/starting/eat_pytorch_datasets 2/imdb/train.tsv",
                       sep="\t",
                       header = None,
                       names = ["label","text"])
df_val = pd.read_csv("/Users/hwangsheep/PycharmProjects/torch_test/starting/eat_pytorch_datasets 2/imdb/test.tsv",
                       sep="\t",
                       header = None,
                       names = ["label","text"])

In [6]:
print(df_train)

       label                                               text
0          0  It really boggles my mind when someone comes a...
1          0  Mary Pickford becomes the chieftain of a Scott...
2          0  Well, at least my theater group did, lol. So o...
3          1  I must give How She Move a near-perfect rating...
4          0  I must say, when I read the storyline on the b...
...      ...                                                ...
19995      1  Simple, meaningful and delivers an emotional p...
19996      1  I'm fan of ART, I like anything about Art, I l...
19997      0  Despite being a sequel to the more potent orig...
19998      0  Also known in a different form as "House of Ex...
19999      0  This has the absolute worst performance from R...

[20000 rows x 2 columns]


In [8]:
#文本切词
tokenizer = get_tokenizer('basic_english')

In [9]:
#构建词典
pad_idx,unk_idx = 0,1
special_symbols = ['<pad>','<unk']

#yield就是return返回一个值，并且记住这个返回的位置，下次迭代就从这个位置后(下一行)开始
def yield_tokens(dfdata):  
    for text in dfdata['text']:
        yield tokenizer(text)

In [10]:
yield_tokens(df_train)

<generator object yield_tokens at 0x7fc53800e740>

In [17]:
#vocab对象，将每个词映射成对应数字

vocab = build_vocab_from_iterator(
    yield_tokens(df_train),
    min_freq = minfreq,
    specials = special_symbols,
    special_first=True)

#text_pipeline = lambda x : vocab(tokenizer(x))

vocab.set_default_index(unk_idx)
vocab_size = len(vocab)
print("vocab_size = "+str(vocab_size))

vocab_size = 8813


In [20]:
#查看词典前20个词
#itos:index to string 查看字典(列表形式)

#stoi:string to index 查看词典(字典形式)
print("vocab.get_itos():\n",vocab.get_itos()[:20])
print("vocab.get_stoi()['<pad>']:\n",vocab.get_stoi()['<pad>'])

vocab.get_itos():
 ['<pad>', '<unk', 'the', '.', ',', 'and', 'a', 'of', 'to', "'", 'is', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for']
vocab.get_stoi()['<pad>']:
 0


In [24]:
#序列填充
def pad(seq,max_length,pad_value=0):
    n = len(seq)
    result = seq + [pad_value]*(max_length-n)
    return result

In [25]:
#编码转换
def text_pipeline(text):
    words = tokenizer(text)
    tokens = vocab(words)
    result = pad(tokens,maxlen,pad_idx)
    return result

print(len(text_pipeline('this is an example')))

200


In [None]:
#构建管道
class ImdbDataset(Dataset):
    def __init__(self,df):
        self.df = df
    def __len__(self):
        return len(self.df)
    def __getitem__(self,index):
        text = self.df["text"].iloc[index]
        tokens = torch.tensor(text_pipeline(text)).int()
        label = torch.tensor([self.df['label'].iloc[index]]).float()
        return tokens,label
    
ds_train = ImdbDataset(df_train)
ds_val = ImdbDataset(df_val)

In [None]:
dl_train = DataLoader(ds_train,
                     batch_size=50,
                     shuffle=True)
dl_val = DataLoader(ds_val,
                   batch_size=50,
                   shuffle=True)

In [26]:
import torch
from torch import nn 
torch.manual_seed(42)

<torch._C.Generator at 0x7fc561a60970>

In [None]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net,self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings = vocab_size,embedding_dim=3,padding_idx=0)
        
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels=3,out_channels=16,kernel_size=5),
            nn.MaxPool1d(kernel_size=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=16,out_channels=128,kernel_size=2),
            nn.MaxPool1d(kernel_size=2),
            nn.ReLU()
        )
        
        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(6144,1)
        )
    
    def forward(self,x):
        x = self.embedding(x).transpose(1,2)
        x = self.conv(x)
        y = self.dense(x)
        return y
    
net = Net()


In [None]:
from torchkeras import summary
summary(net,input_shape=(3,32,32))