In [1]:
import numpy as np 
import torch 
from torch.utils.data import TensorDataset,Dataset, DataLoader, random_split

In [3]:
from sklearn import datasets


iris = datasets.load_iris()
ds_iris = TensorDataset(torch.tensor(iris.data),torch.tensor(iris.target))

nb_train = int(0.8 * len(ds_iris))
nb_valid = len(ds_iris) - nb_train

ds_train, ds_valid = random_split(ds_iris, [nb_train, nb_valid])

print(type(ds_iris))
print(type(ds_train))

<class 'torch.utils.data.dataset.TensorDataset'>
<class 'torch.utils.data.dataset.Subset'>


In [4]:
dl_train = DataLoader(ds_train, batch_size=8)
dl_valid = DataLoader(ds_valid, batch_size=8)

for features, labels in dl_train:
    print(features, labels)
    break

tensor([[4.6000, 3.2000, 1.4000, 0.2000],
        [6.0000, 3.4000, 4.5000, 1.6000],
        [6.2000, 2.2000, 4.5000, 1.5000],
        [4.6000, 3.4000, 1.4000, 0.3000],
        [6.7000, 3.0000, 5.2000, 2.3000],
        [6.5000, 3.0000, 5.5000, 1.8000],
        [7.7000, 2.8000, 6.7000, 2.0000],
        [5.7000, 4.4000, 1.5000, 0.4000]], dtype=torch.float64) tensor([0, 1, 1, 0, 2, 2, 2, 0], dtype=torch.int32)


In [5]:
ds_data = ds_train + ds_valid

print('len(ds_train) =', len(ds_train))
print('len(ds_valid) =', len(ds_valid))
print('len(ds_train+ds_valid) =', len(ds_data))

print(type(ds_data))

len(ds_train) =  120
len(ds_valid) =  30
len(ds_train+ds_valid) =  150
<class 'torch.utils.data.dataset.ConcatDataset'>


In [10]:
# 自定义数据结构
import numpy as np 
import pandas as pd 
from collections import OrderedDict
import re, string

MAX_WORDS = 10000  # 仅考虑最高频的10000个词
MAX_LEN = 200  # 每个样本保留200个词的长度
BATCH_SIZE = 20 

train_data_path = '../data/imdb/train.tsv'
test_data_path = '../data/imdb/test.tsv'
train_token_path = '../data/imdb/train_token.tsv'
test_token_path = '../data/imdb/test_token.tsv'
train_samples_path = '../data/imdb/train_samples/'
test_samples_path = '../data/imdb/test_samples/'

In [11]:
word_count_dict = dict()

def clean_text(text):
    lowercase = text.lower().replace("\n"," ")
    stripped_html = re.sub('<br />', ' ',lowercase)
    cleaned_punctuation = re.sub('[%s]'%re.escape(string.punctuation),'',stripped_html)
    return cleaned_punctuation

with open(train_data_path,"r",encoding = 'utf-8') as f:
    for line in f:
        label,text = line.split("\t")
        cleaned_text = clean_text(text)
        for word in cleaned_text.split(" "):
            word_count_dict[word] = word_count_dict.get(word,0)+1 

df_word_dict = pd.DataFrame(pd.Series(word_count_dict,name = "count"))
df_word_dict = df_word_dict.sort_values(by = "count",ascending =False)

df_word_dict = df_word_dict[0:MAX_WORDS-2] #  
df_word_dict["word_id"] = range(2,MAX_WORDS) #编号0和1分别留给未知词<unkown>和填充<padding>

word_id_dict = df_word_dict["word_id"].to_dict()

df_word_dict.head(10)

Unnamed: 0,count,word_id
the,268230,2
and,129713,3
a,129479,4
of,116497,5
to,108296,6
is,85615,7
,84074,8
in,74715,9
it,62587,10
i,60837,11


In [12]:
def pad(data_list, pad_length):
    padded_list = data_list.copy()
    if len(data_list)> pad_length:
         padded_list = data_list[-pad_length:]
    if len(data_list)< pad_length:
         padded_list = [1]*(pad_length-len(data_list))+data_list
    return padded_list

def text_to_token(text_file, token_file):
    with open(text_file,"r",encoding = 'utf-8') as fin,\
      open(token_file,"w",encoding = 'utf-8') as fout:
        for line in fin:
            label,text = line.split("\t")
            cleaned_text = clean_text(text)
            word_token_list = [word_id_dict.get(word, 0) for word in cleaned_text.split(" ")]
            pad_list = pad(word_token_list,MAX_LEN)
            out_line = label+"\t"+" ".join([str(x) for x in pad_list])
            fout.write(out_line+"\n")
        
text_to_token(train_data_path, train_token_path)
text_to_token(test_data_path, test_token_path)

In [13]:
import os

if not os.path.exists(train_samples_path):
    os.mkdir(train_samples_path)
    
if not os.path.exists(test_samples_path):
    os.mkdir(test_samples_path)
    
    
def split_samples(token_path, samples_dir):
    with open(token_path, "r", encoding='utf-8') as fin:
        i = 0
        for line in fin:
            with open(samples_dir + "%d.txt" % i, "w", encoding="utf-8") as fout:
                fout.write(line)
            i = i+1

split_samples(train_token_path, train_samples_path)
split_samples(test_token_path, test_samples_path)

In [14]:
print(os.listdir(train_samples_path)[0:100])

['0.txt', '1.txt', '10.txt', '100.txt', '1000.txt', '10000.txt', '10001.txt', '10002.txt', '10003.txt', '10004.txt', '10005.txt', '10006.txt', '10007.txt', '10008.txt', '10009.txt', '1001.txt', '10010.txt', '10011.txt', '10012.txt', '10013.txt', '10014.txt', '10015.txt', '10016.txt', '10017.txt', '10018.txt', '10019.txt', '1002.txt', '10020.txt', '10021.txt', '10022.txt', '10023.txt', '10024.txt', '10025.txt', '10026.txt', '10027.txt', '10028.txt', '10029.txt', '1003.txt', '10030.txt', '10031.txt', '10032.txt', '10033.txt', '10034.txt', '10035.txt', '10036.txt', '10037.txt', '10038.txt', '10039.txt', '1004.txt', '10040.txt', '10041.txt', '10042.txt', '10043.txt', '10044.txt', '10045.txt', '10046.txt', '10047.txt', '10048.txt', '10049.txt', '1005.txt', '10050.txt', '10051.txt', '10052.txt', '10053.txt', '10054.txt', '10055.txt', '10056.txt', '10057.txt', '10058.txt', '10059.txt', '1006.txt', '10060.txt', '10061.txt', '10062.txt', '10063.txt', '10064.txt', '10065.txt', '10066.txt', '1006

In [15]:
import os


class imdbDataset(Dataset):
    def __init__(self, samples_dir):
        self.samples_dir = samples_dir
        self.samples_paths = os.listdir(samples_dir)
    
    def __len__(self):
        return len(self.samples_paths)
    
    def __getitem__(self, index):
        path = self.samples_dir + self.samples_paths[index]
        with open(path, "r", encoding = "utf-8") as f:
            line = f.readline()
            label,tokens = line.split("\t")
            label = torch.tensor([float(label)], dtype=torch.float)
            feature = torch.tensor([int(x) for x in tokens.split(" ")], dtype=torch.long)
            return (feature, label)

In [16]:
ds_train = imdbDataset(train_samples_path)
ds_test = imdbDataset(test_samples_path)

In [17]:
print(len(ds_train))
print(len(ds_test))

20000
5000


In [18]:
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True)
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE)

for features, labels in dl_train:
    print(features)
    print(labels)
    break

tensor([[ 461,  410,    2,  ...,    8,    8,    8],
        [   1,    1,    1,  ...,   56, 1544,    8],
        [   1,    1,    1,  ...,    2,  126,    8],
        ...,
        [   1,    1,    1,  ...,   10,  171,    8],
        [   1,    1,    1,  ..., 6415,  358,    8],
        [  54, 2538,   46,  ...,  710,   13,    8]])
tensor([[0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]])


In [19]:
import torch
from torch import nn 
import importlib 
from torchkeras import Model, summary


class Net(Model):
    def __init__(self):
        super(Net, self).__init__()
        
        #设置padding_idx参数后将在训练过程中将填充的token始终赋值为0向量
        self.embedding = nn.Embedding(num_embeddings=MAX_WORDS, embedding_dim=3, padding_idx=1)
        self.conv = nn.Sequential()
        self.conv.add_module("conv_1", nn.Conv1d(in_channels=3, out_channels=16, kernel_size=5))
        self.conv.add_module("pool_1", nn.MaxPool1d(kernel_size=2))
        self.conv.add_module("relu_1", nn.ReLU())
        self.conv.add_module("conv_2", nn.Conv1d(in_channels=16, out_channels=128, kernel_size=2))
        self.conv.add_module("pool_2", nn.MaxPool1d(kernel_size=2))
        self.conv.add_module("relu_2", nn.ReLU())
        
        self.dense = nn.Sequential()
        self.dense.add_module("flatten", nn.Flatten())
        self.dense.add_module("linear", nn.Linear(6144, 1))
        self.dense.add_module("sigmoid", nn.Sigmoid())
        
    def forward(self,x):
        x = self.embedding(x).transpose(1, 2)
        x = self.conv(x)
        y = self.dense(x)
        return y
        
model = Net()
print(model)

model.summary(input_shape=(200, ), input_dtype=torch.LongTensor)

Net(
  (embedding): Embedding(10000, 3, padding_idx=1)
  (conv): Sequential(
    (conv_1): Conv1d(3, 16, kernel_size=(5,), stride=(1,))
    (pool_1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_1): ReLU()
    (conv_2): Conv1d(16, 128, kernel_size=(2,), stride=(1,))
    (pool_2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_2): ReLU()
  )
  (dense): Sequential(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear): Linear(in_features=6144, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Embedding-1               [-1, 200, 3]          30,000
            Conv1d-2              [-1, 16, 196]             256
         MaxPool1d-3               [-1, 16, 98]               0
              ReLU-4               [-1, 16, 98]               0
            Conv1d-5      

In [21]:
def accuracy(y_pred, y_true):
    y_pred = torch.where(y_pred > 0.5, \
        torch.ones_like(y_pred, dtype=torch.float32), torch.zeros_like(y_pred, dtype=torch.float32))
    acc = torch.mean(1 - torch.abs(y_true - y_pred))
    return acc

model.compile(loss_func=nn.BCELoss(), optimizer=torch.optim.Adagrad(model.parameters(), lr=0.02), \
    metrics_dict={"accuracy": accuracy})

In [22]:
dfhistory = model.fit(10, dl_train, dl_val=dl_test, log_step_freq=200)

Start Training ...

{'step': 200, 'loss': 0.726, 'accuracy': 0.511}
{'step': 400, 'loss': 0.708, 'accuracy': 0.521}
{'step': 600, 'loss': 0.701, 'accuracy': 0.53}
{'step': 800, 'loss': 0.693, 'accuracy': 0.545}
{'step': 1000, 'loss': 0.686, 'accuracy': 0.558}

 +-------+-------+----------+----------+--------------+
| epoch |  loss | accuracy | val_loss | val_accuracy |
+-------+-------+----------+----------+--------------+
|   1   | 0.686 |  0.558   |  0.661   |    0.602     |
+-------+-------+----------+----------+--------------+

{'step': 200, 'loss': 0.619, 'accuracy': 0.655}
{'step': 400, 'loss': 0.607, 'accuracy': 0.667}
{'step': 600, 'loss': 0.599, 'accuracy': 0.673}
{'step': 800, 'loss': 0.594, 'accuracy': 0.678}
{'step': 1000, 'loss': 0.591, 'accuracy': 0.682}

 +-------+-------+----------+----------+--------------+
| epoch |  loss | accuracy | val_loss | val_accuracy |
+-------+-------+----------+----------+--------------+
|   2   | 0.591 |  0.682   |  0.575   |    0.705     |

In [23]:
dfhistory

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.68575,0.55795,0.660555,0.6016
1,0.591386,0.6821,0.574622,0.7048
2,0.520254,0.74205,0.564227,0.7172
3,0.469431,0.7767,0.510692,0.7566
4,0.429008,0.8013,0.503139,0.7586
5,0.394574,0.82295,0.497348,0.7686
6,0.366846,0.83865,0.481817,0.7794
7,0.342159,0.85185,0.474267,0.7842
8,0.321482,0.8623,0.473824,0.7904
9,0.302618,0.87335,0.468441,0.793
