<a href="https://colab.research.google.com/github/haokeliu/colaboratory/blob/master/DeepLearn_Notes/%E9%9D%A2%E5%90%91%E5%AF%B9%E8%B1%A1%E7%9A%84%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 面向对象的机器学习
本文中，我们将学习如何基于 PyTorch，通过合理地创建和使用类与函数来完成机器学习任务。后续的 notebooks 都会沿用这样的实现结构。
#### 概览
我们将会用到的类和函数总结如下：


*   词汇表 Vocabulary：用于原始输入和数字形式的相互转换。
*   向量器 Vectorizer：持有输入和输出各自的 Vocabulary 类实例，并为模型把数据向量化。


*   数据集 Dataset：借助 vectorizer 处理并拆分数据。
*   模型 Model：处理输入和返回预测值的 PyTorch模型。





In [2]:
# Install the PyTorch library
!pip3 install torch



In [0]:
import os
from argparse import Namespace
import collections
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import torch

In [0]:
# Seed the NumPy and PyTorch random number generators
def set_seeds(seed, cuda):
    """Seed NumPy and PyTorch with the same value.

    Args:
        seed: Seed value passed unchanged to ``np.random.seed`` and
            ``torch.manual_seed``.
        cuda: When truthy, ``torch.cuda.manual_seed_all`` is called with
            the same seed as well.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not cuda:
        # CPU-only run: nothing else to seed.
        return
    torch.cuda.manual_seed_all(seed)
# Create a directory, including any missing parent directories
def create_dirs(dirpath):
    """Ensure that the directory ``dirpath`` exists.

    Missing parent directories are created too. Calling this for a
    directory that already exists is a no-op.

    Args:
        dirpath: Path of the directory to create.

    Raises:
        FileExistsError: If ``dirpath`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and calling os.makedirs() afterwards.
    os.makedirs(dirpath, exist_ok=True)

In [5]:
# Experiment configuration
args = Namespace(
    # Reproducibility and hardware
    seed=1234,
    cuda=False,
    shuffle=True,
    # File names and output directory
    data_file="names.csv",
    split_data_file="split_names.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="names",
    # Train / validation / test proportions
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    # Training hyperparameters
    num_epochs=20,
    early_stopping_criteria=5,
    learning_rate=1e-3,
    batch_size=64,
    hidden_dim=300,
    dropout_p=0.1,
)

# Seed the random number generators
set_seeds(seed=args.seed, cuda=args.cuda)

# Make sure the output directory exists
create_dirs(args.save_dir)

# Place the vectorizer and model-state files inside the output directory
args.vectorizer_file, args.model_state_file = (
    os.path.join(args.save_dir, filename)
    for filename in (args.vectorizer_file, args.model_state_file)
)

# Turn the CUDA flag off when no CUDA device is available, then pick the device
args.cuda = torch.cuda.is_available() and args.cuda
args.device = torch.device('cuda') if args.cuda else torch.device('cpu')
print('Using CUDA:{}'.format(args.cuda))

Using CUDA:False


In [0]:
import re
import urllib

In [0]:
# Download the surnames dataset from GitHub and save it to args.data_file
# `import urllib` by itself does not load the `request` submodule, so import
# it explicitly instead of relying on another library having done so.
import urllib.request

url = "https://raw.githubusercontent.com/GokuMohandas/practicalAI/master/data/surnames.csv"
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read()  # raw bytes of the CSV file
# Write the bytes unchanged, hence binary mode.
with open(args.data_file, 'wb') as fp:
    fp.write(html)

In [8]:
# Load the downloaded CSV, taking column names from its first row,
# and preview the first five rows
df = pd.read_csv(filepath_or_buffer=args.data_file, header=0)
df.head(n=5)

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [9]:
# Group the rows by nationality and print the size of each group
by_nationality = collections.defaultdict(list)
for _, row in df.iterrows():
    record = row.to_dict()
    by_nationality[record["nationality"]].append(record)
for nationality, records in by_nationality.items():
    print ("{0}: {1}".format(nationality, len(records)))

English: 2972
French: 229
Arabic: 1603
Russian: 2373
Japanese: 775
Chinese: 220
Italian: 600
Czech: 414
Irish: 183
German: 576
Greek: 156
Spanish: 258
Polish: 120
Dutch: 236
Vietnamese: 58
Korean: 77
Portuguese: 55
Scottish: 75
