两篇论文:  
[https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html)  
[http://vita.had.co.nz/papers/tidy-data.html](http://vita.had.co.nz/papers/tidy-data.html)


### 从网络获取数据

In [5]:
import os
import requests
import pandas as pd

# 发送get请求，获取网页数据（tsv文件）
url = 'https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv'
res = requests.get(url)

# 创建文件夹
folder_name = 'data'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# 输出到文件
file_name = url.split('/')[-1]
with open(os.path.join(folder_name, file_name), mode='wb') as file:
    file.write(res.content)

# 再用pandas读取tsv文件
data = pd.read_table(os.path.join(folder_name, file_name))
data.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


### 用 python 从互联网下载文件

In [7]:
import os
import requests

data_urls = ['https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv']
# 新建文件夹
folder_name = 'data'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# 对url循环遍历
for url in data_urls:
    res = requests.get(url)
    file_name = url.split('/')[-1]
    # 将网页内容写入到文件
    with open(os.path.join(folder_name, file_name), mode='wb') as file:
        file.write(res.content)

## python 中处理 json 数据  
json库官指：http://docs.python-guide.org/en/latest/scenarios/json/
#### json对象有如下形式，它的键必须是字符串：

In [9]:
{
  "Directed by": "Steven Spielberg",
  "Produced by": [
    "K.K.",
    "Steven Spielberg"
  ],
  "Written by": "Melissa Mathison"
  "Running time": 114
}

IndentationError: unexpected indent (<ipython-input-9-112a33cdacef>, line 2)

json 对象的值可以是任何有效的 json 数据类型：字符串，数字，JSON 数组，布尔值，空值（null），JSON 对象。    
JSON 数据来源   
- 可能来自 API   
- 也可能来自文本文件，如此，就要引入 json 库

In [11]:
json_string = '{"first_name": "Guido", "last_name":"Rossum"}'

import json
# JSON data to normal dictionary
parsed_json = json.loads(json_string)

# dictionary to JSON data
d = {
    'first_name': 'Guido',
    'second_name': 'Rossum',
    'titles': ['BDFL', 'Developer'],
}

print(json.dumps(d))
'{"first_name": "Guido", "last_name": "Rossum", "titles": ["BDFL", "Developer"]}'

{"first_name": "Guido", "second_name": "Rossum", "titles": ["BDFL", "Developer"]}


'{"first_name": "Guido", "last_name": "Rossum", "titles": ["BDFL", "Developer"]}'

### pandas 用均值填充缺失值列的技巧

In [12]:
for column in list(df.columns[df.isnull().sum() > 0]):
    mean_val = df[column].mean()
    df[column].fillna(mean_val, inplace=True)

# -------代码分解-------
# 判断哪些列有缺失值，得到series对象
df.isnull().sum() > 0
# output
contributors                      True
coordinates                       True
created_at                       False
display_text_range               False
entities                         False
extended_entities                 True
favorite_count                   False
favorited                        False
full_text                        False
geo                               True
id                               False
id_str                           False
...

# 根据上一步结果，筛选需要填充的列
df.columns[df.isnull().sum() > 0]
# output
Index(['contributors', 'coordinates', 'extended_entities', 'geo',
       'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'place', 'possibly_sensitive',
       'possibly_sensitive_appealable', 'quoted_status', 'quoted_status_id',
       'quoted_status_id_str', 'retweeted_status'],
      dtype='object')

SyntaxError: invalid syntax (<ipython-input-12-edbf3fb41d34>, line 9)

#### pandas 中 .stack() 的使用
有时候需要将特征名称转化为变量，也就是将数据集由横向改为纵向，或者为转秩。使用场景如下：

In [13]:
# 数据集
In [5]: test
Out[5]: 
             tweet_id doggo floofer  pupper puppo
0  675003128568291329  None    None    None  None
1  786233965241827333  None    None    None  None
2  683481228088049664  None    None  pupper  None
3  675497103322386432  None    None    None  None

# 先设置index，再使用.stack()方法由横向变纵向，对特征进行命名
In [6]: s1 = test.set_index('tweet_id').stack().rename('stage')

In [7]: s1
Out[7]: 
tweet_id                   
675003128568291329  doggo        None
                    floofer      None
                    pupper       None
                    puppo        None
786233965241827333  doggo        None
                    floofer      None
                    pupper       None
                    puppo        None
683481228088049664  doggo        None
                    floofer      None
                    pupper     pupper
                    puppo        None
675497103322386432  doggo        None
                    floofer      None
                    pupper       None
                    puppo        None
Name: stage, dtype: object

# 将多重索引reset
In [8]: s2 = s1.reset_index()

In [9]: s2
Out[9]: 
              tweet_id  level_1   stage
0   675003128568291329    doggo    None
1   675003128568291329  floofer    None
2   675003128568291329   pupper    None
3   675003128568291329    puppo    None
4   786233965241827333    doggo    None
5   786233965241827333  floofer    None
6   786233965241827333   pupper    None
7   786233965241827333    puppo    None
8   683481228088049664    doggo    None
9   683481228088049664  floofer    None
10  683481228088049664   pupper  pupper
11  683481228088049664    puppo    None
12  675497103322386432    doggo    None
13  675497103322386432  floofer    None
14  675497103322386432   pupper    None
15  675497103322386432    puppo    None

# 将level_1列删除，同时stage列只保留不为none的数据
In [10]: s2.drop(['level_1'], axis=1, inplace=True)

In [11]: s3 = s2[s2.stage != 'None']

In [12]: s3
Out[12]: 
              tweet_id   stage
10  683481228088049664  pupper

# 跟原始数据集进行合并
In [14]: result = pd.merge(test, s3, how='left', on='tweet_id')

In [15]: result
Out[15]: 
             tweet_id doggo floofer  pupper puppo   stage
0  675003128568291329  None    None    None  None     NaN
1  786233965241827333  None    None    None  None     NaN
2  683481228088049664  None    None  pupper  None  pupper
3  675497103322386432  None    None    None  None     NaN

# 删除中间特征，得到最终结果
In [16]: result.drop(['doggo','floofer','pupper','puppo'], axis=1)
Out[16]: 
             tweet_id   stage
0  675003128568291329     NaN
1  786233965241827333     NaN
2  683481228088049664  pupper
3  675497103322386432     NaN

In [17]: test
Out[17]: 
             tweet_id doggo floofer  pupper puppo
0  675003128568291329  None    None    None  None
1  786233965241827333  None    None    None  None
2  683481228088049664  None    None  pupper  None
3  675497103322386432  None    None    None  None

SyntaxError: invalid syntax (<ipython-input-13-546b5c3e58f8>, line 3)

正则表达式：https://blog.csdn.net/Guo_ya_nan/article/details/80307814  
coursera课程：https://www.coursera.org/learn/python-network-data#pricing