In [1]:
import pandas as pd
import numpy as np

# 传入列表

## 一维列表

- 生成 1 列，行索引（index）和列名（columns）默认为从 0 开始的整数
- 若想生成一行，可使用转置

In [2]:
# A flat list yields a single column: each element becomes its own row.
data = [1, 2, 3]
pd.DataFrame(data=data)

Unnamed: 0,0
0,1
1,2
2,3


In [3]:
# Transposing the single-column frame lays the list out as one row instead.
data = [1, 2, 3]
pd.DataFrame(data).transpose()

Unnamed: 0,0,1,2
0,1,2,3


## 二维列表

- 内层每一个列表为一行
- index和column默认为数字

In [4]:
# Every inner list supplies one row of the resulting frame.
data = [
    [1, 2, 3],
    [4, 5, 6],
]
pd.DataFrame(data=data)

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


# 传入字典

- 字典的key为DataFrame的列名，index默认为数字
- 字典的value为每一列的值
- 字典的value可以是list，array，series

In [5]:
# Each key names a column; the list behind it holds that column's values.
data = dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9])
pd.DataFrame(data=data)

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [6]:
# Columns 'a', 'b', 'c' each get three random floats, drawn in that order.
# No seed is set in this cell, so the displayed numbers change on every run.
data = {column: np.random.rand(3) for column in ('a', 'b', 'c')}
pd.DataFrame(data=data)

Unnamed: 0,a,b,c
0,0.262483,0.62392,0.545245
1,0.694982,0.997921,0.642344
2,0.899702,0.895937,0.759665


In [7]:
# Preview a Series built from two random floats before using one as a column.
pd.Series(np.random.rand(2))

0    0.588306
1    0.324417
dtype: float64

In [8]:
# A Series can serve as a column value as well.
# A second, longer column ('b', three random values) was tried here and is
# left disabled.
data = dict(a=pd.Series(np.random.rand(2)))
pd.DataFrame(data=data)

Unnamed: 0,a
0,0.546029
1,0.747325


# 传入array

- 二维array的每一行为DataFrame的行
- 一维array的每个值为一行，生成一列

In [9]:
# Build a 3x3 array holding 0..8 to show what the DataFrame will receive.
np.arange(9).reshape(3,3)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [10]:
# Each row of the 2-D array becomes one row of the frame.
pd.DataFrame(np.arange(9).reshape(3,3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [11]:
# A 1-D array produces a single column with one row per element.
pd.DataFrame(np.arange(3))

Unnamed: 0,0
0,0
1,1
2,2


# 列表中的字典

- 字典的key为DataFrame的列名，每个字典对应一行；某个字典缺少的key，对应位置填充为NaN

In [12]:
# One dict per row; the second record carries an extra "three" key.
data = [
    {"one": 1, "two": 2},
    {"one": 5, "two": 10, "three": 15},
]
pd.DataFrame(data=data)

Unnamed: 0,one,two,three
0,1,2,
1,5,10,15.0


# 字典中的字典

- 外层字典的key为DataFrame的列名
- 内层字典的key为DataFrame的index；内层字典缺少的key，对应位置填充为NaN

In [13]:
# Outer keys name the columns, inner keys name the rows; "Tom" has no "art" entry.
data = {
    "Jack": dict(math=90, english=89, art=78),
    "Marry": dict(math=82, english=95, art=96),
    "Tom": dict(math=85, english=94),
}
pd.DataFrame(data=data)

Unnamed: 0,Jack,Marry,Tom
math,90,82,85.0
english,89,95,94.0
art,78,96,


# 实战

In [14]:
from pyquery import PyQuery as pq
import pandas as pd

# Scrape result pages 1-10 and append every listing row to the CSV.
for i in range(10):
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'.format(i+1)
    doc = pq(url=url, encoding='gbk')

    # Gather the whole page first so the file is opened once per page rather
    # than once per listing. Cell order per row: job name, company, address,
    # salary, date.
    # NOTE(review): the later parse_save cell drops the first '.el' row of each
    # page (presumably the table's title row); this cell keeps it -- confirm
    # which behaviour is intended.
    page_rows = [
        [item(selector).text() for selector in ('.t1', '.t2', '.t3', '.t4', '.t5')]
        for item in doc('.dw_table .el').items()
    ]

    # Nothing matched on this page: skip the write instead of touching the file.
    if page_rows:
        pd.DataFrame(page_rows).to_csv('51job_python_df_10.csv',
                                       mode='a', index=False, header=False)

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [16]:
from pyquery import PyQuery as pq

# Fetch only the first result page (presumably a connectivity check -- confirm).
url2 = 'https://www.baidu.com/'  # defined but never requested in this cell
url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html'
doc = pq(encoding='gbk', url=url)

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
from pyquery import PyQuery as pq
import pandas as pd


def parse_save(url, csv_path='51job_python_df.csv'):
    """Scrape one result page and append its listings to a CSV file.

    Parameters
    ----------
    url : str
        Address of the result page. It is loaded through ``pq`` with GBK
        decoding.
    csv_path : str, optional
        File the rows are appended to. Defaults to ``'51job_python_df.csv'``,
        the path this function always used before the parameter existed.

    Returns
    -------
    None
        The function only writes to ``csv_path``. Each row holds, in order,
        job name, company, address, salary and date; no header line and no
        index column are written.
    """
    doc = pq(url=url, encoding='gbk')

    # One record per matched element, cells taken in column order.
    selectors = ('.t1', '.t2', '.t3', '.t4', '.t5')
    rows = [
        [item(selector).text() for selector in selectors]
        for item in doc('.dw_table .el').items()
    ]

    # The first matched element of each page is dropped (presumably the
    # table's own title row -- confirm against the page markup).
    listings = rows[1:]
    if not listings:
        # No listings on this page: leave the file untouched.
        return

    pd.DataFrame(listings).to_csv(csv_path, mode='a', index=False, header=False)


# Driver: write the CSV's title row, then scrape result pages 1-10 into it.
if __name__ == '__main__':

    # The title row (job name, company, address, salary, publish date) is
    # written without mode='a', unlike the append done inside parse_save, so
    # it goes in ahead of the scraped rows.
    pd.DataFrame('职位名 公司名 地址 薪资 发布时间'.split(' ')).T.to_csv('51job_python_df.csv', index=False,
                                                                      header=False)
    # '{}' is filled with the 1-based page number.
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'
    for page in range(10):
        parse_save(url.format(page+1))
    # Prints a completion message ("crawl finished").
    print('爬取完毕')