- 需求：
    - 导入文件，查看原始数据
    - 将人口数据和各州简称数据进行合并
    - 将合并的数据中重复的abbreviation列进行删除
    - 查看存在缺失数据的列
    - 找到有哪些state/region使得state的值为NaN，进行去重操作
    - 为找到的这些state/region的state项补上正确的值，从而去除掉state这一列的所有NaN
    - 合并各州面积数据areas
    - 我们会发现area(sq.mi)这一列有缺失数据，找出是哪些行
    - 去除含有缺失数据的行
    - 找出2010年的全民人口数据
    - 计算各州的人口密度
    - 排序，并找出人口密度最高的州

In [1]:
import pandas as pd

In [4]:
abb = pd.read_csv('./data/state-abbrevs.csv')
abb.head(3)#州的全称和简称

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ


In [5]:
pop = pd.read_csv('./data/state-population.csv')
pop.head(3)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0


In [7]:
area = pd.read_csv('./data/state-areas.csv')
area.head(3)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006


In [8]:
#将人口数据和各州简称数据进行合并
abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state/region',how='outer')
abb_pop

Unnamed: 0,state,abbreviation,state/region,ages,year,population
0,Alabama,AL,AL,under18,2012,1117489.0
1,Alabama,AL,AL,total,2012,4817528.0
2,Alabama,AL,AL,under18,2010,1130966.0
3,Alabama,AL,AL,total,2010,4785570.0
4,Alabama,AL,AL,under18,2011,1125763.0
...,...,...,...,...,...,...
2539,,,USA,total,2010,309326295.0
2540,,,USA,under18,2011,73902222.0
2541,,,USA,total,2011,311582564.0
2542,,,USA,under18,2012,73708179.0


In [9]:
abb_pop.drop(labels='abbreviation',axis=1,inplace=True)

In [11]:
#查看存在缺失数据的列
abb_pop.isnull().any(axis=0)

state            True
state/region    False
ages            False
year            False
population       True
dtype: bool

In [13]:
#找到有哪些state/region使得state的值为NaN，进行去重操作
abb_pop.loc[abb_pop['state'].isnull()]

Unnamed: 0,state,state/region,ages,year,population
2448,,PR,under18,1990,
2449,,PR,total,1990,
2450,,PR,total,1991,
2451,,PR,under18,1991,
2452,,PR,total,1993,
...,...,...,...,...,...
2539,,USA,total,2010,309326295.0
2540,,USA,under18,2011,73902222.0
2541,,USA,total,2011,311582564.0
2542,,USA,under18,2012,73708179.0


In [29]:
abb_pop.loc[abb_pop['state'].isnull()]['state/region'].unique()

array([], dtype=object)

In [17]:
#给state中存在的空值补上正确的值

In [23]:
#给PR简称对应的全称补上正确的值
abb_pop['state/region'] == 'PR'
abb_pop.loc[abb_pop['state/region'] == 'PR'] #将PR的行数据取出
PR_row_index = abb_pop.loc[abb_pop['state/region'] == 'PR'].index
abb_pop.loc[PR_row_index,'state'] = 'PPPRRR' #批量填充

In [28]:
#给USA简称对应的全称补上正确的值
abb_pop['state/region'] == 'USA'
abb_pop.loc[abb_pop['state/region'] == 'USA'] #取出的是USA对应的行数据
USA_row_index = abb_pop.loc[abb_pop['state/region'] == 'USA'].index

abb_pop.loc[USA_row_index,'state'] = 'United States'

In [31]:
abb_pop_area = pd.merge(abb_pop,area,how='outer')
abb_pop_area.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
0,Alabama,AL,under18,2012.0,1117489.0,52423.0
1,Alabama,AL,total,2012.0,4817528.0,52423.0
2,Alabama,AL,under18,2010.0,1130966.0,52423.0
3,Alabama,AL,total,2010.0,4785570.0,52423.0
4,Alabama,AL,under18,2011.0,1125763.0,52423.0


In [36]:
#我们会发现area(sq.mi)这一列有缺失数据，找出是哪些行
abb_pop_area['area (sq. mi)'].isnull()
abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()]
drop_index = abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()].index 

In [37]:
abb_pop_area.drop(labels=drop_index,axis=0,inplace=True)

In [38]:
#检测目前哪些列中存在缺失值
abb_pop_area.isnull().any(axis=0)

state            False
state/region      True
ages              True
year              True
population        True
area (sq. mi)    False
dtype: bool

In [39]:
for col in abb_pop_area.columns:
    if abb_pop_area[col].isnull().sum() > 0:
        drop_index = abb_pop_area.loc[abb_pop_area[col].isnull()].index
        abb_pop_area.drop(labels=drop_index,axis=0,inplace=True)

In [40]:
abb_pop_area.isnull().any(axis=0)

state            False
state/region     False
ages             False
year             False
population       False
area (sq. mi)    False
dtype: bool

In [43]:
#找出2010年的全民人口数据
abb_pop_area.query('year==2010 & ages=="total"')

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
3,Alabama,AL,total,2010.0,4785570.0,52423.0
91,Alaska,AK,total,2010.0,713868.0,656425.0
101,Arizona,AZ,total,2010.0,6408790.0,114006.0
189,Arkansas,AR,total,2010.0,2922280.0,53182.0
197,California,CA,total,2010.0,37333601.0,163707.0
283,Colorado,CO,total,2010.0,5048196.0,104100.0
293,Connecticut,CT,total,2010.0,3579210.0,5544.0
379,Delaware,DE,total,2010.0,899711.0,1954.0
389,District of Columbia,DC,total,2010.0,605125.0,68.0
475,Florida,FL,total,2010.0,18846054.0,65758.0


In [46]:
#计算各州的人口密度
abb_pop_area['midu'] = abb_pop_area['area (sq. mi)']/abb_pop_area['population']
abb_pop_area

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi),midu
0,Alabama,AL,under18,2012.0,1117489.0,52423.0,0.046911
1,Alabama,AL,total,2012.0,4817528.0,52423.0,0.010882
2,Alabama,AL,under18,2010.0,1130966.0,52423.0,0.046352
3,Alabama,AL,total,2010.0,4785570.0,52423.0,0.010954
4,Alabama,AL,under18,2011.0,1125763.0,52423.0,0.046567
...,...,...,...,...,...,...,...
2443,Wyoming,WY,under18,1993.0,137458.0,97818.0,0.711621
2444,Wyoming,WY,total,1991.0,459260.0,97818.0,0.212990
2445,Wyoming,WY,under18,1991.0,136720.0,97818.0,0.715462
2446,Wyoming,WY,under18,1990.0,136078.0,97818.0,0.718838


In [50]:
#根据密度排序
abb_pop_area.sort_values(by='midu',ascending=False)

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi),midu
49,Alaska,AK,under18,1990.0,177502.0,656425.0,3.698127
51,Alaska,AK,under18,1991.0,182180.0,656425.0,3.603167
85,Alaska,AK,under18,2008.0,183124.0,656425.0,3.584593
83,Alaska,AK,under18,2007.0,184344.0,656425.0,3.560870
52,Alaska,AK,under18,1992.0,184878.0,656425.0,3.550585
...,...,...,...,...,...,...,...
389,District of Columbia,DC,total,2010.0,605125.0,68.0,0.000112
431,District of Columbia,DC,total,1990.0,605321.0,68.0,0.000112
387,District of Columbia,DC,total,2011.0,619624.0,68.0,0.000110
385,District of Columbia,DC,total,2012.0,633427.0,68.0,0.000107
