In [1]:
import numpy as np, pandas as pd

In [2]:
# Read the three input tables: population records, state areas, and the
# lookup table relating full state names to their abbreviations.
pop = pd.read_csv("state-population.csv")
areas = pd.read_csv("state-areas.csv")
abbrevs = pd.read_csv("state-abbrevs.csv")

# Preview each table, one after another; .head() returns the first 5 rows.
print(pop.head(), areas.head(), abbrevs.head(), sep="\n")

  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA


In [3]:
# Outer-join the population records with the abbreviation table, matching
# pop["state/region"] against abbrevs["abbreviation"]. how="outer" keeps rows
# that have no partner on the other side. The "abbreviation" key column is
# dropped right away because "state/region" already carries the join key.
merged = (
    pop.merge(abbrevs, how="outer",
              left_on="state/region", right_on="abbreviation")
       .drop(columns="abbreviation")
)
print(merged.head(), "\n")

# Report, per column, whether it contains any null value.
print(merged.isna().any(), "\n")

# Look at the rows behind the nulls in "population" and in "state".
print(merged[merged["population"].isna()].head(), "\n")  # Puerto Rico
print(merged.loc[merged["state"].isna(), "state/region"].unique(), "\n")  # abbrevs that were not mapped

# Fill in the missing full names for the two unmapped region codes.
merged.loc[merged["state/region"].eq("PR"), "state"] = "Puerto Rico"
merged.loc[merged["state/region"].eq("USA"), "state"] = "United States"

# Confirm that the names were written for both region codes.
print(merged[merged["state/region"].eq("PR")].head(), "\n")
print(merged[merged["state/region"].eq("USA")].head(), "\n")

print(merged.head())

  state/region     ages  year  population    state
0           AL  under18  2012   1117489.0  Alabama
1           AL    total  2012   4817528.0  Alabama
2           AL  under18  2010   1130966.0  Alabama
3           AL    total  2010   4785570.0  Alabama
4           AL  under18  2011   1125763.0  Alabama 

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool 

     state/region     ages  year  population state
2448           PR  under18  1990         NaN   NaN
2449           PR    total  1990         NaN   NaN
2450           PR    total  1991         NaN   NaN
2451           PR  under18  1991         NaN   NaN
2452           PR    total  1993         NaN   NaN 

['PR' 'USA'] 

     state/region     ages  year  population        state
2448           PR  under18  1990         NaN  Puerto Rico
2449           PR    total  1990         NaN  Puerto Rico
2450           PR    total  1991         NaN  Puerto Rico
2451         

In [18]:
# Attach the area of each state. A left join keeps every row of `merged`;
# rows whose "state" has no entry in `areas` get a null area.
final = merged.merge(areas, how="left", on="state")
print(final.head(), "\n")

# Report, per column, whether it contains any null value
# (the area column does after this join).
print(final.isna().any(), "\n")

# Select the "state" values through a boolean mask to list which states
# ended up without an area.
print(final.loc[final["area (sq. mi)"].isna(), "state"].unique(), "\n")  # United States

# The area of the United States as a whole is not relevant here, so the
# rows containing nulls are removed.
# NOTE: dropna() without arguments discards every row that has a null in
# ANY column, so rows with a missing population are removed as well.
final = final.dropna()

  state/region     ages  year  population    state  area (sq. mi)
0           AL  under18  2012   1117489.0  Alabama        52423.0
1           AL    total  2012   4817528.0  Alabama        52423.0
2           AL  under18  2010   1130966.0  Alabama        52423.0
3           AL    total  2010   4785570.0  Alabama        52423.0
4           AL  under18  2011   1125763.0  Alabama        52423.0 

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool 

['United States'] 



In [25]:
# Keep only the rows for the year 2010 and the total population.
# query() takes the filter as an expression string.
data2010 = final.query("year==2010 & ages==\"total\"")
print(data2010.head(), "\n")

# Re-index by state name, then compute the population density
# (population divided by area in square miles), ordered from highest to lowest.
data2010 = data2010.set_index("state")
density = (
    data2010["population"]
    .div(data2010["area (sq. mi)"])
    .sort_values(ascending=False)
)

print(density.head(), "\n")  # five highest population densities
print(density.tail(), "\n")  # five lowest population densities

    state/region   ages  year  population       state  area (sq. mi)
3             AL  total  2010   4785570.0     Alabama        52423.0
91            AK  total  2010    713868.0      Alaska       656425.0
101           AZ  total  2010   6408790.0     Arizona       114006.0
189           AR  total  2010   2922280.0    Arkansas        53182.0
197           CA  total  2010  37333601.0  California       163707.0 

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64 

state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64 

