### Wrangling custom CDC Wonder Population Estimates 1990-2015

### Clean and Select Years

In [16]:
# Load the raw CDC Wonder export, drop the duplicate year column and the
# trailing non-data rows, cast the year to int, and keep only 2005-2015.
import pandas as pd

FOOTER_ROWS = 29                    # number of trailing rows cut from the bottom of the .csv
FIRST_YEAR, LAST_YEAR = 2005, 2015  # inclusive range of years to keep

df = pd.read_csv("CDC_population_estimates.csv")

# Drop the duplicate year column (second column of the export).
df = df.drop(columns=[df.columns[1]])

# Remove the rows at the bottom of the .csv; copy so the cast below writes to
# an independent frame rather than to a slice of the raw one.
df = df.iloc[:-FOOTER_ROWS].copy()

# After the drop above, position 1 holds the remaining year column.
# It is read as float, so convert it to int.
year_col = df.columns[1]
df[year_col] = df[year_col].astype(int)

# Select 2005-2015 (Series.between includes both end points by default).
df_filtered = df[df[year_col].between(FIRST_YEAR, LAST_YEAR)]

df_filtered.to_csv("population_estimates_2005_2015.csv", index=False)

# A preview is only rendered when it is the last expression in the cell.
df_filtered.head()


### Applying custom age_recode

In [20]:
# Add the custom age-group code to the 2005-2015 estimates and save the result.
import pandas as pd

df = pd.read_csv("population_estimates_2005_2015.csv")

# Remove the "Age" column; the binning below works from "Age Code".
df = df.drop(columns=["Age"])

# Bin edges for age_recode_custom. For whole-number ages the groups are
# 0-24, 25-39, 40-54, 55-69, 70-84 and 85+, labelled 0..5 in that order.
bins = [0, 24, 39, 54, 69, 84, float('inf')]
labels = [0, 1, 2, 3, 4, 5]

# right=True closes each bin on its upper edge; include_lowest=True keeps the
# lowest edge (age 0) inside the first bin instead of leaving it unbinned.
df['age_recode_custom'] = pd.cut(df['Age Code'], bins=bins, labels=labels, right=True, include_lowest=True)

df.to_csv('population_estimates_2005_2015_age.csv', index=False)

### Removing "Sex" column in favour of "Sex Code"

In [23]:
# Load the age-recoded estimates and discard the "Sex" column
# (the "Sex Code" column is kept), then save under a new file name.
df = (
    pd.read_csv("population_estimates_2005_2015_age.csv")
    .drop(columns=["Sex"])
)
df.to_csv('population_estimates_2005_2015_age_sex.csv', index=False)

# Preview the first rows.
df.head()

Unnamed: 0,Notes,Yearly July 1st Estimates Code,Age Code,Sex Code,Race,Race Code,Population,age_recode_custom
0,,2005,0.0,F,American Indian or Alaska Native,1002-5,32194.0,0.0
1,,2005,0.0,F,Asian or Pacific Islander,A-PI,104342.0,0.0
2,,2005,0.0,F,Black or African American,2054-5,318126.0,0.0
3,,2005,0.0,F,White,2106-3,1502020.0,0.0
4,Total,2005,0.0,F,,,1956682.0,0.0


### Converting Race Code to our "race_recode_5" and dropping "Race Code"

In [29]:
import pandas as pd

# Lookup from the "Race Code" values in the file to the "race_recode_5" values.
race_mapping = {'2106-3': 1, '2054-5': 2, '1002-5': 3, 'A-PI': 4}

df = pd.read_csv("population_estimates_2005_2015_age_sex.csv")

# Translate each code through the lookup (values not in the lookup become NaN),
# append the result as "race_recode_5", then drop the source column.
df = (
    df.assign(race_recode_5=df['Race Code'].map(race_mapping))
      .drop(columns=["Race Code"])
)

df.to_csv('population_estimates_2005_2015_age_sex_race.csv', index=False)

# Preview the first rows.
df.head()


Unnamed: 0,Notes,Yearly July 1st Estimates Code,Age Code,Sex Code,Race,Population,age_recode_custom,race_recode_5
0,,2005,0.0,F,American Indian or Alaska Native,32194.0,0.0,3.0
1,,2005,0.0,F,Asian or Pacific Islander,104342.0,0.0,4.0
2,,2005,0.0,F,Black or African American,318126.0,0.0,2.0
3,,2005,0.0,F,White,1502020.0,0.0,1.0
4,Total,2005,0.0,F,,1956682.0,0.0,


### Removing rows with "Total" and the column "Notes", renaming columns, converting to int.

In [44]:
import pandas as pd

df = pd.read_csv("population_estimates_2005_2015_age_sex_race.csv")

# Columns cast to integer, and the final column names.
int_columns = ["race_recode_5", "Age Code", "age_recode_custom", "Population"]
new_names = {"Sex Code": "Sex", "Age Code": "Age", "Yearly July 1st Estimates Code": "Year"}

df_filtered = (
    df.loc[df['Notes'].ne('Total')]                 # keep every row not marked "Total"
      .drop(columns=["Notes"])                      # Notes is no longer needed
      .astype({name: int for name in int_columns})  # cast while the original names still apply
      .rename(columns=new_names)
)

df_filtered.to_csv('population_data_v1.csv', index=False)

# Preview the first rows.
df_filtered.head()

Unnamed: 0,Year,Age,Sex,Race,Population,age_recode_custom,race_recode_5
0,2005,0,F,American Indian or Alaska Native,32194,0,3
1,2005,0,F,Asian or Pacific Islander,104342,0,4
2,2005,0,F,Black or African American,318126,0,2
3,2005,0,F,White,1502020,0,1
5,2005,0,M,American Indian or Alaska Native,32637,0,3
