In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Load the raw demographics file (semicolon-separated) and preview the first rows.
data = pd.read_csv('../data/us-cities-demographics.csv', sep=';')
data.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [6]:
# One-hot encode 'Race' into one indicator column per race value and swap
# those indicators in for the original 'Race' column.
# The dtype is pinned to uint8: pandas >= 2.0 returns bool indicators by
# default, while later cells compare the indicators to 1 and overwrite them
# with integer counts, so they must stay numeric 0/1 on every pandas version.
one_hot_races = pd.get_dummies(data['Race'], dtype='uint8')
data = data.drop(columns='Race').join(one_hot_races)
data.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Count,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,25924,0,0,0,1,0
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,58723,0,0,0,0,1
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,4759,0,1,0,0,0
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,24437,0,0,1,0,0
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,76402,0,0,0,0,1


In [7]:
# Summarize column dtypes and non-null counts after the one-hot step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 16 columns):
City                                 2891 non-null object
State                                2891 non-null object
Median Age                           2891 non-null float64
Male Population                      2888 non-null float64
Female Population                    2888 non-null float64
Total Population                     2891 non-null int64
Number of Veterans                   2878 non-null float64
Foreign-born                         2878 non-null float64
Average Household Size               2875 non-null float64
State Code                           2891 non-null object
Count                                2891 non-null int64
American Indian and Alaska Native    2891 non-null uint8
Asian                                2891 non-null uint8
Black or African-American            2891 non-null uint8
Hispanic or Latino                   2891 non-null uint8
White                  

In [8]:
# Cast the three text columns to str, one column at a time.
for text_col in ('City', 'State', 'State Code'):
    data[text_col] = data[text_col].astype('str')

In [9]:
# Re-check dtypes after the str casts; the recorded output still reports
# City, State and State Code as object.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 16 columns):
City                                 2891 non-null object
State                                2891 non-null object
Median Age                           2891 non-null float64
Male Population                      2888 non-null float64
Female Population                    2888 non-null float64
Total Population                     2891 non-null int64
Number of Veterans                   2878 non-null float64
Foreign-born                         2878 non-null float64
Average Household Size               2875 non-null float64
State Code                           2891 non-null object
Count                                2891 non-null int64
American Indian and Alaska Native    2891 non-null uint8
Asian                                2891 non-null uint8
Black or African-American            2891 non-null uint8
Hispanic or Latino                   2891 non-null uint8
White                  

In [10]:
def get_race_count(row, race_cols):
    """Replace every set race indicator in ``row`` with the row's ``'Count'``.

    Parameters
    ----------
    row : pandas.Series or dict-like
        One record. Must support ``row[key]`` lookup and ``.copy()``, and
        contain every name in ``race_cols`` (plus ``'Count'`` whenever at
        least one indicator equals 1).
    race_cols : iterable of str
        Names of the indicator entries to inspect.

    Returns
    -------
    Same type as ``row``
        A copy of ``row`` in which each ``race_cols`` entry that equalled 1
        holds ``row['Count']``; all other entries are unchanged.

    Notes
    -----
    The input is never modified. ``DataFrame.apply`` does not support
    functions that mutate the object they are handed, so the work is done on
    a copy and that copy is returned.
    """
    updated = row.copy()
    for col in race_cols:
        # Test against the untouched input so earlier replacements can never
        # influence later checks.
        if row[col] == 1:
            updated[col] = row['Count']
    return updated
    

In [11]:
# Names of the race indicator columns.
race_columns = [
    'American Indian and Alaska Native',
    'Asian',
    'Black or African-American',
    'Hispanic or Latino',
    'White',
]
# Turn each row's set indicator into that row's Count, then discard 'Count'.
data = data.apply(get_race_count, axis=1, args=(race_columns,)).drop(columns='Count')
data.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,0,0,0,25924,0
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,0,0,0,0,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,0,4759,0,0,0
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,0,0,24437,0,0
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,0,0,0,0,76402


In [12]:
# Keep only the city/state keys and the five race count columns.
race_data = data.loc[
    :,
    ['City', 'State', 'American Indian and Alaska Native', 'Asian',
     'Black or African-American', 'Hispanic or Latino', 'White'],
]
race_data.head()

Unnamed: 0,City,State,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Silver Spring,Maryland,0,0,0,25924,0
1,Quincy,Massachusetts,0,0,0,0,58723
2,Hoover,Alabama,0,4759,0,0,0
3,Rancho Cucamonga,California,0,0,24437,0,0
4,Newark,New Jersey,0,0,0,0,76402


In [13]:
# Collapse the rows of each (City, State) pair into one row by taking the
# maximum of every race column within the group.
# The columns are selected with a list: selecting with a bare tuple of names
# (``gb['a', 'b']``) was deprecated in pandas 1.0 and raises in pandas 2.0.
race_dataset = race_data.groupby(["City", "State"])[
    ['American Indian and Alaska Native', 'Asian', 'Black or African-American',
     'Hispanic or Latino', 'White']
].max()

In [14]:
# Preview the per-(City, State) aggregated race columns.
race_dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
City,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abilene,Texas,1813,2929,14449,33222,95487
Akron,Ohio,1845,9033,66551,3684,129192
Alafaya,Florida,0,10336,6577,34897,63666
Alameda,California,1329,27984,7364,8265,44232
Albany,Georgia,445,650,53440,1783,17160


In [15]:
# Remove the race columns from the main frame.
data = data.drop(columns=race_columns)
data.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ


In [16]:
# Join the aggregated race columns back onto the main frame, matching on the
# City and State keys (default inner join).
city_demographics_data = data.merge(race_dataset, on=["City", "State"])
city_demographics_data.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,1084,8841,21330,25924,37756
1,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,1084,8841,21330,25924,37756
2,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,1084,8841,21330,25924,37756
3,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,1084,8841,21330,25924,37756
4,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,1084,8841,21330,25924,37756


In [17]:
# Drop fully identical rows, keeping the first occurrence of each.
city_demographics_data = city_demographics_data.drop_duplicates()
city_demographics_data.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,1084,8841,21330,25924,37756
5,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,351,30473,3917,2566,58723
10,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,0,4759,18191,3430,61869
14,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,2789,24519,24437,65823,111832
19,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,2268,7349,144961,100432,76402


In [18]:
# Summarize dtypes and non-null counts of the de-duplicated merged frame.
city_demographics_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 596 entries, 0 to 2886
Data columns (total 15 columns):
City                                 596 non-null object
State                                596 non-null object
Median Age                           596 non-null float64
Male Population                      595 non-null float64
Female Population                    595 non-null float64
Total Population                     596 non-null int64
Number of Veterans                   589 non-null float64
Foreign-born                         589 non-null float64
Average Household Size               588 non-null float64
State Code                           596 non-null object
American Indian and Alaska Native    596 non-null int64
Asian                                596 non-null int64
Black or African-American            596 non-null int64
Hispanic or Latino                   596 non-null int64
White                                596 non-null int64
dtypes: float64(6), int64(6), object(3)

In [19]:
# Write the merged frame to CSV, omitting the DataFrame index.
city_demographics_data.to_csv('../data/data-for-s3-upload/us-cities-demographics.csv', index = False)

In [23]:
# Read a CSV named 'us-cities-demographics.csv' from the current working
# directory and preview it.
# NOTE(review): this path differs from the '../data/data-for-s3-upload/'
# location the export cell writes to -- confirm this is the intended file.
data1 = pd.read_csv('us-cities-demographics.csv')
# data1.reset_index()
data1.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,1084,8841,21330,25924,37756
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,351,30473,3917,2566,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,0,4759,18191,3430,61869
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,2789,24519,24437,65823,111832
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,2268,7349,144961,100432,76402


In [24]:
# List the column labels of the reloaded frame.
data1.columns

Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code',
       'American Indian and Alaska Native', 'Asian',
       'Black or African-American', 'Hispanic or Latino', 'White'],
      dtype='object')

In [None]:
# NOTE(review): unexecuted scratch cell. No `df` is defined in the visible
# notebook, and the reloaded frame has no "A" or "B" columns, so this would
# presumably fail on a fresh Run All -- confirm intent or remove the cell.
df.rename(columns={"A": "a", "B": "c"})
