<a href="https://colab.research.google.com/github/katiesunsg/comparing-baltimore-nyc-teenagebirthrate/blob/master/baltimore_nyc_teenagebirthrate_income_katiesun.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comparing Baltimore and New York City Teenage Birth Rates by Income Level
* How do teenage birth rates differ within each of the two cities by income level when stratifying for all income, high income, middle income, and low income?
* Do we see similar trends for teenage birth rates between the two cities?

In [1]:
# pip install plotly

In [2]:
# for data analysis
import pandas as pd
import numpy as np

# for viz
import plotly.express as px

In [30]:
# Load the raw CSVs directly from the project's GitHub repository.

# Baltimore: teenage birth rates with an income-level column
df_balti_teenagebirths = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/katiesunsg/comparing-baltimore-nyc-teenagebirthrate/master/Balti-Raw-TeenageBirthRate-Income_AllHighMiddleLow.csv"
)

# New York City: same layout as the Baltimore file
df_nyc_teenagebirths = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/katiesunsg/comparing-baltimore-nyc-teenagebirthrate/master/NYC-Raw-TeenageBirthRate-Income_AllHighMiddleLow.csv"
)


In [31]:
# First five rows of the Baltimore data
df_balti_teenagebirths.head(n=5)

Unnamed: 0,tract,Neighborhood,City,State,Teenage_Birth_Rate_women_only_rP_gF_pall,Income
0,24510280500,Pleasant View Gardens,Baltimore,MD,0.6596,All
1,24510250204,Cherry Hill,Baltimore,MD,0.6561,All
2,24510120400,Barclay,Baltimore,MD,0.6273,All
3,24510200100,Lexington,Baltimore,MD,0.6078,All
4,24510170200,McCulloh Homes,Baltimore,MD,0.6074,All


In [32]:
# First five rows of the New York City data
df_nyc_teenagebirths.head(n=5)

Unnamed: 0,tract,Neighborhood,City,State,Teenage_Birth_Rate_women_only_rP_gF_pall,Income
0,36061020600,East Harlem,New York,NY,0.4863,All
1,36061025900,Harlem,New York,NY,0.444,All
2,36061023502,Harlem,New York,NY,0.4146,All
3,36061022800,Harlem,New York,NY,0.4125,All
4,36061023200,Harlem,New York,NY,0.3984,All


In [34]:
# Last five rows of the New York City data
df_nyc_teenagebirths.tail(n=5)

Unnamed: 0,tract,Neighborhood,City,State,Teenage_Birth_Rate_women_only_rP_gF_pall,Income
1143,36061010200,Midtown,New York,NY,,Low
1144,36061031900,Lower Manhattan,New York,NY,,Low
1145,36061010100,Midtown,New York,NY,,Low
1146,36061011202,Midtown,New York,NY,,Low
1147,36061031100,Upper Manhattan,New York,NY,,Low


# Filtering Data for Neighborhoods

In [9]:
# Distinct Baltimore neighborhood names, in order of first appearance
df_balti_teenagebirths.loc[:, "Neighborhood"].unique()

array(['Pleasant View Gardens', 'Cherry Hill', 'Barclay', 'Lexington',
       'McCulloh Homes', 'Broadway East', 'Shipley Hill', 'Oliver',
       'Langston Hughes', 'Upton', 'Butchers Hill',
       'Coldstream - Homestead - Montebello', 'Perkins Homes',
       'Sandtown-Winchester', 'Mosher', 'Gay Street',
       'Bentalou-Smallwood', 'Better Waverly', 'Darley Park',
       'Belair - Edison', 'Madison - Eastend', 'Penn North',
       'East Baltimore Midway', 'Harlem Park', 'Greenmount West',
       'Milton - Montford', 'Berea', 'Midtown Edmondson',
       'Johnson Square', 'Burleith-Leighton', 'Franklin Square',
       'Harwood', 'Central Park Heights', 'Westport', 'Cedonia',
       'Frankford', 'NW Community Action', 'Druid Heights', 'Old Goucher',
       'Patterson Park', "O'Donnell Heights", 'Arlington', 'Mondawmin',
       'Edgecomb', 'Reservoir Hill', 'Garwyn Oaks', 'Pigtown',
       'Coppin Heights', "Saint Joseph's", 'Downtown', 'Irvington',
       'Winston - Govans', 'Edmondson

In [10]:
# How many distinct Baltimore neighborhoods there are (missing names excluded)
df_balti_teenagebirths.loc[:, "Neighborhood"].nunique(dropna=True)

131

In [11]:
# Distinct New York City neighborhood names, in order of first appearance
df_nyc_teenagebirths.loc[:, "Neighborhood"].unique()

array(['East Harlem', 'Harlem', 'Alphabet City', 'Upper Manhattan',
       'Lower Manhattan', 'Inwood', 'Fort George', 'Upper West Side',
       'West Bronx', 'Midtown', 'Lincoln Square', "Hell's Kitchen",
       'Hudson Heights', 'Bowery', 'Upper East Side', 'East Village',
       'Bloomingdale', 'Roosevelt Island', 'Gramercy Park', 'Lenox Hill',
       'Lower East Side', 'Kips Bay', 'Chinatown', 'Midtown East',
       'Yorkville', 'Randalls Island', 'Central Park', 'Governors Island',
       'Garment District'], dtype=object)

In [12]:
# How many distinct New York City neighborhoods there are (missing names excluded)
df_nyc_teenagebirths.loc[:, "Neighborhood"].nunique(dropna=True)

29

In [7]:
# List the Baltimore frame's column labels (shown here before the
# birth-rate column is renamed further down).
df_balti_teenagebirths.columns

Index(['tract', 'Neighborhood', 'City', 'State',
       'Teenage_Birth_Rate_women_only_rP_gF_pall', 'Income'],
      dtype='object')

In [13]:
# Per-column dtype and non-null count for the Baltimore frame; a non-null
# count below the row count flags a column with missing values.
df_balti_teenagebirths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 6 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   tract                                     772 non-null    int64  
 1   Neighborhood                              772 non-null    object 
 2   City                                      772 non-null    object 
 3   State                                     772 non-null    object 
 4   Teenage_Birth_Rate_women_only_rP_gF_pall  765 non-null    float64
 5   Income                                    772 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 36.3+ KB


In [14]:
# Summary statistics for the numeric columns of the Baltimore frame.
# NOTE(review): `tract` looks like an identifier, so presumably only the
# birth-rate column's statistics are meaningful here.
df_balti_teenagebirths.describe()

Unnamed: 0,tract,Teenage_Birth_Rate_women_only_rP_gF_pall
count,772.0,765.0
mean,24494490000.0,0.362926
std,87713030.0,0.148345
min,24003750000.0,0.0098
25%,24510120000.0,0.2542
50%,24510190000.0,0.3753
75%,24510260000.0,0.4823
max,24510280000.0,0.6596


In [77]:
# Give the Baltimore birth-rate column a readable name
df_balti_teenagebirths = df_balti_teenagebirths.rename(
    {"Teenage_Birth_Rate_women_only_rP_gF_pall": "Teenage Birth Rate"},
    axis="columns",
)

# Confirm the new header
df_balti_teenagebirths.head(n=5)

Unnamed: 0,tract,Neighborhood,City,State,Teenage Birth Rate,Income
0,24510280500,Pleasant View Gardens,Baltimore,MD,0.6596,All
1,24510250204,Cherry Hill,Baltimore,MD,0.6561,All
2,24510120400,Barclay,Baltimore,MD,0.6273,All
3,24510200100,Lexington,Baltimore,MD,0.6078,All
4,24510170200,McCulloh Homes,Baltimore,MD,0.6074,All


In [79]:
# Give the New York City birth-rate column a readable name
df_nyc_teenagebirths = df_nyc_teenagebirths.rename(
    {"Teenage_Birth_Rate_women_only_rP_gF_pall": "Teenage Birth Rate"},
    axis="columns",
)

# Confirm the new header
df_nyc_teenagebirths.head(n=5)


Unnamed: 0,tract,Neighborhood,City,State,Teenage Birth Rate,Income
0,36061020600,East Harlem,New York,NY,0.4863,All
1,36061025900,Harlem,New York,NY,0.444,All
2,36061023502,Harlem,New York,NY,0.4146,All
3,36061022800,Harlem,New York,NY,0.4125,All
4,36061023200,Harlem,New York,NY,0.3984,All


# Removing Null Values for Teenage Birth Rates

In [87]:
# Count missing values in each column of the Baltimore frame
df_balti_teenagebirths.isna().sum(axis=0)

tract                 0
Neighborhood          0
City                  0
State                 0
Teenage Birth Rate    7
Income                0
dtype: int64

In [88]:
# Count missing values in each column of the New York City frame
df_nyc_teenagebirths.isna().sum(axis=0)

tract                  0
Neighborhood           0
City                   0
State                  0
Teenage Birth Rate    61
Income                 0
dtype: int64

In [89]:
# Drop Baltimore rows that have no teenage birth rate.
# Restricting dropna to that column keeps a row whose rate is valid even if
# some other column were missing; a bare dropna() would discard it.
modified_df_balti_teenagebirths = df_balti_teenagebirths.dropna(
    subset=["Teenage Birth Rate"]
)

In [90]:
# Verify that no missing values remain in the cleaned Baltimore frame
modified_df_balti_teenagebirths.isna().sum(axis=0)

tract                 0
Neighborhood          0
City                  0
State                 0
Teenage Birth Rate    0
Income                0
dtype: int64

In [91]:
# Drop New York City rows that have no teenage birth rate.
# Restricting dropna to that column keeps a row whose rate is valid even if
# some other column were missing; a bare dropna() would discard it.
modified_df_nyc_teenagebirths = df_nyc_teenagebirths.dropna(
    subset=["Teenage Birth Rate"]
)

In [92]:
# Verify that no missing values remain in the cleaned New York City frame
modified_df_nyc_teenagebirths.isna().sum(axis=0)

tract                 0
Neighborhood          0
City                  0
State                 0
Teenage Birth Rate    0
Income                0
dtype: int64

# Pivot Tables with Teenage Birth Rates per Neighborhood by Income Level 

In [93]:
# Pivot table of Baltimore teenage birth rates: one row per neighborhood,
# one column per income level, each cell the mean rate over that
# neighborhood's rows.
# aggfunc is the string "mean" rather than np.mean: recent pandas versions
# emit a FutureWarning when the NumPy callable is passed, and the string
# selects the same built-in mean.
pvmodified_df_balti_teenagebirths = pd.pivot_table(
    modified_df_balti_teenagebirths,
    index="Neighborhood",
    columns="Income",
    values="Teenage Birth Rate",
    aggfunc="mean",
).reset_index()  # turn "Neighborhood" back into a regular column

In [97]:
# First fifteen rows of the Baltimore neighborhood pivot table
pvmodified_df_balti_teenagebirths.head(n=15)

Income,Neighborhood,All,High,Low,Middle
0,Allendale,0.4737,0.4526,0.4777,0.4633
1,Arcadia,0.255,0.0823,0.4525,0.2404
2,Arlington,0.5046,0.3024,0.532,0.4004
3,Armistead Gardens,0.2884,0.1756,0.3273,0.2403
4,Baltimore Highlands,0.38715,0.1254,0.44335,0.26115
5,Barclay,0.6273,0.3865,0.6167,0.4848
6,Beechfield,0.4142,0.1902,0.526,0.3336
7,Belair - Edison,0.44495,0.289725,0.516375,0.3865
8,Bentalou-Smallwood,0.5598,0.3063,0.557,0.4134
9,Berea,0.5401,0.38205,0.5744,0.46415


In [95]:
# Pivot table of New York City teenage birth rates: one row per
# neighborhood, one column per income level, each cell the mean rate over
# that neighborhood's rows.
# aggfunc is the string "mean" rather than np.mean: recent pandas versions
# emit a FutureWarning when the NumPy callable is passed, and the string
# selects the same built-in mean.
pvmodified_df_nyc_teenagebirths = pd.pivot_table(
    modified_df_nyc_teenagebirths,
    index="Neighborhood",
    columns="Income",
    values="Teenage Birth Rate",
    aggfunc="mean",
).reset_index()  # turn "Neighborhood" back into a regular column

In [98]:
# First fifteen rows of the New York City neighborhood pivot table
pvmodified_df_nyc_teenagebirths.head(n=15)

Income,Neighborhood,All,High,Low,Middle
0,Alphabet City,0.208033,0.22105,0.206083,0.21465
1,Bloomingdale,0.10145,0.07085,0.1599,0.10885
2,Bowery,0.1158,0.09115,0.1081,0.0984
3,Chinatown,0.016667,0.009767,0.028633,0.013367
4,East Harlem,0.314089,0.220367,0.339106,0.271061
5,East Village,0.125,0.0902,0.1277,0.1062
6,Fort George,0.209675,0.121713,0.22825,0.167188
7,Gramercy Park,0.041133,0.041,0.073467,0.054867
8,Harlem,0.330193,0.215153,0.34434,0.270313
9,Hell's Kitchen,0.144429,0.0905,0.190586,0.1249


In [124]:
# Order Baltimore neighborhoods from lowest to highest "All"-income rate
pvmodified_df_balti_teenagebirths = pvmodified_df_balti_teenagebirths.sort_values(
    by="All", ascending=True
)

In [121]:
# Ten Baltimore neighborhoods with the lowest "All"-income rates
pvmodified_df_balti_teenagebirths.head(n=10)

Income,Neighborhood,All,High,Low,Middle,avg_teenage_birth_rate_all
84,Mount Washington,0.0098,0.0298,0.0699,0.0469,0.0098
109,Roland Park,0.0108,0.0121,0.0828,0.0423,0.0108
115,Tuscany - Canterbury,0.0166,0.022,0.108,0.0587,0.0166
39,Evergreen,0.0179,0.0367,0.1474,0.084,0.0179
58,Homeland,0.0251,0.0503,0.158,0.0963,0.0251
22,Cheswolde,0.0437,0.0258,0.1454,0.0769,0.0437
27,Cross Country,0.0585,0.0287,0.1498,0.0804,0.0585
75,Mid-Charles,0.0933,0.1786,0.6247,0.3691,0.0933
101,Radnor - Winston,0.1457,0.0328,0.3257,0.1578,0.1457
87,North Harford Road,0.14645,0.1102,0.21555,0.15515,0.14645


In [136]:
# Order New York City neighborhoods from lowest to highest "All"-income rate
pvmodified_df_nyc_teenagebirths = pvmodified_df_nyc_teenagebirths.sort_values(
    by="All", ascending=True
)

# Ten New York City neighborhoods with the lowest "All"-income rates
pvmodified_df_nyc_teenagebirths.head(n=10)

Income,Neighborhood,All,High,Low,Middle
3,Chinatown,0.016667,0.009767,0.028633,0.013367
24,Yorkville,0.022,0.0263,0.0581,0.0399
13,Lenox Hill,0.023783,0.020542,0.049133,0.031583
18,Midtown East,0.0247,0.0165,0.0259,0.0205
20,Upper East Side,0.032378,0.032739,0.081606,0.052889
12,Kips Bay,0.0329,0.024875,0.067175,0.0415
7,Gramercy Park,0.041133,0.041,0.073467,0.054867
15,Lower East Side,0.0621,0.0835,0.08695,0.08495
17,Midtown,0.06254,0.040019,0.111703,0.066998
14,Lincoln Square,0.066514,0.057957,0.086071,0.067114


# Create Baltimore and New York City Data Visualizations

In [146]:
# Bar chart of Baltimore teenage birth rates for the "All" income group,
# one bar per neighborhood, in the row order of the pivot table.
# The pivot column is literally named "All", so `labels` relabels the
# y-axis to say what is being measured.
bar_baltiallteenagebirthrate = px.bar(
    pvmodified_df_balti_teenagebirths,
    x="Neighborhood",
    y="All",
    title="Teenage Birth Rates in Baltimore City by All Income",
    labels={"All": "Teenage Birth Rate (All Income)"},
)

In [147]:
# Display the Baltimore all-income bar chart: a bare expression on the
# last line of a cell is rendered as the cell's output.
bar_baltiallteenagebirthrate

In [148]:
# Bar chart of New York City teenage birth rates for the "All" income
# group, one bar per neighborhood, in the row order of the pivot table.
# The pivot column is literally named "All", so `labels` relabels the
# y-axis to say what is being measured.
bar_nycallteenagebirthrate = px.bar(
    pvmodified_df_nyc_teenagebirths,
    x="Neighborhood",
    y="All",
    title="Teenage Birth Rates in New York City by All Income",
    labels={"All": "Teenage Birth Rate (All Income)"},
)

In [149]:
# Display the New York City all-income bar chart: a bare expression on the
# last line of a cell is rendered as the cell's output.
bar_nycallteenagebirthrate

In [151]:
# Bar chart of Baltimore teenage birth rates for the "Low" income group,
# one bar per neighborhood, in the row order of the pivot table.
# The pivot column is literally named "Low", so `labels` relabels the
# y-axis to say what is being measured.
bar_baltilowteenagebirthrate = px.bar(
    pvmodified_df_balti_teenagebirths,
    x="Neighborhood",
    y="Low",
    title="Teenage Birth Rates in Baltimore City by Low Income",
    labels={"Low": "Teenage Birth Rate (Low Income)"},
)

In [152]:
# Display the Baltimore low-income bar chart: a bare expression on the
# last line of a cell is rendered as the cell's output.
bar_baltilowteenagebirthrate

In [153]:
# Bar chart of New York City teenage birth rates for the "Low" income
# group, one bar per neighborhood, in the row order of the pivot table.
# The pivot column is literally named "Low", so `labels` relabels the
# y-axis to say what is being measured.
bar_nyclowteenagebirthrate = px.bar(
    pvmodified_df_nyc_teenagebirths,
    x="Neighborhood",
    y="Low",
    title="Teenage Birth Rates in New York City by Low Income",
    labels={"Low": "Teenage Birth Rate (Low Income)"},
)

In [154]:
# Display the New York City low-income bar chart: a bare expression on the
# last line of a cell is rendered as the cell's output.
bar_nyclowteenagebirthrate

# Pivot Tables by City

In [156]:
# City-level pivot table for New York City: a row per distinct "City"
# value, one column per income level, each cell the mean teenage birth
# rate over all rows in that group.
# aggfunc is the string "mean" rather than np.mean: recent pandas versions
# emit a FutureWarning when the NumPy callable is passed, and the string
# selects the same built-in mean.
citypvmodified_df_nyc_teenagebirths = pd.pivot_table(
    modified_df_nyc_teenagebirths,
    index="City",
    columns="Income",
    values="Teenage Birth Rate",
    aggfunc="mean",
).reset_index()  # turn "City" back into a regular column

In [158]:
# City-level summary for New York City
citypvmodified_df_nyc_teenagebirths.head(n=5)

Income,City,All,High,Low,Middle
0,New York,0.147025,0.105773,0.183685,0.137729


In [159]:
# City-level pivot table for Baltimore: a row per distinct "City" value,
# one column per income level, each cell the mean teenage birth rate over
# all rows in that group.
# aggfunc is the string "mean" rather than np.mean: recent pandas versions
# emit a FutureWarning when the NumPy callable is passed, and the string
# selects the same built-in mean.
citypvmodified_df_balti_teenagebirths = pd.pivot_table(
    modified_df_balti_teenagebirths,
    index="City",
    columns="Income",
    values="Teenage Birth Rate",
    aggfunc="mean",
).reset_index()  # turn "City" back into a regular column

In [160]:
# City-level summary for Baltimore
citypvmodified_df_balti_teenagebirths.head(n=5)

Income,City,All,High,Low,Middle
0,Baltimore,0.397786,0.259241,0.451793,0.342266


# Merge Condensed Datasets to Create Bar Graph

In [168]:
# Stack the two city-level summaries into one frame. This is a row-wise
# pd.concat, not a merge/join.
# ignore_index=True renumbers the rows from 0; without it each input keeps
# its own index label 0, so a label lookup such as .loc[0] would return
# both cities.
df_combinedcities = pd.concat(
    [citypvmodified_df_balti_teenagebirths, citypvmodified_df_nyc_teenagebirths],
    axis=0,
    ignore_index=True,
)

In [169]:
# Show the combined city-level table
df_combinedcities.head(n=5)

Income,City,All,High,Low,Middle
0,Baltimore,0.397786,0.259241,0.451793,0.342266
0,New York,0.147025,0.105773,0.183685,0.137729


In [180]:
# Bar chart comparing the city-level mean teenage birth rate for the "All"
# income group, one bar per city.
# The column is literally named "All", so `labels` relabels the y-axis to
# say what is being measured. The redundant parentheses around the column
# name were dropped; ("All") is just the string "All".
bar_combinedcities = px.bar(
    df_combinedcities,
    x="City",
    y="All",
    title="Teenage Birth Rates in Baltimore City vs. New York City for All Income",
    labels={"All": "Teenage Birth Rate (All Income)"},
)

In [179]:
# Display the all-income city comparison chart: a bare expression on the
# last line of a cell is rendered as the cell's output.
bar_combinedcities

In [183]:
# Bar chart comparing the city-level mean teenage birth rate for the "Low"
# income group, one bar per city.
# The column is literally named "Low", so `labels` relabels the y-axis to
# say what is being measured. The redundant parentheses around the column
# name were dropped; ("Low") is just the string "Low".
bar_lowcombinedcities = px.bar(
    df_combinedcities,
    x="City",
    y="Low",
    title="Teenage Birth Rates in Baltimore City vs. New York City for Low Income",
    labels={"Low": "Teenage Birth Rate (Low Income)"},
)

In [184]:
# Display the low-income city comparison chart: a bare expression on the
# last line of a cell is rendered as the cell's output.
bar_lowcombinedcities