In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

### Exploratory Data Analysis of education

##### Preparation of the data

In [2]:
# Read the raw demographics export. The context manager closes the handle even
# if parsing fails, forward slashes keep the relative path valid on every OS,
# and the explicit encoding stops text decoding from depending on the
# platform's default codec.
with open('data/demographic/us-cities-demographics.json', encoding='utf-8') as f:
    json_file = json.load(f)

In [3]:
# Each record keeps its attributes under the 'fields' key; keep only that payload.
json_fields = [record['fields'] for record in json_file]

In [4]:
# Peek at the first few records only; echoing the whole list floods the output.
json_fields[:3]

[{'total_population': 281913,
  'female_population': 143873,
  'count': 76402,
  'foreign_born': 86253,
  'state_code': 'NJ',
  'average_household_size': 2.73,
  'city': 'Newark',
  'race': 'White',
  'male_population': 138040,
  'median_age': 34.6,
  'number_of_veterans': 5829,
  'state': 'New Jersey'},
 {'total_population': 118661,
  'female_population': 62432,
  'count': 1343,
  'foreign_born': 7517,
  'state_code': 'IL',
  'average_household_size': 2.4,
  'city': 'Peoria',
  'race': 'American Indian and Alaska Native',
  'male_population': 56229,
  'median_age': 33.1,
  'number_of_veterans': 6634,
  'state': 'Illinois'},
 {'total_population': 85032,
  'female_population': 43270,
  'count': 2583,
  'foreign_born': 3269,
  'state_code': 'MO',
  'average_household_size': 2.77,
  'city': "O'Fallon",
  'race': 'Hispanic or Latino',
  'male_population': 41762,
  'median_age': 36.0,
  'number_of_veterans': 5783,
  'state': 'Missouri'},
 {'total_population': 136454,
  'female_population': 

In [5]:
# json_fields is a list of per-record dicts, so use the DataFrame constructor,
# which documents that input shape (from_dict is specified for dict input).
df_demographic = pd.DataFrame(json_fields)

In [6]:
# Total the 'count' column for every (state, race) pair.
df_races = df_demographic.groupby(['state', 'race'])[['count']].sum()

In [7]:
# Share of each race within its state: divide every (state, race) total by the
# sum of all totals for that state.
df_races_percentage = df_races.div(df_races.groupby(level='state').transform('sum'))

In [8]:
# Pivot to one row per state and one column per race in a single chained
# expression. Building df_final in one step keeps the cell idempotent: as
# separate cells, re-running the droplevel step raised and re-running
# reset_index added a stray 'index' column.
df_final = (
    df_races_percentage['count']    # Series indexed by (state, race)
    .unstack('race')                # races become columns
    .rename_axis(None, axis=1)      # clear the leftover 'race' columns label
    .reset_index()                  # 'state' becomes a regular column
)

In [12]:
# Display the per-state race shares.
df_final

Unnamed: 0,state,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Alabama,0.007375,0.026245,0.47536,0.035864,0.455155
1,Alaska,0.108078,0.109524,0.068724,0.081079,0.632595
2,Arizona,0.022539,0.039824,0.051473,0.262066,0.624098
3,Arkansas,0.014576,0.034279,0.232456,0.120903,0.597786
4,California,0.012641,0.143093,0.064465,0.310403,0.469398
5,Colorado,0.017455,0.041479,0.057998,0.196182,0.686886
6,Connecticut,0.009696,0.04366,0.209504,0.280148,0.456992
7,Delaware,0.005516,0.015896,0.588717,0.0735,0.316371
8,District of Columbia,0.008437,0.048274,0.45255,0.097904,0.392835
9,Florida,0.005397,0.030577,0.190735,0.224136,0.549155


We now have the percentage of each race in each state. Next, we need the population of each state.

In [13]:
import requests

In [14]:
# Query the Census Bureau API for 2021 state populations. The timeout keeps a
# stalled connection from hanging the kernel, and raise_for_status() stops here
# on an HTTP error instead of failing later while decoding the body.
response = requests.get(
    url='https://api.census.gov/data/2021/pep/population?get=POP_2021,NAME&for=state:*',
    timeout=30,
)
response.raise_for_status()

In [15]:
# Decode the JSON body of the response into Python objects.
json_population = response.json()

In [16]:
# The first element is used as the header row; everything after it is data.
json_population_columns, json_population_values = json_population[0], json_population[1:]

In [17]:
# Convert the data rows into a NumPy array.
np_population = np.asarray(json_population_values)

In [18]:
# Label the array's columns with the header row.
df_population = pd.DataFrame(data=np_population, columns=json_population_columns)

In [19]:
# Keep the two columns requested from the API by name instead of dropping
# 'state' in place. The in-place drop is not re-runnable: once the next cell has
# renamed NAME to 'state', running it again silently deletes the state names.
# Selecting by name leaves the same two columns and fails loudly in that case.
df_population = df_population[['POP_2021', 'NAME']]

In [20]:
df_population.columns = ['population', 'state']
# Every value reaches this frame as text (the rows went through a single NumPy
# array together with the state names), so cast population to integers before
# it is used in arithmetic.
df_population = df_population.astype({'population': 'int64'})

In [21]:
# Display the per-state population table.
df_population

Unnamed: 0,population,state
0,3986639,Oklahoma
1,1963692,Nebraska
2,1441553,Hawaii
3,895376,South Dakota
4,6975218,Tennessee
5,3143991,Nevada
6,2115877,New Mexico
7,3193079,Iowa
8,2934582,Kansas
9,670050,District of Columbia


In [28]:
# Outer join on the state name so states present in only one table are kept.
df_joined = pd.merge(left=df_final, right=df_population, how='outer', on='state')

In [29]:
# Display the merged table of race shares and population per state.
df_joined

Unnamed: 0,state,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White,population
0,Alabama,0.007375,0.026245,0.47536,0.035864,0.455155,5039877
1,Alaska,0.108078,0.109524,0.068724,0.081079,0.632595,732673
2,Arizona,0.022539,0.039824,0.051473,0.262066,0.624098,7276316
3,Arkansas,0.014576,0.034279,0.232456,0.120903,0.597786,3025891
4,California,0.012641,0.143093,0.064465,0.310403,0.469398,39237836
5,Colorado,0.017455,0.041479,0.057998,0.196182,0.686886,5812069
6,Connecticut,0.009696,0.04366,0.209504,0.280148,0.456992,3605597
7,Delaware,0.005516,0.015896,0.588717,0.0735,0.316371,1003384
8,District of Columbia,0.008437,0.048274,0.45255,0.097904,0.392835,670050
9,Florida,0.005397,0.030577,0.190735,0.224136,0.549155,21781128
