# 6.3 Geographical Visualization

This script contains the following:
1. Import data and libraries
2. Data wrangling
3. Data cleaning
4. Plotting a choropleth

1 - Importing Databases and Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json

In [2]:
# This command propts matplotlib visuals to appear in the notebook 

%matplotlib inline

In [3]:
# Import ".json" file for the U.S. 

country_geo = r'C:\Users\kenwh\Achievement 6\Data\Original Data\us_states.json'

In [4]:
# Specify the path to your JSON file

f = open(r'C:\Users\kenwh\Achievement 6\Data\Original Data\us_states.json')
  
# returns JSON object asa dictionary
data = json.load(f)
  
# Iterating through the json list
for i in data['features']:
    print(i)


{'type': 'Feature', 'id': 'AL', 'properties': {'name': 'Alabama'}, 'geometry': {'type': 'Polygon', 'coordinates': [[[-87.359296, 35.00118], [-85.606675, 34.984749], [-85.431413, 34.124869], [-85.184951, 32.859696], [-85.069935, 32.580372], [-84.960397, 32.421541], [-85.004212, 32.322956], [-84.889196, 32.262709], [-85.058981, 32.13674], [-85.053504, 32.01077], [-85.141136, 31.840985], [-85.042551, 31.539753], [-85.113751, 31.27686], [-85.004212, 31.003013], [-85.497137, 30.997536], [-87.600282, 30.997536], [-87.633143, 30.86609], [-87.408589, 30.674397], [-87.446927, 30.510088], [-87.37025, 30.427934], [-87.518128, 30.280057], [-87.655051, 30.247195], [-87.90699, 30.411504], [-87.934375, 30.657966], [-88.011052, 30.685351], [-88.10416, 30.499135], [-88.137022, 30.318396], [-88.394438, 30.367688], [-88.471115, 31.895754], [-88.241084, 33.796253], [-88.098683, 34.891641], [-88.202745, 34.995703], [-87.359296, 35.00118]]]}}
{'type': 'Feature', 'id': 'AK', 'properties': {'name': 'Alaska'},

In [5]:
path = "C:/Users/kenwh/Achievement 6"

In [6]:
df_cleaned = pd.read_csv(os.path.join(path, 'Data', 'Prepared Data', 'cleaned_data.cvc'))

In [7]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Body Mass Index,Number of Children,Smoker,Region of USA,Expenses
0,0,19,female,27.9,0,yes,southwest,16884.924
1,1,18,male,33.77,1,no,southeast,1725.5523
2,2,28,male,33.0,3,no,southeast,4449.462
3,3,33,male,22.705,0,no,northwest,21984.47061
4,4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
df_cleaned.shape

(1337, 8)

2: Data Wrangling

In [9]:
 # dictionary with regions and states
states_region = {
    'southeast': ['Alabama', 'Arkansas', 'Flordia', 'Gerogia', 'Kentucky', 'Mississippi', 
                  'North Carolina', 'Tennessee', 'Virginia', 'West Virginia', 'South Carolina'],
    'southwest': ['Arizona', 'California', 'Colorado', 'Hawaii', 'Kansas', 
                  'Louisiana', 'Missouri', 'Nevada', 'New Mexico', 'Texas', 'Utah', 'Oklahoma'],
    'northeast': ['Conneticut', 'Delaware', 'Illonois', 'Indiana', 'Maine', 'Maryland', 'Massachusettes', 
                  'Michigan', 'New Hampshire', 'New Jersey', 'New York', 'Vermont', 'Ohio', 'Pennslyvania', 'Rhode Island' ],
    'northwest': ['Alaska', 'Idaho', 'Iowa', 'Minnesota', 'Montana', 'Nebraska', 'North Dakota', 
                  'Washington', 'Wisconsin', 'Wyoming', 'Oregon', 'South Carolina']
}

# Convert dictionary to DataFrame
df = pd.DataFrame([(state, region) for region, states in data.items() for state in states], columns=['State', 'Region'])

# Display the DataFrame
print(df)

                                                State    Region
0                                                   F      type
1                                                   e      type
2                                                   a      type
3                                                   t      type
4                                                   u      type
..                                                ...       ...
62  {'type': 'Feature', 'id': 'VA', 'properties': ...  features
63  {'type': 'Feature', 'id': 'WA', 'properties': ...  features
64  {'type': 'Feature', 'id': 'WV', 'properties': ...  features
65  {'type': 'Feature', 'id': 'WI', 'properties': ...  features
66  {'type': 'Feature', 'id': 'WY', 'properties': ...  features

[67 rows x 2 columns]


In [10]:
df.shape

(67, 2)

In [11]:
# Function to assign random state within region
import random

# Function to assign random state within region
def assign_random_state(region):
    states = df.get(region)
    if states:
        return random.choice(states)
    else:
        return None

# Add new column with random states
df_cleaned['Random_State'] = df_cleaned['Region of USA'].apply(assign_random_state)

print(df_cleaned)

      Unnamed: 0  Age  Gender  Body Mass Index  Number of Children Smoker  \
0              0   19  female           27.900                   0    yes   
1              1   18    male           33.770                   1     no   
2              2   28    male           33.000                   3     no   
3              3   33    male           22.705                   0     no   
4              4   32    male           28.880                   0     no   
...          ...  ...     ...              ...                 ...    ...   
1332        1332   50    male           30.970                   3     no   
1333        1333   18  female           31.920                   0     no   
1334        1334   18  female           36.850                   0     no   
1335        1335   21  female           25.800                   0     no   
1336        1336   61  female           29.070                   0    yes   

     Region of USA     Expenses Random_State  
0        southwest  16884.92

In [12]:

# Add new column with random states
df_cleaned['Random_State'] = df_cleaned['Region of USA'].apply(assign_random_state)

# Replace "None" with random state from the same region
for index, row in df_cleaned.iterrows():
    if pd.isnull(row['Random_State']):
        df_cleaned.at[index, 'Random_State'] = assign_random_state(row['Region of USA'])

print(df_cleaned)
        

      Unnamed: 0  Age  Gender  Body Mass Index  Number of Children Smoker  \
0              0   19  female           27.900                   0    yes   
1              1   18    male           33.770                   1     no   
2              2   28    male           33.000                   3     no   
3              3   33    male           22.705                   0     no   
4              4   32    male           28.880                   0     no   
...          ...  ...     ...              ...                 ...    ...   
1332        1332   50    male           30.970                   3     no   
1333        1333   18  female           31.920                   0     no   
1334        1334   18  female           36.850                   0     no   
1335        1335   21  female           25.800                   0     no   
1336        1336   61  female           29.070                   0    yes   

     Region of USA     Expenses Random_State  
0        southwest  16884.92

In [13]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Body Mass Index,Number of Children,Smoker,Region of USA,Expenses,Random_State
0,0,19,female,27.9,0,yes,southwest,16884.924,
1,1,18,male,33.77,1,no,southeast,1725.5523,
2,2,28,male,33.0,3,no,southeast,4449.462,
3,3,33,male,22.705,0,no,northwest,21984.47061,
4,4,32,male,28.88,0,no,northwest,3866.8552,


In [14]:
# Step 1: Group the DataFrame by the 'region' column
grouped = df_cleaned.groupby('Region of USA')

# Step 2: Initialize empty DataFrames for each region
northeast_df = pd.DataFrame()
northwest_df = pd.DataFrame()
southeast_df = pd.DataFrame()
southwest_df = pd.DataFrame()

# Step 3: Iterate through the groups and assign them to respective DataFrames
for region, data in grouped:
    if region == 'northeast':
        northeast_df = pd.concat([northeast_df, data])
    elif region == 'northwest':
        northwest_df = pd.concat([northwest_df, data])
    elif region == 'southeast':
        southeast3_df = pd.concat([southeast_df, data])
    elif region == 'southwest':
        southwest_df = pd.concat([southwest_df, data])

In [15]:
for region, data in grouped:
    print(f"Region: {region}")
    print(data)  # Display the data for the current region
    print("\n")

Region: northeast
      Unnamed: 0  Age  Gender  Body Mass Index  Number of Children Smoker  \
8              8   37    male           29.830                   2     no   
10            10   25    male           26.220                   0     no   
16            16   52  female           30.780                   1     no   
17            17   23    male           23.845                   0     no   
20            20   60  female           36.005                   0     no   
...          ...  ...     ...              ...                 ...    ...   
1320        1320   62    male           26.695                   0    yes   
1324        1324   61    male           33.535                   0     no   
1325        1325   42  female           32.870                   0     no   
1327        1327   23  female           24.225                   2     no   
1333        1333   18  female           31.920                   0     no   

     Region of USA     Expenses Random_State  
8        n

In [16]:
import itertools

# Define the list of states for each region
states_by_region = {
    'northeast': ['Connecticut', 'Delaware', 'Illinois', 'Indiana', 'Maine', 'Maryland', 'Massachusetts', 
                  'Michigan', 'New Hampshire', 'New Jersey', 'New York', 'Vermont', 'Ohio', 'Pennsylvania', 'Rhode Island'],  # Define states for region1
    'northwest': ['Alaska', 'Idaho', 'Iowa', 'Minnesota', 'Montana', 'Nebraska', 'North Dakota', 
                  'Washington', 'Wisconsin', 'Wyoming', 'Oregon'],  # Define states for region2
    'southeast': ['Alabama', 'Arkansas', 'Florida', 'Georgia', 'Kentucky', 'Mississippi', 
                  'North Carolina', 'Tennessee', 'Virginia', 'West Virginia', 'South Carolina'],  # Define states for region3
    'southwest': ['Arizona', 'California', 'Colorado', 'Hawaii', 'Kansas', 
                  'Louisiana', 'Missouri', 'Nevada', 'New Mexico', 'Texas', 'Utah', 'Oklahoma']}  # Define states for region4

# Iterate over each group and assign cyclical values based on the region
for region, data in grouped:
    # Check if the region exists in the dictionary
    if region in states_by_region:
        # Get the list of states for the current region
        states = states_by_region[region]
        
        # Get an iterator that cycles through the states indefinitely
        state_cycle = itertools.cycle(states)
        
        # Replace 'None' values in the 'Random_State' column with cyclically assigned states for the current region
        data['Random_State'] = data['Random_State'].apply(lambda x: next(state_cycle) if x == 'None' else x)
        
        # Update the modified data back to the original DataFrame
        df_cleaned.loc[data.index, 'Random_State'] = data['Random_State']

In [17]:
for region, data in grouped:
    print(f"Region: {region}")
    print(data)  # Display the data for the current region
    print("\n")

Region: northeast
      Unnamed: 0  Age  Gender  Body Mass Index  Number of Children Smoker  \
8              8   37    male           29.830                   2     no   
10            10   25    male           26.220                   0     no   
16            16   52  female           30.780                   1     no   
17            17   23    male           23.845                   0     no   
20            20   60  female           36.005                   0     no   
...          ...  ...     ...              ...                 ...    ...   
1320        1320   62    male           26.695                   0    yes   
1324        1324   61    male           33.535                   0     no   
1325        1325   42  female           32.870                   0     no   
1327        1327   23  female           24.225                   2     no   
1333        1333   18  female           31.920                   0     no   

     Region of USA     Expenses Random_State  
8        n

In [18]:
# Iterate over each group and assign cyclical values based on the region
for region, data in grouped:
    print(f"Processing region: {region}")
    # Check if the region exists in the dictionary
    if region in states_by_region:
        # Get the list of states for the current region
        states = states_by_region[region]
        print(f"States for {region}: {states}")
        
        # Get an iterator that cycles through the states indefinitely
        state_cycle = itertools.cycle(states)
        
        # Replace 'None' values in the 'Random_State' column with cyclically assigned states for the current region
        data['Random_State'] = data['Random_State'].apply(lambda x: next(state_cycle) if str(x).strip().lower() == 'none' else x)
        
        print(data[['Random_State']])  # Print the 'Random_State' column after applying the lambda function
        
        # Update the modified data back to the original DataFrame using .loc
        df_cleaned.loc[data.index, 'Random_State'] = data['Random_State']

Processing region: northeast
States for northeast: ['Connecticut', 'Delaware', 'Illinois', 'Indiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'New Hampshire', 'New Jersey', 'New York', 'Vermont', 'Ohio', 'Pennsylvania', 'Rhode Island']
       Random_State
8       Connecticut
10         Delaware
16         Illinois
17          Indiana
20            Maine
...             ...
1320          Maine
1324       Maryland
1325  Massachusetts
1327       Michigan
1333  New Hampshire

[324 rows x 1 columns]
Processing region: northwest
States for northwest: ['Alaska', 'Idaho', 'Iowa', 'Minnesota', 'Montana', 'Nebraska', 'North Dakota', 'Washington', 'Wisconsin', 'Wyoming', 'Oregon']
     Random_State
3          Alaska
4           Idaho
7            Iowa
9       Minnesota
24        Montana
...           ...
1318       Alaska
1319        Idaho
1323         Iowa
1332    Minnesota
1336      Montana

[324 rows x 1 columns]
Processing region: southeast
States for southeast: ['Alabama', 'Arkansa

# Consistancey Checks


In [19]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Body Mass Index,Number of Children,Smoker,Region of USA,Expenses,Random_State
0,0,19,female,27.9,0,yes,southwest,16884.924,Arizona
1,1,18,male,33.77,1,no,southeast,1725.5523,Alabama
2,2,28,male,33.0,3,no,southeast,4449.462,Arkansas
3,3,33,male,22.705,0,no,northwest,21984.47061,Alaska
4,4,32,male,28.88,0,no,northwest,3866.8552,Idaho


In [20]:
type(df_cleaned)

pandas.core.frame.DataFrame

In [21]:
# Missing Value Check
df_cleaned.isnull().sum()

Unnamed: 0            0
Age                   0
Gender                0
Body Mass Index       0
Number of Children    0
Smoker                0
Region of USA         0
Expenses              0
Random_State          0
dtype: int64

In [22]:
# Duplicate Check
dups = df_cleaned.duplicated()

In [23]:
dups.shape # no dupes

(1337,)

In [24]:
df_cleaned.dtypes

Unnamed: 0              int64
Age                     int64
Gender                 object
Body Mass Index       float64
Number of Children      int64
Smoker                 object
Region of USA          object
Expenses              float64
Random_State           object
dtype: object

In [25]:
# Create a data frame with just the states and the values for rating we want plotted

data_to_plot = df_cleaned[['Random_State','Expenses']]
data_to_plot.head()

Unnamed: 0,Random_State,Expenses
0,Arizona,16884.924
1,Alabama,1725.5523
2,Arkansas,4449.462
3,Alaska,21984.47061
4,Idaho,3866.8552


In [26]:
# Setup a folium map at a high-level zoom
map = folium.Map(location = [100, 0], zoom_start = 1.5)

# Choropleth maps bind Pandas Data Frames and json geometries.
folium.Choropleth(
    geo_data = country_geo, 
    data = data_to_plot,
    columns = ['Random_State', 'Expenses'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
    legend_name = "rating").add_to(map)
folium.LayerControl().add_to(map)

map



Explanation

Based on the map above, we can see that certian areas of the country have higher expenses then others. Most of the states in the deep southeast have low expenses, while many in the northwest have higher expenses.

# create another choropleth using BMI

In [27]:
# Create a data frame with just the states and the values for rating we want plotted

data_to_plot2 = df_cleaned[['Random_State','Body Mass Index']]
data_to_plot2.head()

Unnamed: 0,Random_State,Body Mass Index
0,Arizona,27.9
1,Alabama,33.77
2,Arkansas,33.0
3,Alaska,22.705
4,Idaho,28.88


In [28]:
# Setup a folium map at a high-level zoom
map = folium.Map(location = [100, 0], zoom_start = 1.5)

# Choropleth maps bind Pandas Data Frames and json geometries.
folium.Choropleth(
    geo_data = country_geo, 
    data = data_to_plot2,
    columns = ['Random_State', 'Body Mass Index'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
    legend_name = "rating").add_to(map)
folium.LayerControl().add_to(map)

map

Comparing this map to the expenses one, we can see that there is some correlation between expenses and BMI, especially in states like Virginia and South Dakota where both are high. However, this does not apply to all states/regions. For example despite most states in the southeast having high BMI, their expenses are comparatively low.

# Exporting New Data Frame

In [29]:
df_cleaned.to_csv(os.path.join(path,'Data','Prepared data','with_states_added.cvc'))

In [30]:
output_file = 'output_file.xlsx'
df_cleaned.to_excel(output_file, index=False)

In [31]:
print(f'Data has been successfully saved to {output_file}')

Data has been successfully saved to output_file.xlsx
