# 6.3 Geographic Visualization 

### This script contains the following:
#### 1. Import data and libraries
#### 2. Data wrangling
#### 3. Conduct consistency checks
#### 4. Plotting a choropleth

### 1. Import data and libraries

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json

In [17]:
# This command prompts matplotlib visuals to appear in the notebook

%matplotlib inline

In [18]:
# Path to the ".json" file for the U.S.; it is passed to folium as the
# choropleth geometry later in the notebook.

country_geo = r'C:/Users/asus/Documents/Work/Projects/Career Foundry/Data Analysis Immersion - Achievement 6 /us-states.json'

In [None]:
# Base project folder; the recipes CSV is read from a sub-folder of it
path = 'C:/Users/asus/Documents/Career Foundry/Achievement 6'

In [None]:
# Load the recipes dataset from the project's data folder

recipes_csv = os.path.join(path, '6.3 Images & Assets', 'Data', 'recipes.csv')
df = pd.read_csv(recipes_csv)

In [None]:
# Preview the first rows of the recipes data
df.head()

In [None]:
# (rows, columns) of the full recipes data
df.shape

### 2. Data wrangling 

##### Fix dummy columns

In [None]:
# Columns kept for the analysis: the per-state dummy columns first, followed
# by the recipe attributes. The combined list is stored in `columns`.

state_columns = [
    "alabama", "alaska", "arizona", "california", "colorado",
    "connecticut", "florida", "georgia", "hawaii", "idaho",
    "illinois", "indiana", "iowa", "kansas", "kentucky",
    "louisiana", "maine", "maryland", "massachusetts", "michigan",
    "minnesota", "mississippi", "missouri", "nebraska", "new hampshire",
    "new jersey", "new mexico", "new york", "north carolina", "ohio",
    "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina",
    "tennessee", "texas", "utah", "vermont", "virginia",
    "washington", "west virginia", "wisconsin",
]

recipe_columns = ["title", "rating", "calories", "protein", "fat", "sodium"]

columns = state_columns + recipe_columns

In [None]:
# Create a subset holding only the selected columns.
# .copy() detaches the subset from df, so adding or changing columns on
# state_rec later does not raise SettingWithCopyWarning.

state_rec = df[columns].copy()

In [None]:
# Preview the subset
state_rec.head()

In [None]:
# Build a second subset containing only the state dummy columns: every entry
# of `columns` except the six recipe attributes, in the same order.

non_state_columns = ("title", "rating", "calories", "protein", "fat", "sodium")
state_only = [name for name in columns if name not in non_state_columns]
states_num = state_rec[state_only]

In [None]:
# Turn the dummy data from the state columns into a single categorical
# variable: for each recipe, take the label of the column holding the row
# maximum. The result, s2, is a pandas Series (a one-dimensional structure
# that can be added to a dataframe as a column).
#
# idxmax returns the FIRST column label when several columns tie for the
# maximum, so a recipe with no state flagged (an all-zero row) would be
# labelled "alabama". Rows without any flagged state are therefore set to NaN.

s2 = states_num.idxmax(axis=1).where(states_num.any(axis=1))

In [None]:
# Display the resulting Series
s2

In [None]:
# Confirm the object type of s2
type(s2)

In [None]:
# Add s2 to state_rec as a new column named 'STATE_NAME'.
# assign() returns a new dataframe, which avoids writing into a selection of
# df (and the SettingWithCopyWarning that comes with it).

state_rec = state_rec.assign(STATE_NAME=s2)

In [None]:
# List the columns to check that 'STATE_NAME' is now present
state_rec.columns

In [None]:
# Drop the state dummy columns from the dataframe.
# The column labels are taken from states_num (the state-only subset created
# earlier) rather than repeating the full list of state names a third time,
# and the result is reassigned instead of dropping in place on a selection.

state_rec = state_rec.drop(columns=states_num.columns)

In [None]:
# List the columns that remain after the drop
state_rec.columns

In [None]:
# Preview the dataframe after the drop
state_rec.head()

In [None]:
# Check the object type of state_rec
type(state_rec)

In [None]:
# Title-case the state names (e.g. "new york" -> "New York").
# NOTE(review): presumably needed so the values match the state names in the
# GeoJSON used for the choropleth - confirm against the json file.
title_cased_names = state_rec['STATE_NAME'].str.title()
state_rec['STATE_NAME'] = title_cased_names

In [None]:
# Preview the dataframe with the title-cased state names
state_rec.head()

### 3. Conduct consistency checks

In [None]:
# Count the missing values in each column

missing_mask = state_rec.isna()
missing_mask.sum()

In [None]:
# Impute missing values in the nutrition columns with each column's median.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and leaves state_rec unchanged once Copy-on-Write is enabled.

nutrition_cols = ['calories', 'protein', 'fat', 'sodium']
state_rec[nutrition_cols] = state_rec[nutrition_cols].fillna(
    state_rec[nutrition_cols].median()
)

In [None]:
# Re-count the missing values per column after the imputation

remaining_missing = state_rec.isna()
remaining_missing.sum()

##### Duplicates check

In [None]:
# Flag each row that repeats an earlier row across all columns
# (boolean Series, one entry per row of state_rec)
dups = state_rec.duplicated(subset=None, keep='first')

In [None]:
# Count the rows flagged as duplicates (0 means there are none).
# dups.shape only gives the length of the boolean Series - one entry per
# row - and says nothing about how many of those entries are True.
dups.sum()

##### Extreme values checks

In [None]:
# Display the recipes whose 'fat' value is above 1000

very_high_fat = state_rec['fat'].gt(1000)
state_rec.loc[very_high_fat]

In [None]:
# Clean extreme values: keep only recipes with less than 500 g of fat.
# 500 was picked because a meal with more than 500 g of fat is beyond belief.
# .copy() makes the filtered dataframe independent, so adding columns to it
# afterwards does not raise SettingWithCopyWarning.

state_rec = state_rec[state_rec['fat'] < 500].copy()

In [None]:
# (rows, columns) after removing the high-fat rows
state_rec.shape

In [None]:
# Add a 'cal_per_portion' column, computed as four times the 'calories' value

state_rec['cal_per_portion'] = state_rec['calories'].mul(4)

In [None]:
# Histogram of 'cal_per_portion' with a KDE overlay; used to spot extreme values
sns.histplot(data=state_rec, x='cal_per_portion', bins=20, kde=True)

In [None]:
# Remove the extreme values: keep rows whose 'cal_per_portion' is below 8000

portion_in_range = state_rec['cal_per_portion'].lt(8000)
state_rec = state_rec[portion_in_range]

In [None]:
# Inspect the distribution of the 'rating' variable

sns.histplot(data=state_rec, x='rating', bins=20, kde=True)

In [None]:
# Keep only the entries rated 1 or higher; this removes the zero-rated
# recipes along with any other rating below 1

rated_at_least_one = state_rec['rating'].ge(1)
state_rec = state_rec[rated_at_least_one]

In [None]:
# Inspect the data type of each column
state_rec.dtypes

### 4. Plotting a choropleth

In [None]:
# Create a data frame with one row per state and the value to plot for it.
# A choropleth needs a single value per state key; passing one row per
# recipe would leave the map showing just one recipe's rating for each
# state. The recipes are therefore aggregated to the mean rating per state
# (rows with a missing STATE_NAME are left out by groupby).

data_to_plot = state_rec.groupby('STATE_NAME', as_index=False)['rating'].mean()
data_to_plot.head()

In [None]:
# Set up a folium map centred on the United States.
# The centre is given as [latitude, longitude]; the earlier value used a
# latitude of 100, which lies outside the valid -90..90 range, together with
# a world-level zoom even though only U.S. states are drawn.
# NOTE: the name `map` shadows the Python builtin; it is kept because the
# following cell calls map.save().
map = folium.Map(location=[39.8, -98.6], zoom_start=4)

# Choropleth maps bind pandas data frames to json geometries, which lets us
# colour each state by the value in the 'rating' column.
folium.Choropleth(
    geo_data=country_geo,
    data=data_to_plot,
    columns=['STATE_NAME', 'rating'],
    # key_on is the path, inside each feature of the json file, to the value
    # that is matched against the first entry of `columns` - check the json
    # file to see where that key is located.
    key_on='feature.properties.name',
    fill_color='YlOrBr',
    fill_opacity=0.6,
    line_opacity=0.1,
    legend_name="rating",
).add_to(map)
folium.LayerControl().add_to(map)

map

In [None]:
# Write the map to an HTML file; the relative path resolves against the
# current working directory
map.save('plot_data.html')