In [339]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import requests
from shapely.geometry import Point
# Boundary polygons, used further down to label program locations through a spatial join.
# NOTE(review): set_crs only tags the frame as EPSG:4326, it does not reproject —
# confirm the shapefile coordinates really are lon/lat degrees.
shp_file = gpd.read_file('datasets/geo_export_26bce2f2-c163-42a9-9329-9ca6e082c5e9.shp')
shp_file.set_crs(epsg=4326, inplace=True)

# School progress reports, one CSV per report year (per the variable names).
# NOTE(review): Socrata resource endpoints page their results (documented default is
# 1,000 rows when no $limit is passed) — confirm none of these frames is truncated.
school_progress_21_22 = pd.read_csv("https://data.cityofchicago.org/resource/ngix-dc87.csv")
school_progress_22_23 = pd.read_csv("https://data.cityofchicago.org/resource/d7as-muwj.csv")
school_progress_23_24 = pd.read_csv("https://data.cityofchicago.org/resource/2dn2-x66j.csv")

# Crime records: bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of turning an error payload into a DataFrame.
crime_response = requests.get("https://data.cityofchicago.org/resource/9hwr-2zxp.json", timeout=60)
crime_response.raise_for_status()
crime = pd.DataFrame(crime_response.json())

# Program listings read from the local CSV export.
data = pd.read_csv("datasets/My_CHI._My_Future._Programs.csv")


## Basic EDA
### 1. Is there a correlation between crime incidents and the availability of programs in a district, and, more importantly, does a crime incident continue to affect program availability for a considerable time afterward?
### 2. Are there outlier districts that, despite high crime, show high student attainment, high graduation rates, and a large number of programs? How big a role does crime play in access to programs and in educational attainment?


## Data preparation and cleaning

### Missing value handling

In [340]:
import geopandas as gpd
from shapely.geometry import Point
# Missing-value handling for the program listings.
# A row counts as locatable when at least one of Latitude/Longitude is present,
# or when it has a Location value.
has_latlong = data[['Latitude', 'Longitude']].notna().any(axis=1)
has_location = data['Location'].notna()
# Locatable rows only, leaving out any row whose State is 'KS'.
keep_row = (has_latlong | has_location) & data['State'].ne('KS')
location_data = data[keep_row]

# Stamp every yearly school-progress frame with a fixed date, then stack them row-wise.
dated_reports = (
    (school_progress_21_22, '2022-09-18'),
    (school_progress_22_23, '2023-06-06'),
    (school_progress_23_24, '2024-10-21'),
)
for progress_frame, report_date in dated_reports:
    progress_frame['date'] = pd.to_datetime(report_date)
# Author's note: no missing zip codes, no missing latitudes and longitudes.
school_progress = pd.concat([school_progress_21_22, school_progress_22_23, school_progress_23_24])

# Removes, in place, every crime row that has a missing value in any column.
# Author's note: crime only has 3 rows in which the location is missing.
crime.dropna(axis=0, how='any', inplace=True)

In [341]:
# Columns removed from the working frame; 'State' was already used above as a row filter.
unused_columns = [
    'Image',
    'Program URL',
    'Registration URL',
    'Registration Open',
    'Contact Phone',
    'Contact Email',
    'Hidden Programs',
    'Featured',
    'Meeting Type',
    'Has Free Food',
    'Transport Provided',
    'Participants Paid',
    'Contact Name',
    'Program ID',
    'Online Address',
    'State',
    'Registration Deadline',
    'Scholarship Available',
]
# drop() returns a new frame, so the filtered slice it came from is left untouched.
location_data = location_data.drop(columns=unused_columns)

### Impute geographic cluster name

In [None]:
# One point per program row, built from its coordinates (x = longitude, y = latitude).
# points_from_xy is vectorised, so no Python-level Point object is constructed row by row.
geometry = gpd.points_from_xy(location_data['Longitude'], location_data['Latitude'])

# Declare the points as EPSG:4326 at construction time — the same CRS that shp_file is tagged with.
gdf_points = gpd.GeoDataFrame(location_data, geometry=geometry, crs='EPSG:4326')

# Left join keeps every program row; a point that lies within no polygon ends up with
# missing values in the columns contributed by shp_file.
points_with_labels = gpd.sjoin(gdf_points, shp_file, how='left', predicate='within')


Unnamed: 0,Program Name,Description,Org Name,Category Name,Capacity,Min Age,Max Age,Address,City,ZIP Code,...,Start Time,End Time,Program Price,Geographic Cluster Name,Custom Categories,Tag,Latitude,Longitude,Location,geometry
27,Phalanx Family Services,Federally funded workforce development program...,Chicago Cook Workforce Partnership,Work + Career,0.0,16,24,837 W. 119th St,Chicago,60643.0,...,,,Free,WEST PULLMAN,,Program,41.677101,-87.643402,POINT (-87.6434021 41.677101135),POINT (-87.6434 41.6771)
28,Weightlifting at Union,This activity is designed to improve overall m...,Chicago Park District,Sports + Wellness.,10.0,13,18,1501 W. Randolph St.,Chicago,60607.0,...,18:00,19:00,Free,NEAR WEST SIDE,,Program,41.884602,-87.664200,POINT (-87.664199829 41.884601593),POINT (-87.6642 41.8846)
29,Halloween Party at Williams Park,"<p>Halloween Party, candy giveaway, facepainti...",Chicago Park District,Nature.,,4,65,2820 South State Street,Chicago,60616.0,...,11:00,13:00,Free,DOUGLAS,,Event,41.842899,-87.626999,POINT (-87.626998901 41.842899323),POINT (-87.627 41.8429)
30,Origami Workshop with Ty Yamamoto,Join us in the library for this art workshop h...,Chicago Public Library,Music & Art.,0.0,6,13,3400 S. Halsted Street,Chicago,60608.0,...,16:00,17:00,Free,BRIDGEPORT,,Event,41.832638,-87.646461,POINT (-87.646461487 41.832637787),POINT (-87.64646 41.83264)
31,Youth Swim at Gill,Social fun swim. Some may chose to work on the...,Chicago Park District,Sports + Wellness.,0.0,6,18,825 W. Sheridan Rd.,Chicago,60613.0,...,15:00,16:30,Free,LAKE VIEW,,Program,41.952400,-87.650597,POINT (-87.650596619 41.952400208),POINT (-87.6506 41.9524)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227741,Baseball Officiating at West Pullman,"Through this activity, participants will learn...",Chicago Park District,Sports + Wellness.,10.0,18,99,401 W. 123rd St.,Chicago,60628.0,...,18:30,20:15,Free,Far South Equity Zone,,Program,41.669998,-87.632202,POINT (-87.632202148 41.669998169),POINT (-87.6322 41.67)
227742,Computer ABCs,Learn computer basics like creating word docum...,Chicago Public Library,Computers.,,18,99,955 E. 131st Street,Chicago,60827.0,...,14:00,16:00,Free,RIVERDALE,,Event,41.657200,-87.598602,POINT (-87.598602295 41.65719986),POINT (-87.5986 41.6572)
227743,Fun and Games (Special Rec) at Mann,For individuals with a primary intellectual or...,Chicago Park District,Sports + Wellness.,15.0,8,16,2949 E. 131st St.,Chicago,60633.0,...,16:00,18:00,$50 or Less,HEGEWISCH,,Program,41.657291,-87.552673,POINT (-87.55267334 41.657291412),POINT (-87.55267 41.65729)
227744,Dance - Hip Hop at West Pullman,Learn current Hip Hop dances and develop techn...,Chicago Park District,Sports + Wellness.,10.0,6,13,401 W. 123rd St.,Chicago,60628.0,...,16:45,18:00,Free,Far South Equity Zone,,Program,41.669998,-87.632202,POINT (-87.632202148 41.669998169),POINT (-87.6322 41.67)


In [347]:
# Keep only rows with a 'community' value (presumably the rows the spatial join matched to a polygon).
points_with_labels = points_with_labels.loc[points_with_labels['community'].notna()]

In [351]:
# Display the dtype of every column in the joined frame (program columns plus the
# columns brought in from shp_file by the spatial join).
points_with_labels.dtypes

Program Name                 object
Description                  object
Org Name                     object
Category Name                object
Capacity                    float64
Min Age                       int64
Max Age                       int64
Address                      object
City                         object
ZIP Code                    float64
Program Type                 object
Start Date                   object
End Date                     object
Start Time                   object
End Time                     object
Program Price                object
Geographic Cluster Name      object
Custom Categories            object
Tag                          object
Latitude                    float64
Longitude                   float64
Location                     object
geometry                   geometry
index_right                 float64
area                        float64
area_num_1                   object
area_numbe                   object
comarea                     

### Merge datasets and derive additional variables