# Notebook Purpose

This notebook serves as documented walkthrough for the creation of the `illinois-gambling` postgresql database.

### Data Sources

This database contains data from the Illinois Gaming Board's monthly reports for both [Video Gambling](https://www.igb.illinois.gov/VideoReports.aspx) and [Casino Gambling](https://www.igb.illinois.gov/CasinoReports.aspx) as well as demographic data for each Illinois municipality from the 5 year American Community Survey.

<center><img src="../static/schema.png" width="1000"></center>

In [2]:
import sys
sys.path.append('..')
import pandas as pd
import os
import numpy as np
from geopy import geocoders  
import matplotlib.pyplot as plt
from io import BytesIO
from geopy.geocoders import Nominatim
import psycopg2
from requests import get
import zipfile
import censusdata
import json
from geopy.distance import geodesic
from sqlalchemy import create_engine
from tqdm.notebook import tqdm
geolocator = Nominatim(user_agent='Illinois Gambling')

# Create Database

Connect to postgresql

In [4]:
conn = psycopg2.connect(dbname="postgres")
cursor = conn.cursor()

Set configurations 

In [5]:
conn.autocommit = True 
conn.set_isolation_level(0)

If creating the database from scratch, uncomment the code below to drop the database if it exists and create an empty `illinois-gaming` database.

In [4]:
# cursor.execute('DROP DATABASE IF EXISTS illinois_gambling;')
# cursor.execute('CREATE DATABASE illinois_gambling;')
# conn.close()

Now that the database is created, open a connection to the `illinois-gambling`  database

In [6]:
conn = psycopg2.connect(dbname="illinois_gambling")
user = conn.get_dsn_parameters()['user']
port = conn.get_dsn_parameters()['port'] 
# Create sqlalchemy engine to streamline the sql process
engine = create_engine(f'postgresql+psycopg2://{user}:@localhost:{port}/illinois_gambling')

# Casino Table

In [6]:
with tqdm(total = 3, desc = ''.rjust(35, '–'), ncols=800) as pbar:
    pbar.desc = ' Reading scraped data...'.rjust(35, '–')
    # Create import path for the casino gambling data
    casino_path = os.path.join(os.pardir,'data', 'casino_gambling', 'casino_data.csv')
    # Read in csv file
    casino_table = pd.read_csv(casino_path)
    # Change date column from string to datetime
    casino_table['date'] = pd.to_datetime(casino_table.date)
    pbar.update(1)
    
# =======================================================================
    pbar.desc = ' Cleaning casino data...'.rjust(35, '–')
    # Data Cleaning: 
    # The naming of casinos is somewhat inconsistent.
    # Below, we map names that deviate from the majority
    # so they are consistent across observations
    CASINO_REFORMAT = {'Alton': 'ALTON - ARGOSY',
                       'Argosy Casino Alton': 'ALTON - ARGOSY',
                    'East Peoria': 'E. PEORIA - PAR-A-DICE', 
                       'Par-A-Dice Hotel Casino': 'E. PEORIA - PAR-A-DICE',
                    'Rock Island': 'ROCK ISLAND - JUMERS', 
                       "Jumer's Casino & Hotel": 'ROCK ISLAND - JUMERS',
                     'Joliet - Empress': 'JOLIET - Argosy Empress Casino', 
                     'Metropolis': 'METROPOLIS - HARRAHS',
                    "Joliet - Harrah's": 'JOLIET - HARRAHS',
                    "Harrah's Joliet Casino & Hotel": 'JOLIET - HARRAHS',
                    'Aurora': 'AURORA - HOLLYWOOD', 
                    'Hollywood Casino Aurora': 'AURORA - HOLLYWOOD',
                    'E  St  Louis': 'E. ST. LOUIS - CASINO QUEEN',
                    'E St  Louis': 'E. ST. LOUIS - CASINO QUEEN',
                    'DraftKings at Casino Queen': 'E. ST. LOUIS - CASINO QUEEN',
                    'Elgin': 'ELGIN - GRAND VICTORIA',
                    'Grand Victoria Casino': 'ELGIN - GRAND VICTORIA',
                    'Joliet - Hollywood': 'JOLIET - HOLLYWOOD',
                       'Hollywood Casino Joliet': 'JOLIET - HOLLYWOOD',
                    'Des Plaines': 'DES PLAINES - RIVERS CASINO', 
                       'Rivers Casino': 'DES PLAINES - RIVERS CASINO',
                    'METROPOLIS - HARRAHS*': 'METROPOLIS - HARRAHS',
                      "Harrah's Metropolis Casino": 'METROPOLIS - HARRAHS'}

    def casino_map(casino):
        if casino in CASINO_REFORMAT:
            return CASINO_REFORMAT[casino]
        else:
            return casino

    casino_table['casino'] = casino_table.casino.apply(casino_map)

    # Data Cleaning:
    # To make the columns more informative, we seperate the 
    # name of the casino from the name of the municipality.
    casino_table['municipality'] = casino_table.casino.apply(lambda x: x.split(' - ')[0])
    
#     def find_error(text):
#         try:
#             return text.split(' - ')[1]
#         except:
#             print(text)
#             ValueError('Oops')
            
    casino_table.casino.apply(find_error)
    # casino_table['casino'] = casino_table.casino.apply(lambda x: x.split(' - ')[1])
    casino_table['municipality'] = casino_table.municipality.apply(lambda x: x.title().strip().replace('E.', 'East'))
# =======================================================================
    pbar.desc = ' Searching for geo data'.rjust(35, '–')
    # The location of the casino is useful information
    # for measuring the relationship between video gambling
    # and casino performance. The addresses were searched manually 
    # Using a google search. While it would be valuable to have a 
    # dynamic way of fetching this information, it is unlikely that
    # these data points will change. 
    casino_addresses = {'Alton': '1 Piasa St, Alton, IL 62002',
    'East Peoria':'21 Blackjack Blvd, East Peoria, IL 61611',
    'Rock Island':'777 Jumer Dr, Rock Island, IL 61201',
    'Joliet': '777 Hollywood Blvd, Joliet, IL 60436',
    'Aurora':'100 E Front St, Metropolis, IL 62960',
    'East St. Louis': '200 S Front St, East St Louis, IL 62201',
    'Elgin': '250 S Grove Ave, Elgin, IL 60120',
    'Des Plaines':'3000 S River Rd, Des Plaines, IL 60018',
    'Metropolis': '100 E Front St, Metropolis, IL 62960'}

    casino_table['address'] = casino_table.municipality.map(casino_addresses)

    # The two columns below are aggregates of other columns. Because these numbers are 
    # easily calculable from the existing data, they are dropped. 
    # casino_table.drop(['agr_per_square_foot', 'agr_per_admission'], axis = 1, inplace=True)

    def coordinates_from_address(address):

        location = geolocator.geocode(address)
        if location:
            return location.latitude, location.longitude

    coordinates_from_address(casino_table.address[0])

    casino_coordinates = {}
    for address in casino_table.address.unique():
        try:
            latitude, longitude = coordinates_from_address(address)
            casino_coordinates[address] = (latitude, longitude)
        except:
            continue
    pbar.desc = ' Matching coordinates...'.rjust(35, '–')
    casino_coordinates['777 Hollywood Blvd, Joliet, IL 60436'] = (41.479289, -88.145328)
    casino_table['latitude'] = casino_table.address.apply(lambda x: casino_coordinates[x][0])
    casino_table['longitude'] = casino_table.address.apply(lambda x: casino_coordinates[x][1])


    casino_table.rename({'casino': 'name', 'agr': 'adjusted_gross_receipt',
                        'casino_square_feet': 'square_feet'}, axis = 1, inplace=True)
    pbar.update(1)
# =======================================================================
    pbar.desc = ' Inserting data into casino table...'.rjust(35, '–')
    casino_table = casino_table[['address', 'name', 'date', 'municipality',
                  'adjusted_gross_receipt', 'square_feet', 'admissions',
                   'state_share', 'local_share','latitude', 'longitude']]

    casino_table.to_sql('casino', engine, if_exists='append', index=True)
    pbar.desc = ' Complete'.rjust(35, '–')
    pbar.update(1)
# =======================================================================

HBox(children=(FloatProgress(value=0.0, description='–––––––––––––––––––––––––––––––––––', layout=Layout(flex=…




# Video Gambling Table

In [7]:
with tqdm(total=3, desc = ''.rjust(31, '–'), ncols=800) as pbar:
    pbar.desc = ' Reading scraped data...'.rjust(31, '–')
    data_path = os.path.join(os.pardir,'data', 'video_gambling')
    file_path = [file for file in os.listdir(data_path) if file.endswith('.csv')][0]
    video_gambling_path = os.path.join(data_path, file_path )
    video_gambling_table = pd.read_csv(video_gambling_path)
    pbar.update(1)
    pbar.desc = ' Cleaning video gambling data...'.rjust(31)
    video_gambling_table['date'] = pd.to_datetime(video_gambling_table.date, format='%B-%Y')
    video_gambling_table = video_gambling_table.dropna()

    video_gambling_table = video_gambling_table[['date','Establishment', 'License Number', 'Municipality', 'VGT Count',
           'Amount Played', 'Amount Won', 'Net Wager', 'Funds In', 'Funds Out',
           'Net Terminal Income', 'NTI Tax', 'State Share', 'Municipality Share'
           ]]

    column_edit = lambda x: x.lower().strip().replace(' ', '_').replace('-', '_')
    video_gambling_table.columns = [column_edit(column) for column in video_gambling_table.columns]
    video_gambling_table.rename({'vgt_count': 'terminal_count'}, axis = 1, inplace=True)
    pbar.update(1)
    pbar.desc = ' Inserting table into database...'.rjust(31, '–')
    video_gambling_table.to_sql('video_gambling', engine, if_exists='append', index=True)
    pbar.desc = ' Complete!'.rjust(31, '–')
    pbar.update(1)

HBox(children=(FloatProgress(value=0.0, description='–––––––––––––––––––––––––––––––', layout=Layout(flex='2')…




# Municipality Table

In [11]:
with tqdm(total=6, desc = ''.rjust(31), ncols=800) as pbar:
    # ============================================================================
    # Importing dataset for tracking whether or not a municipality has
    # prohibited video gambling
    prohibition = pd.read_csv('../data/MunicipalityList.csv')
    prohibition.dropna(subset=['Name'],inplace=True)
    # ============================================================================
    # Request census geo id dataset
    pbar.desc = ' Requesting Census Geo Ids'.rjust(39, '–')
    response = get('https://www2.census.gov/programs-surveys/acs/summary_file/2018/data/5_year_by_state/Illinois_All_Geographies_Not_Tracts_Block_Groups.zip')
    pbar.desc = ' Reading response Bytes'.rjust(39, '–')
    translated = BytesIO(response.content)
    pbar.desc = ' Unzipping census data'.rjust(39)
    zip_file = zipfile.ZipFile(translated)
    csv_raw = zip_file.open('g20185il.csv')
    pbar.desc = ' Reading id file'.rjust(39, '–')
    municipality_ids = pd.read_csv(csv_raw, header=None)
    municipality_ids = municipality_ids[municipality_ids.iloc[:,2] == 160]
    pbar.update(1)
    # ============================================================================
    # Collect ids for municipalities
    pbar.desc = ' Filtering for municipalities'.rjust(39, '–')
    census_ids = {}
    counties = video_gambling_table[video_gambling_table.municipality.str.contains('county', case=False)].index
    municipalities = video_gambling_table.loc[[x for x in video_gambling_table.index if x not in counties]]
    towns_unique = list(municipalities.municipality.unique())
    towns_unique += [town for town in casino_table.municipality.unique() if town not in towns_unique]
    towns_unique += [town for town in prohibition.Name.unique() if town not in towns_unique]
    # ============================================================================
    # Collecting Census Data
    pbar.desc = ' Matching Census ID to Municipality Name'.rjust(39, '–')
    for town in towns_unique:
        try:
            search = municipality_ids[municipality_ids.iloc[:,49].str.contains(town, case=False)]
            search  = search[~search.iloc[:,49].str.contains('CDP')]
            id_ = search.iloc[:,48].values[0]
            census_ids[town] = id_
        except:
            census_ids[town] = None
    pbar.desc = ' Addressing alternate string formats'.rjust(39, '–')
    error_cities = {'La Salle': 'lasalle',
     'Lamoille': 'la moille',
     'Windsor (Mercer)': 'windsor village' ,
     'Windsor (Shelby)': 'windsor city',
     'Leroy': 'le roy',
     'Depue': 'de pue',
     'Wilmington (Will)': 'wilmington city',
     'Sainte Marie': 'ste. marie',
     'Gulfport': 'Gulf port',
     'Saint Elmo': 'St. Elmo',
     'Whiteash': 'Whiteash',
     'Wilmington (Greene)': 'wilmington village',
     'Garden Prairie': 'Garden Prairie'}

    for city in error_cities:
        id_ = municipality_ids[municipality_ids.iloc[:,49].str.contains(error_cities[city], case=False)].iloc[:,48].values[0]
        census_ids[city] = id_
    pbar.desc = ' Finalizing municipal IDs'.rjust(39, '–')
    municipal_table = pd.DataFrame()
    municipal_table['name'] = towns_unique
    municipal_table['id'] = municipal_table.name.map(census_ids)
    municipal_table.dropna(subset=['id'], inplace=True)
    pbar.update(1)
    # ============================================================================
    pbar.desc = ' Collecting coordinates'.rjust(39, '–')
    def find_geo_data(municipality):

        location = geolocator.geocode(f'{municipality}, Illinois')
        if location:
            return location.raw

    town_geo = {}

    for town in towns_unique:
        town_geo[town] = find_geo_data(town)

    def collect_coordinates(town):
        return town_geo[town]['lat'], town_geo[town]['lon'], town_geo[town]['boundingbox']
    
    pbar.desc = ' Finalizing coordinates'.rjust(39, '–')
    municipal_table['latitude'], municipal_table['longitude'], municipal_table['boundingbox'] = zip(*municipal_table.name.apply(collect_coordinates))
    pbar.update(1)
    # ============================================================================
    pbar.desc = ' Collecting county data'.rjust(39, '–')
    def county_info(row):
        county_json = get(f'https://geo.fcc.gov/api/census/area?lat={row.latitude}&lon={row.longitude}&format=json').json()['results'][0]
        county_fips = county_json['county_fips']
        county_name = county_json['county_name']
        return county_fips, county_name

    municipal_table['county_fips'] = ''
    municipal_table['county_name'] = ''
    for idx, row in municipal_table.iterrows():
        county_fips, county_name = county_info(row)
        municipal_table.at[idx, 'county_fips'] = county_fips
        municipal_table.at[idx, 'county_name'] = county_name
    pbar.update(1)
    # ============================================================================
    pbar.desc = ' Identifying nearest casino'.rjust(39, '–')
    closest_casinos = []
    distance_from_casino = []
    for idx, row in municipal_table.iterrows():
        town_coor = (row.latitude, row.longitude)
        distances = {}
        for casino in casino_coordinates:
            casino_coor = casino_coordinates[casino]
            miles = geodesic(town_coor, casino_coor).miles
            distances[casino] = miles
            # ADD DISTANC INFO
        sort = sorted(distances.items(), key=lambda x: x[1])[0][0]
        closest_casinos.append(sort)

    municipal_table['nearest_casino'] = closest_casinos
    pbar.update(1)
    # ============================================================================
    pbar.desc = 'Inserting table into database'.rjust(39, '–')
    municipal_table = municipal_table[['name', 'id', 'nearest_casino','latitude', 'longitude', 'boundingbox', 'county_fips', 'county_name']]
    municipal_table.rename({'id': 'geo_id'}, axis=1, inplace=True)
    municipal_table.to_sql('municipality', engine, if_exists='replace', index=False)
    pbar.desc = ' Complete!'.rjust(39)
    pbar.update(1)

HBox(children=(FloatProgress(value=0.0, description='                               ', layout=Layout(flex='2')…

  interactivity=interactivity, compiler=compiler, result=result)
  return func(self, *args, **kwargs)





In [7]:
munic = pd.read_sql('''select * from municipality;''', engine)
census_ids = munic[['name', 'geo_id']].set_index('name').to_dict()['geo_id']



(1-(U2R/Total Migration / total population/rural population) + (R2U/Total Migration/total population/urban population)) * 2

In [127]:
def weight_counties(lp_migration, hp_migration, lp_population, hp_population):
    total_migration = lp_migration + hp_migration
    total_population = lp_population + hp_population
    h2l = (lp_migration/total_migration/(total_population/lp_population))
    l2h = (hp_migration/total_migration/(total_population/hp_population))
    return (1- (h2l + l2h))

In [175]:
distances = np.array([4 ,7.8 ,9.3 ,18 ,14.6 ,4.7 ,11.2 ,24.1 ,10.5 ,25.9 ,11.4 ,10.7 ,15.6,16.7])

In [189]:
distances.var(ddof=-1)

36.342904761904755

In [193]:
1 - distances.max()/distances.var(ddof=-1)

0.28734370106957396

In [150]:
len(munic[munic.county_name == 'Rock Island'])

15

In [170]:
weight_counties(500, 2000, 41000, 141000)

0.3351648351648352

In [13]:
pd.read_sql('''select * from casino''', engine)

Unnamed: 0,index,address,name,date,municipality,adjusted_gross_receipt,square_feet,admissions,state_share,local_share,latitude,longitude
0,0,"1 Piasa St, Alton, IL 62002",ALTON - ARGOSY,2017-05-01,Alton,3480782.0,23000.0,35248.0,418574.0,209287.0,38.889240,-90.186737
1,1,"21 Blackjack Blvd, East Peoria, IL 61611",E. PEORIA - PAR-A-DICE,2017-05-01,East Peoria,6714723.0,26116.0,67449.0,1309974.0,403185.0,40.678239,-89.561869
2,2,"777 Jumer Dr, Rock Island, IL 61201",ROCK ISLAND - JUMERS,2017-05-01,Rock Island,6021952.0,42300.0,80300.0,1029555.0,381398.0,41.459335,-90.616536
3,3,"777 Hollywood Blvd, Joliet, IL 60436",JOLIET - HOLLYWOOD,2017-05-01,Joliet,10938798.0,50000.0,93877.0,2199199.0,640817.0,41.479289,-88.145328
4,4,"100 E Front St, Metropolis, IL 62960",METROPOLIS - HARRAHS,2017-05-01,Metropolis,6561714.0,24269.0,44960.0,1238220.0,373046.0,37.143971,-88.726324
...,...,...,...,...,...,...,...,...,...,...,...,...
2347,2347,"777 Hollywood Blvd, Joliet, IL 60436",JOLIET - HARRAHS,2015-02-01,Joliet,13888492.0,39000.0,113987.0,1930760.0,808412.0,41.479289,-88.145328
2348,2348,"100 E Front St, Metropolis, IL 62960",AURORA - HOLLYWOOD,2015-02-01,Aurora,9179304.0,41384.0,78830.0,1075590.0,537795.0,37.143971,-88.726324
2349,2349,"200 S Front St, East St Louis, IL 62201",E. ST. LOUIS - CASINO QUEEN,2015-02-01,East St. Louis,8590246.0,40000.0,96737.0,1052499.0,526249.0,38.627501,-90.175079
2350,2350,"250 S Grove Ave, Elgin, IL 60120",ELGIN - GRAND VICTORIA,2015-02-01,Elgin,13564216.0,29850.0,104157.0,1707143.0,782368.0,42.031316,-88.279651


In [11]:
munic[munic.county_fips=='17073']

Unnamed: 0,name,geo_id,nearest_casino,latitude,longitude,boundingbox,county_fips,county_name
230,Kewanee,16000US1739727,"777 Jumer Dr, Rock Island, IL 61201",41.2455927,-89.9248303,"(41.214814,41.261416,-89.964774,-89.887008)",17073,Henry
231,Colona,16000US1715664,"777 Jumer Dr, Rock Island, IL 61201",41.4754634,-90.3603496,"(41.440966,41.491032,-90.39295,-90.319456)",17073,Henry
245,Annawan,16000US1701569,"777 Jumer Dr, Rock Island, IL 61201",41.3972573,-89.9045518,"(41.385084,41.410423,-89.914735,-89.848101)",17073,Henry
280,Orion,16000US1756601,"777 Jumer Dr, Rock Island, IL 61201",41.3532038,-90.3808307,"(41.340996,41.362015,-90.391205,-90.358339)",17073,Henry
376,Henry,16000US1734163,"777 Jumer Dr, Rock Island, IL 61201",41.3418549,-90.1177442,"(41.1494595,41.58625,-90.4396477,-89.855469)",17073,Henry
491,Galva,16000US1728430,"777 Jumer Dr, Rock Island, IL 61201",41.1675369,-90.0426137,"(41.156806,41.18611,-90.061104,-90.013182)",17073,Henry
563,Cambridge,16000US1710643,"777 Jumer Dr, Rock Island, IL 61201",41.3036472,-90.1928971,"(41.283113,41.31057,-90.208286,-90.17507)",17073,Henry
643,Atkinson,16000US1702726,"777 Jumer Dr, Rock Island, IL 61201",41.4208678,-90.0151121,"(41.405274,41.435366,-90.025076,-89.974396)",17073,Henry
669,Bishop Hill,16000US1706171,"777 Jumer Dr, Rock Island, IL 61201",41.2017036,-90.1190054,"(41.194189,41.205133,-90.124744,-90.110208)",17073,Henry
703,Woodhull,16000US1783063,"777 Jumer Dr, Rock Island, IL 61201",41.1789256,-90.3159582,"(41.172976,41.184748,-90.341817,-90.30587)",17073,Henry


# Census Data

In [21]:
variables_path = os.path.join(os.pardir,'data', 'census_variables.json')
with open(variables_path, 'r') as file:
    variables = json.load(file)

### Add variable lookup table

In [22]:
lookup = pd.DataFrame(variables, index=[0]).T.reset_index()
lookup.columns = ['variable', 'description']
lookup['variable'] = lookup.variable.apply(lambda x: x.upper())

lookup.to_sql('demographics_lookup', engine, if_exists='replace')



### Create Demographics Table

In [23]:
variable_names = list(variables)
variable_names.reverse()
count = 0
errors = {}
data = pd.DataFrame()
for year in range(2012, 2019):
    with tqdm(total=len(census_ids)-1, desc=str(year)) as pbar:
        for idx in range(len(census_ids)):
            town = list(census_ids)[idx]
            try:
                id_ = census_ids[town][-5:]
                if id_ == '67158':
                    continue
                geo = censusdata.censusgeo([('state', '17'), ('place', id_)])
                frame = censusdata.download('acs5', year, geo, variable_names, key='ef2b118b032f366e377ba482a7e9cdbc8cbfd617')
                frame['geo_id'] = census_ids[town]
                frame['municipality'] = town
                frame['year'] = year
                frame = frame[['geo_id','municipality', 'year'] + variable_names]
                frame.index = [count]
                data = data.append(frame)
                count +=1 
                pbar.update(1)
            except:
                pbar.write(f'Error thrown for year {year}')
                pbar.update(1)
                continue

HBox(children=(FloatProgress(value=0.0, description='2012', max=1298.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='2013', max=1298.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='2014', max=1298.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='2015', max=1298.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='2016', max=1298.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='2017', max=1298.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='2018', max=1298.0, style=ProgressStyle(description_width=…




In [26]:
data.to_sql('demographics', engine, if_exists='replace')

print('Database created successfully!')

Database created successfully!


In [30]:
cursor = conn.cursor()
cursor.execute("UPDATE municipality SET latitude = 40.8131 WHERE name = 'Gulfport';")
cursor.execute("UPDATE municipality SET longitude = -91.0865 WHERE name = 'Gulfport';")

In [31]:
conn.commit()