# A Minimal Example of Loading a Dataset to a Database

In [4]:
# Install or upgrade (-U) the PyMySQL driver, SQLAlchemy and sql_magic; -q keeps pip's output short.
# NOTE(review): no versions are pinned, so every run may pull a different release of each package.
!pip3 install -U -q PyMySQL sqlalchemy sql_magic

[?25l[K     |███████▌                        | 10 kB 19.0 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 10.5 MB/s eta 0:00:01[K     |██████████████████████▍         | 30 kB 7.5 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 3.8 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.4 MB/s 
[?25h

In [2]:
import pandas as pd

In [3]:
from sqlalchemy import create_engine

### Downloading Data and Putting in a Dataframe

In [1]:
# Fetch the CSV export from data.cityofnewyork.us and save it in the working
# directory as restaurants.csv (-o sets the output file name).
!curl 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD' -o restaurants.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  115M    0  115M    0     0  4095k      0 --:--:--  0:00:28 --:--:-- 4606k


In [43]:
# Load the downloaded file into a DataFrame; every later cell works on `df`
df = pd.read_csv(filepath_or_buffer='restaurants.csv')

In [45]:
# Normalise the header names: spaces become underscores and the
# result is upper-cased (e.g. "INSPECTION DATE" -> "INSPECTION_DATE")
cols = pd.Index([label.replace(' ', '_').upper() for label in df.columns])
df.columns = cols

In [46]:
# Datatype bookkeeping: parse INSPECTION_DATE (month/day/year) into datetimes
# and SCORE into numbers, then drop the two date columns that are not used.
df = df.assign(
    INSPECTION_DATE=pd.to_datetime(df["INSPECTION_DATE"], format="%m/%d/%Y"),
    SCORE=pd.to_numeric(df["SCORE"]),
).drop(columns=['GRADE_DATE', 'RECORD_DATE'])

## Create MySQL Connection

In [47]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Connect to MySQL and use the "public" database by default.
#
# Every setting can be overridden through an environment variable
# (MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DB) so that real
# credentials never have to be written into the notebook. The fallbacks are
# the values that were previously hard-coded in this cell, so running the
# notebook without any of these variables set builds the same URL as before.
#
# The user name and password are URL-escaped with quote_plus so characters
# such as '@', ':' or '/' in an overridden value cannot corrupt the URL.
conn_string = 'mysql+pymysql://{user}:{password}@{host}:{port}/{db}?charset=utf8'.format(
    user     = quote_plus(os.environ.get('MYSQL_USER', 'student')),
    password = quote_plus(os.environ.get('MYSQL_PASSWORD', 'dwdstudent2015')),
    host     = os.environ.get('MYSQL_HOST', 'db.ipeirotis.org'),
    port     = int(os.environ.get('MYSQL_PORT', 3306)),
    db       = os.environ.get('MYSQL_DB', 'public'),
)
engine = create_engine(conn_string)

## Create the table for storing the data

Although we can let Pandas create the table automatically, its choice of data types is not always great. It is better to manually define the data types for the database.

In [48]:
# For every textual (object-dtype) column, print the column name and the
# length of its longest value. These maxima are a guide for picking the
# CHAR/VARCHAR widths when the SQL table is defined.
for name, dtype in df.dtypes.items():
  if dtype != 'object':
    continue  # only text columns have a string length to report
  print(name, df[name].str.len().max())


DBA 95.0
BORO 13
BUILDING 10.0
STREET 40.0
PHONE 12.0
CUISINE_DESCRIPTION 30.0
ACTION 130.0
VIOLATION_CODE 4.0
VIOLATION_DESCRIPTION 360.0
CRITICAL_FLAG 14
GRADE 1.0
INSPECTION_TYPE 59.0
NTA 4.0


In [50]:
from sqlalchemy import text

# Remove any previous copy of the table so this cell can be re-run safely.
drop_table_sql = '''
DROP TABLE IF EXISTS inspections;
'''

# Explicit schema for the inspections table; the text column widths follow
# the maximum string lengths reported for the dataframe.
create_table_sql = '''
CREATE TABLE inspections (
    CAMIS CHAR(8),
    DBA VARCHAR(100),
    BUILDING VARCHAR(10),
    STREET VARCHAR(40),
    ZIPCODE CHAR(5),
    BORO VARCHAR(15),
    PHONE CHAR(12),
    CUISINE_DESCRIPTION VARCHAR(30),
    LATITUDE FLOAT,
    LONGITUDE FLOAT,
    COMMUNITY_BOARD CHAR(3),
    COUNCIL_DISTRICT CHAR(2),
    CENSUS_TRACT CHAR(6),
    BIN CHAR(7),
    BBL CHAR(10),
    NTA CHAR(4),
    INSPECTION_DATE DATETIME,
    ACTION VARCHAR(130),
    GRADE CHAR(1),
    INSPECTION_TYPE VARCHAR(60),
    VIOLATION_CODE VARCHAR(10),
    VIOLATION_DESCRIPTION VARCHAR(360),
    CRITICAL_FLAG VARCHAR(15),
    SCORE SMALLINT
)  ENGINE=INNODB DEFAULT CHARSET=UTF8MB4;
'''

# Engine.execute() was removed in SQLAlchemy 2.0, and this notebook installs
# SQLAlchemy without a version pin. Running the statements on an explicit
# connection, wrapped in text(), works on both the 1.x and 2.x series.
# engine.begin() commits on success and releases the connection on exit.
with engine.begin() as connection:
    connection.execute(text(drop_table_sql))
    connection.execute(text(create_table_sql))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f0b93eeb350>

## Insert data to DB using the `to_sql` command

In [37]:
# Append the dataframe's rows to the existing "inspections" table:
#   - if_exists='append' : the empty table was created above, so add to it
#   - index=False        : do not write the dataframe index as a column
#   - chunksize=1000     : write 1000 rows at a time
df.to_sql('inspections', engine, if_exists='append', index=False, chunksize=1000)

In [38]:
# Read the first 100 stored rows back from the database to confirm the load
r = pd.read_sql(sql="SELECT * FROM public.inspections LIMIT 100", con=engine)
r.head(n=100)

Unnamed: 0,CAMIS,DBA,BUILDING,STREET,ZIPCODE,BORO,PHONE,CUISINE_DESCRIPTION,LATITUDE,LONGITUDE,...,BBL,NTA,INSPECTION_DATE,ACTION,GRADE,INSPECTION_TYPE,VIOLATION_CODE,VIOLATION_DESCRIPTION,CRITICAL_FLAG,SCORE
0,50046352,KENNEDY FRIED CHICKEN,1956,JEROME AVENUE,10453,Bronx,7184665118,Chicken,40.8514,-73.9095,...,2028530022,BX41,2020-01-08,Violations were cited in the following area(s).,,Cycle Inspection / Initial Inspection,02B,Hot food item not held at or above 140º F.,Critical,18.0
1,41644418,BAGEL BIN,8610,JAMAICA AVENUE,11421,Queens,7184416669,Bagels/Pretzels,40.6926,-73.8578,...,4089230004,QN53,2020-02-13,Violations were cited in the following area(s).,B,Cycle Inspection / Re-inspection,10B,Plumbing not properly installed or maintained;...,Not Critical,26.0
2,41345052,BROADWAY GOURMET,584,BROADWAY,10012,Manhattan,2129416566,American,40.7247,-73.9973,...,1005110008,MN24,2022-03-18,Violations were cited in the following area(s).,,Cycle Inspection / Initial Inspection,06D,"Food contact surface not properly washed, rins...",Critical,20.0
3,50060875,EL CONDE NUEVO,4139,BROADWAY,10033,Manhattan,2127810419,Spanish,40.8461,-73.9385,...,1021420237,MN36,2021-08-10,Violations were cited in the following area(s).,,Cycle Inspection / Initial Inspection,10B,Plumbing not properly installed or maintained;...,Not Critical,22.0
4,50083895,3D'S DELIGHT BAKERY & RESTAURANT,1377,EAST NEW YORK AVENUE,11212,Brooklyn,7189751121,Caribbean,40.6695,-73.9173,...,3014740045,BK79,2019-08-20,Violations were cited in the following area(s).,B,Cycle Inspection / Re-inspection,04N,Filth flies or food/refuse/sewage-associated (...,Critical,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,50110721,TACOS GUEY,37,WEST 19 STREET,10011,Manhattan,2129918222,Mexican,40.7396,-73.9927,...,1008210019,MN13,2021-08-10,Violations were cited in the following area(s).,A,Pre-permit (Operational) / Initial Inspection,10F,Non-food contact surface improperly constructe...,Not Critical,4.0
96,50059601,CAFE MORNING OF NY,15814,NORTHERN BLVD,11358,Queens,9172852072,Coffee/Tea,40.7634,-73.8071,...,4052770025,QN51,2019-04-22,Violations were cited in the following area(s).,,Cycle Inspection / Initial Inspection,06D,"Food contact surface not properly washed, rins...",Critical,21.0
97,40402397,THE RIVER CLUB,447,EAST 52 STREET,10022,Manhattan,2127510100,American,40.7547,-73.9642,...,1013640022,MN19,2022-03-31,Violations were cited in the following area(s).,,Cycle Inspection / Initial Inspection,06C,Food not protected from potential source of co...,Critical,28.0
98,50036483,THE RIDGEWOOD ALE HOUSE,5738,MYRTLE AVE,11385,Queens,7184564495,American,40.7005,-73.9023,...,4035730018,QN20,2019-07-05,No violations were recorded at the time of thi...,,Administrative Miscellaneous / Initial Inspection,,Current letter grade sign not posted.,Not Critical,
