<a href="https://colab.research.google.com/github/ipeirotis/dealing_with_data/blob/master/01-Pandas/B2-Storing_from_Pandas_to_SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A Minimal Example of Loading a Dataset to a Database

In [None]:
!pip3 install -U -q PyMySQL sqlalchemy

In [None]:
import pandas as pd

In [None]:
from sqlalchemy import create_engine

## Downloading Data and Putting in a Dataframe

In [None]:
!curl 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD' -o restaurants.csv

In [None]:
# Load the downloaded CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='restaurants.csv')

In [None]:
# Normalize the column names: spaces become underscores and letters are upper-cased
cols = df.columns.str.replace(' ', '_', regex=False).str.upper()
df.columns = cols

In [None]:
# Display the normalized column names
cols

In [None]:
# Fix the datatypes: parse the inspection date (month/day/year text)
# into datetimes and convert the score into a numeric column
df["INSPECTION_DATE"] = pd.to_datetime(df["INSPECTION_DATE"], format="%m/%d/%Y")
df["SCORE"] = pd.to_numeric(df["SCORE"])

# Discard the columns that are not stored in the database table
unused_columns = ['GRADE_DATE', 'RECORD_DATE', 'LOCATION_POINT1']
df = df.drop(columns=unused_columns)

## Create MySQL Connection

In [None]:
import os
from sqlalchemy import create_engine, text

# Connect to the MySQL server and use the "public" database.
# The connection charset is utf8mb4 so that it matches the table's
# DEFAULT CHARSET=UTF8MB4; MySQL's plain "utf8" is the 3-byte utf8mb3
# encoding and cannot carry 4-byte characters.
# NOTE(review): the credentials are hard-coded in the notebook; confirm this
# account is meant to be public before reusing this pattern elsewhere.
conn_string = 'mysql+pymysql://{user}:{password}@{host}:{port}/{db}?charset=utf8mb4'.format(
    user     = 'student',
    password = 'dwdstudent2015',
    host     = 'db.ipeirotis.org',
    port     = 3306,
    db       = 'public'
)
engine = create_engine(conn_string)

## Create the table for storing the data

Although we can let Pandas create the table automatically, its choice of data types is not always great. It is better to define the data types for the database manually.

In [None]:
# For every textual (object-dtype) column, print the length of its
# longest value. These numbers help when choosing CHAR/VARCHAR sizes
# for the SQL table definition.
text_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for name in text_columns:
    longest = df[name].str.len().max()
    print(name, longest)


In [None]:
# Several people write to the same database, so each notebook session uses
# its own random table-name suffix to avoid conflicts.
# The suffix is generated only the first time this cell runs; re-running
# the cell keeps the value that already exists.
import uuid
try:
    suffix
except NameError:
    # First 8 hex characters of a random UUID
    suffix = uuid.uuid4().hex[:8]
print(suffix)

In [None]:
# The MySQL database in which the table will be stored
db_name = "public"

# The name of the table: a fixed prefix followed by the session suffix
table_name = "inspections_{}".format(suffix)

In [None]:
# Start from a clean slate: remove the table if a previous run left it behind.
drop_table_sql = f'''
DROP TABLE IF EXISTS {db_name}.{table_name};
'''

# engine.begin() opens a transaction and commits it when the block exits,
# so the statement does not rely on the server auto-committing DDL
# (a plain engine.connect() block in SQLAlchemy 2.x rolls back on close
# unless commit() is called explicitly).
with engine.begin() as con:
    con.execute(text(drop_table_sql))


# Explicit table definition: identifier-like fields are stored as text
# (CHAR/VARCHAR), coordinates as FLOAT, the inspection date as DATETIME
# and the score as SMALLINT.
create_table_sql = f'''
CREATE TABLE IF NOT EXISTS {db_name}.{table_name} (
    CAMIS CHAR(8),
    DBA VARCHAR(100),
    BUILDING VARCHAR(10),
    STREET VARCHAR(40),
    ZIPCODE CHAR(5),
    BORO VARCHAR(15),
    PHONE CHAR(12),
    CUISINE_DESCRIPTION VARCHAR(30),
    LATITUDE FLOAT,
    LONGITUDE FLOAT,
    COMMUNITY_BOARD CHAR(3),
    COUNCIL_DISTRICT CHAR(2),
    CENSUS_TRACT CHAR(6),
    BIN CHAR(7),
    BBL CHAR(10),
    NTA CHAR(4),
    INSPECTION_DATE DATETIME,
    ACTION VARCHAR(130),
    GRADE CHAR(1),
    INSPECTION_TYPE VARCHAR(60),
    VIOLATION_CODE VARCHAR(10),
    VIOLATION_DESCRIPTION VARCHAR(1000),
    CRITICAL_FLAG VARCHAR(15),
    SCORE SMALLINT
)  ENGINE=INNODB DEFAULT CHARSET=UTF8MB4;
'''

# engine.begin() opens a transaction and commits it when the block exits,
# so the statement does not rely on the server auto-committing DDL
# (a plain engine.connect() block in SQLAlchemy 2.x rolls back on close
# unless commit() is called explicitly).
with engine.begin() as con:
    con.execute(text(create_table_sql))



## Insert data to DB using the `to_sql` command

In [None]:
# Store the dataframe rows in the SQL table, using the to_sql command.
# schema=db_name targets the same database that the DROP/CREATE statements
# qualify explicitly, instead of relying on the connection's default database.
df.to_sql(
    name=table_name,     # name of the table
    schema=db_name,      # database that holds the table
    con=engine,          # use the MySQL engine created earlier
    if_exists='append',  # the empty table was created beforehand
    index=False,         # do not write the index column in the database
    chunksize=1000       # write 1000 rows at a time
)

In [None]:
# Read a sample of rows back from the database to check the load.
# The table is qualified with db_name (rather than a hard-coded "public")
# so the query stays in sync with the statements that created the table.
with engine.connect() as connection:
  r = pd.read_sql(text(f"SELECT * FROM {db_name}.{table_name} LIMIT 100"), con=connection)
r.head(100)