### Using Pandas DataFrames to store SQL results

In [None]:
import pandas as pd

#### Using sql magic extension



The code below demonstrates how to convert the results of a SQL query into a pandas DataFrame.

In [None]:
# (Re)load the `sql` extension, which provides the %sql magic used below
%reload_ext sql
# Connect to the `imdb` MySQL database on localhost:3306 as user root (utf8 connection charset).
# NOTE(review): the password is written in clear text in the URL; only acceptable for a local teaching database.
%sql mysql://root:dwdstudent2015@localhost:3306/imdb?charset=utf8

In [None]:
# Run the query through the %sql line magic and keep the returned result set in `actors`
actors = %sql SELECT * FROM actors

In [None]:
# Show the result set's `keys`; the next cell passes it as `columns=` when building the DataFrame
actors.keys

In [None]:
# Wrap the query result in a DataFrame, using the result's own keys as column labels
column_names = actors.keys
df_actors = pd.DataFrame(data=actors, columns=column_names)
df_actors.head(n=10)

In [None]:
# Summary statistics for the first-name column
df_actors.loc[:, "first_name"].describe()

In [None]:
# Ten most frequent first names among rows whose gender is 'F'
is_female = df_actors["gender"] == 'F'
df_actors.loc[is_female, "first_name"].value_counts().head(10)

In [None]:
# Ten most frequent first names among rows whose gender is 'M'
is_male = df_actors["gender"] == 'M'
df_actors.loc[is_male, "first_name"].value_counts().head(10)

In [None]:
# Ten most frequent last names across all rows
last_name_counts = df_actors["last_name"].value_counts()
last_name_counts.head(10)

#### Importing into DataFrames using SQLAlchemy (MySQLdb driver) and cursors

In [None]:
from sqlalchemy import create_engine, text
import pandas as pd

# Connection settings for the local MySQL server hosting the `imdb` database
host = 'localhost'
username = 'root'
port = 3306
password = 'dwdstudent2015'
database = 'imdb'

# SQLAlchemy URL: MySQL dialect with the MySQLdb driver
conn_template = 'mysql+mysqldb://{u}:{pw}@{h}:{p}/{d}'
conn_string = conn_template.format(u=username, pw=password, h=host, p=port, d=database)
engine = create_engine(conn_string)

# Engine.execute() was removed in SQLAlchemy 2.0, so the query is run on an
# explicit connection. The `with` block also releases the connection as soon
# as the rows have been fetched instead of leaving it checked out.
with engine.connect() as conn:
    cur = conn.execute(text("SELECT * FROM actors LIMIT 1000"))
    # Pair each row with the column names ourselves: dict(row) is not
    # supported by the Row objects of SQLAlchemy 2.0.
    column_names = list(cur.keys())
    actors = [dict(zip(column_names, row)) for row in cur.fetchall()]

# One dict per row -> one DataFrame row, with the dict keys as columns
df_actors = pd.DataFrame(actors)
df_actors

### Further Examples with SQL and Pandas

In [None]:
# (Re)load the `sql` extension, which provides the %sql magic used below
%reload_ext sql
# Connect to the `facebook` MySQL database on localhost:3306 as user root (utf8 connection charset).
# NOTE(review): the password is written in clear text in the URL; only acceptable for a local teaching database.
%sql mysql://root:dwdstudent2015@localhost:3306/facebook?charset=utf8

In [None]:
# A little bit of setup code, just to make the plots look better
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Make the graphs a bit prettier, and bigger
plt.rcParams['figure.figsize'] = (15, 5)

Now let's run a query to get the political views of Facebook users, broken down by gender.

In [None]:
polviews_by_gender = %%sql \
SELECT Sex, `Political Views` AS PoliticalViews, COUNT(*) AS cnt \
FROM Profiles \
WHERE Sex IS NOT NULL AND `Political Views` IS NOT NULL \
GROUP BY Sex, `Political Views`  \
ORDER BY  `Political Views`, Sex

And let's get the dataframe:

In [None]:
# The sql magic extension can build the frame for us:
#     df = polviews_by_gender.DataFrame()
# (see https://github.com/catherinedevlin/ipython-sql)
#
# Here we construct it explicitly instead, labelling the columns with the
# result set's keys.
column_names = polviews_by_gender.keys
df = pd.DataFrame(data=polviews_by_gender, columns=column_names)
df

In [None]:
# First attempt: bar-plot the long-format frame as is (the result is not very readable)
df.plot.bar()

In [None]:
# Reshape from long to wide: one row per political view, one column per sex,
# with the counts as cell values
dfp = df.pivot(
    values='cnt',
    index='PoliticalViews',
    columns='Sex',
)
dfp

In [None]:
# Grouped bars: one group per political view, one bar per sex
dfp.plot.bar()

In [None]:
# Convert raw counts into within-gender shares so the two columns are
# comparable even though the groups differ in size.
# Series.sum() skips NaN (a political view with no profiles of one sex after
# the pivot), whereas the builtin sum() would propagate it and turn the whole
# column into NaN.
dfp["Female"] = dfp["Female"] / dfp["Female"].sum()
dfp["Male"] = dfp["Male"] / dfp["Male"].sum()
dfp

In [None]:
# Same chart as before, now showing the normalized values
dfp.plot.bar()

In [None]:
# Arrange the rows in a logical order (liberal -> conservative, then the
# remaining views) rather than the order the query returned them in.
neworder = [
    'Very Liberal',
    'Liberal',
    'Moderate',
    'Conservative',
    'Very Conservative',
    'Libertarian',
    'Apathetic',
    'Other',
]
# list.index yields each label's position in the desired order
newindex = sorted(dfp.index, key=neworder.index)
dfp = dfp.reindex(newindex)
dfp

In [None]:
# Final chart, with the political views in their logical order
dfp.plot.bar()

### Facebook, Favorite Books, and Political views

In [None]:
books = %%sql \
SELECT F.FavoriteBook, P.`Political Views` AS PoliticalViews, COUNT(*) AS cnt \
FROM Profiles P JOIN FavoriteBooks F ON F.ProfileID = P.ProfileId  \
WHERE `Political Views` IS NOT NULL AND F.FavoriteBook IS NOT NULL \
GROUP BY F.FavoriteBook, P.`Political Views`

In [None]:
# Wrap the query result in a DataFrame, using the result's own keys as column labels
column_names = books.keys
df_books = pd.DataFrame(data=books, columns=column_names)
df_books.head(n=10)

In [None]:
# Perhaps we can limit ourselves only to books with enough fans
# df_books = df_books[ df_books['cnt'] > 5]

In [None]:
# Reshape from long to wide: one row per book, one column per political view,
# with the counts as cell values
dfp = df_books.pivot(
    values='cnt',
    index='FavoriteBook',
    columns='PoliticalViews',
)
dfp.head(n=10)

In [None]:
# Divide every political-view column by its total, turning counts into shares
import numpy as np

for political_view in dfp.columns:
    # nansum ignores the NaN cells the pivot leaves for missing book/view pairs
    column_total = np.nansum(dfp[political_view])
    dfp[political_view] = dfp[political_view] / column_total

dfp

In [None]:
# Combined share of each book on the liberal and on the conservative side
liberal_share = dfp["Liberal"] + dfp["Very Liberal"]
conservative_share = dfp["Conservative"] + dfp["Very Conservative"]

# Ratio of the two sides, in both directions
dfp["Liberal_To_Conservative"] = liberal_share / conservative_share
dfp["Conservative_To_Liberal"] = conservative_share / liberal_share

In [None]:
# The 50 books with the highest liberal-to-conservative ratio
liberal_ranking = dfp[["Liberal_To_Conservative"]].sort_values(by="Liberal_To_Conservative", ascending=False)
liberal_ranking.head(50)

In [None]:
# The 50 books with the highest conservative-to-liberal ratio
conservative_ranking = dfp[["Conservative_To_Liberal"]].sort_values(by="Conservative_To_Liberal", ascending=False)
conservative_ranking.head(50)

In [None]:
# Bar chart of the 10 books with the highest conservative-to-liberal ratio
top_conservative = dfp[["Conservative_To_Liberal"]].sort_values(by="Conservative_To_Liberal", ascending=False).head(10)
top_conservative.plot.bar()

### Inserting Data in a Database using Pandas

In [2]:
import pandas as pd

In [3]:
# Commands for (re)creating the compressed local copy of the dataset:
# !curl 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD' -o data/restaurant.csv
# !rm data/restaurant.csv.gz
# !gzip data/restaurant.csv

# Load the gzipped CSV with every column read as text; the columns are
# converted to proper types in a later cell.
restaurants = pd.read_csv(
    'data/restaurant.csv.gz',
    encoding="utf-8",
    dtype="unicode",
)

In [4]:


# Reading CSV file directly from URL
# url = 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD'
# restaurants = pd.read_csv(url, encoding="utf-8", dtype="unicode")


In [5]:
# Date columns: parse the MM/DD/YYYY text into datetimes
for date_column in ("GRADE DATE", "RECORD DATE", "INSPECTION DATE"):
    restaurants[date_column] = pd.to_datetime(restaurants[date_column], format="%m/%d/%Y")

# Inspection score: text -> number
restaurants["SCORE"] = pd.to_numeric(restaurants["SCORE"])

# Columns converted to unordered categoricals
for category_column in ("BORO", "VIOLATION CODE", "CRITICAL FLAG", "ACTION", "CUISINE DESCRIPTION"):
    restaurants[category_column] = pd.Categorical(restaurants[category_column], ordered=False)

# Grades form an ordered scale restricted to A, B and C
restaurants["GRADE"] = pd.Categorical(restaurants["GRADE"], categories=['A', 'B', 'C'], ordered=True)

In [None]:
from sqlalchemy import create_engine

# Connection settings for the local MySQL server and the target database
host, port = 'localhost', 3306
username, password = 'root', 'dwdstudent2015'
database = 'nyc_restaurant_inspections'

# Fill the SQLAlchemy URL template (MySQL dialect, MySQLdb driver)
conn_template = 'mysql+mysqldb://{u}:{pw}@{h}:{p}/{d}'
conn_params = {'u': username, 'pw': password, 'h': host, 'p': port, 'd': database}
conn_string = conn_template.format(**conn_params)

# echo=False: do not log the SQL statements the engine emits
engine = create_engine(conn_string, echo=False)

In [None]:
# Write the frame to the `inspections` table in chunks of 1000 rows,
# replacing the table if it already exists and leaving out the DataFrame index.
restaurants.to_sql(
    name='inspections',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=1000,
)

In [None]:
cur = engine.execute("SELECT * FROM inspections LIMIT 100")

In [None]:
rows = cur.fetchall()
for row in rows:
    print(row)