### Using Pandas DataFrames to store SQL results

#### Using sql magic extension



The code below demonstrates how to convert the results of an SQL query into a Pandas DataFrame.

In [None]:
# (Re)load the `sql` IPython extension, then point the %sql magic at the
# `imdb` database of the MySQL server on localhost.
# NOTE(review): the connection URL embeds the database user and password in plain text.
%reload_ext sql
%sql mysql://root:dwdstudent2015@localhost:3306/imdb?charset=utf8

In [None]:
# Run the query through the %sql line magic and keep what it returns in `actors`.
actors = %sql SELECT * FROM actors

In [None]:
# Show the `keys` attribute of the query result; the next code cell passes it
# to pandas as the DataFrame column names.
actors.keys

In [None]:
import pandas as pd

# Wrap the query result in a DataFrame, using the result's keys as column labels.
df_actors = pd.DataFrame(
    data=actors,
    columns=actors.keys,
)
df_actors

In [None]:
# Summary statistics of the `first_name` column.
df_actors.loc[:, "first_name"].describe()

In [None]:
# How often each distinct `last_name` value occurs.
df_actors.loc[:, "last_name"].value_counts()

In [None]:
# (Re)load the `sql` IPython extension and switch the %sql magic to the
# `citibike` database of the MySQL server on localhost.
# NOTE(review): the connection URL embeds the database user and password in plain text.
%reload_ext sql
%sql mysql://root:dwdstudent2015@localhost:3306/citibike?charset=utf8

In [None]:
# Run the query through the %sql line magic and keep what it returns in `docks`.
docks = %sql SELECT * FROM citibike.Docks

In [None]:
# Wrap the query result in a DataFrame, using the result's keys as column labels.
df_docks = pd.DataFrame(
    data=docks,
    columns=docks.keys,
)
df_docks

In [None]:
%matplotlib inline
df_docks["number_of_docks"].hist()
df_docks["available_docks"].hist()

#### Importing into DataFrames using MySQLdb and cursors

In [None]:
import MySQLdb as mdb
import sys
import pandas
from contextlib import closing

# NOTE(review): credentials are hard-coded in the notebook.
host = 'localhost'
username = 'root'
password = 'dwdstudent2015'
database = 'imdb'

con = mdb.connect(host, username, password, database, charset='utf8', use_unicode=True)


# The dictionary cursor
# There are multiple cursor types in the MySQLdb module.
# The default cursor returns the data in a tuple of tuples.
# When we use a dictionary cursor, the data is sent in a form of Python dictionaries.
# This way we can refer to the data by their column names.
#
# closing() calls .close() on the cursor and on the connection when the block
# ends, even if the query raises, so neither is left open on the server.
# It also avoids depending on what `with con:` itself does, which is not the
# same across MySQLdb/mysqlclient versions.
with closing(con):
    with closing(con.cursor(mdb.cursors.DictCursor)) as cur:
        cur.execute("SELECT * FROM actors LIMIT 1000")
        actors = cur.fetchall()

# One dictionary per row -> one DataFrame row, with the dictionary keys as columns.
df_actors = pandas.DataFrame(list(actors))

    

### Further Examples with SQL and Pandas

In [None]:
# (Re)load the `sql` IPython extension and switch the %sql magic to the
# `facebook` database of the MySQL server on localhost.
# NOTE(review): the connection URL embeds the database user and password in plain text.
%reload_ext sql
%sql mysql://root:dwdstudent2015@localhost:3306/facebook?charset=utf8

In [None]:
# A little bit of setup code, just to make the plots look better
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)

Now let's run a query to get the political views of Facebook users, broken down by gender.

In [None]:
polviews_by_gender = %%sql \
SELECT Sex, `Political Views` AS PoliticalViews, COUNT(*) AS cnt \
FROM Profiles \
WHERE Sex IS NOT NULL AND `Political Views` IS NOT NULL \
GROUP BY Sex, `Political Views`  \
ORDER BY  `Political Views`, Sex

And let's get the dataframe:

In [None]:
# The sql magic extension offers its own idiom for this step:
#   df = polviews_by_gender.DataFrame()
#   df
# (see https://github.com/catherinedevlin/ipython-sql)

# Here the DataFrame is built by hand instead, with the result's keys as column labels:
df = pd.DataFrame(
    data=polviews_by_gender,
    columns=polviews_by_gender.keys,
)
df

In [None]:
# First attempt at a plot: a bar chart straight from the un-pivoted frame.
# (It does not look good yet.)
df.plot.bar()

In [None]:
# Reshape: one row per PoliticalViews value, one column per Sex value, `cnt` as the cell values.
dfp = df.pivot(
    index='PoliticalViews',
    columns='Sex',
    values='cnt',
)
dfp

In [None]:
# Bar chart of the pivoted counts.
dfp.plot.bar()

In [None]:
# Let's normalize the columns, as we have more females than males: each column
# becomes the fraction of that sex holding each political view.
# Series.sum() skips NaN. The builtin sum() would return NaN, and so blank out
# the whole column, if some political view had no rows for one of the sexes.
dfp["Female"] = dfp["Female"] / dfp["Female"].sum()
dfp["Male"] = dfp["Male"] / dfp["Male"].sum()
dfp

In [None]:
# Bar chart of the normalized columns.
dfp.plot.bar()

In [None]:
# OK, now let's try to re-order the list of results according to the logical structure
neworder = ['Very Liberal', 'Liberal', 'Moderate', 'Conservative', 'Very Conservative', 'Libertarian', 'Apathetic', 'Other']
# Map each known category to its position. A category that is not in `neworder`
# gets rank len(neworder), so it sorts after all known ones instead of making
# list.index() raise ValueError. sorted() is stable, so such categories keep
# their existing relative order.
rank = dict((view, position) for position, view in enumerate(neworder))
newindex = sorted(dfp.index, key=lambda view: rank.get(view, len(neworder)))
dfp = dfp.reindex(newindex)
dfp

In [None]:
# Bar chart with the rows in the new order.
dfp.plot.bar()

### Facebook, Favorite Books, and Political views

In [None]:
books = %%sql \
SELECT F.FavoriteBook, P.`Political Views` AS PoliticalViews, COUNT(*) AS cnt \
FROM Profiles P JOIN FavoriteBooks F ON F.ProfileID = P.ProfileId  \
WHERE `Political Views` IS NOT NULL AND F.FavoriteBook IS NOT NULL \
GROUP BY F.FavoriteBook, P.`Political Views`

In [None]:
# Wrap the query result in a DataFrame and preview its first ten rows.
df_books = pd.DataFrame(
    data=books,
    columns=books.keys,
)
df_books.head(10)

In [None]:
# Keep only the rows whose `cnt` is above 20.
df_books.loc[df_books['cnt'] > 20]

In [None]:
# Reshape: one row per FavoriteBook, one column per PoliticalViews value, `cnt` as the cell values.
dfp = df_books.pivot(
    index='FavoriteBook',
    columns='PoliticalViews',
    values='cnt',
)
dfp.head(10)

In [None]:
# We will normalize the columns: every column is divided by its own total.
import numpy as np

for political_view in dfp.columns:
    # nansum ignores the NaN cells of the column when computing its total.
    # Compute it once and reuse it for both the printout and the division.
    column_total = np.nansum(dfp[political_view])
    # A single pre-formatted string prints the same "name total" line whether
    # print is a statement (Python 2) or a function (Python 3).
    print("%s %s" % (political_view, column_total))
    dfp[political_view] = dfp[political_view] / column_total

dfp

In [None]:
# Ratio of the combined liberal columns to the combined conservative columns, per row.
dfp["Liberal_To_Conservative"] = (
    dfp["Liberal"].add(dfp["Very Liberal"])
    .div(dfp["Conservative"].add(dfp["Very Conservative"]))
)

In [None]:
# Rows with the highest liberal-to-conservative ratio first.
dfp.sort_values(by="Liberal_To_Conservative", ascending=False)

In [None]:
# Rows with the lowest liberal-to-conservative ratio first.
dfp.sort_values(by="Liberal_To_Conservative", ascending=True)