### Using Pandas DataFrames to store SQL results

In [7]:
import pandas as pd

#### Using sql magic extension



The code below demonstrates how to convert the results of a SQL query into a Pandas DataFrame.

In [1]:
# (Re)load the ipython-sql extension so the %sql / %%sql magics are available.
%reload_ext sql
# Open a connection to the `imdb` MySQL database on localhost (utf8 charset).
# NOTE(review): the password is embedded in this URL and is saved with the
# notebook -- consider supplying the connection string from outside the notebook.
%sql mysql://root:dwdstudent2015@localhost:3306/imdb?charset=utf8

  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")


u'Connected: root@imdb'

In [None]:
# Run the query through the %sql line magic and keep its result set in `actors`.
# NOTE(review): there is no LIMIT clause, so every row of the table is requested.
actors = %sql SELECT * FROM actors

In [None]:
# Show the `keys` attribute of the result set; the following cell passes it
# to pandas as the DataFrame column labels.
actors.keys

In [None]:
# Wrap the SQL result set in a DataFrame, labelling the columns with the
# keys reported by the result set, then display it.
df_actors = pd.DataFrame(
    data=actors,
    columns=actors.keys,
)
df_actors

In [None]:
# Summary statistics for the first-name column.
df_actors.loc[:, "first_name"].describe()

In [None]:
# Frequency of each last name, most common first.
df_actors.loc[:, "last_name"].value_counts()

#### Importing into DataFrames using MySQLdb and cursors

In [None]:
import MySQLdb as mdb
import sys
import pandas

# Connection settings for the local MySQL server.
# NOTE(review): the credentials are hard-coded and saved with the notebook --
# consider reading them from the environment instead.
host = 'localhost'
username = 'root'
password = 'dwdstudent2015'
database = 'imdb'

con = mdb.connect(host, username, password, database, charset='utf8', use_unicode=True)
try:
    # A DictCursor yields each row as a dict keyed by column name, which lets
    # pandas derive the column labels from the rows themselves.
    cur = con.cursor(mdb.cursors.DictCursor)
    try:
        cur.execute("SELECT * FROM actors LIMIT 1000")
        actors = cur.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cur.close()
finally:
    # Always give the connection back to the server; the rows are already
    # fetched into `actors`, so nothing below needs it.
    con.close()

df_actors = pandas.DataFrame(list(actors))
   

### Further Examples with SQL and Pandas

In [3]:
# (Re)load the ipython-sql extension so the %sql / %%sql magics are available.
%reload_ext sql
# Switch the connection to the `facebook` MySQL database on localhost (utf8 charset).
# NOTE(review): the password is embedded in this URL and is saved with the
# notebook -- consider supplying the connection string from outside the notebook.
%sql mysql://root:dwdstudent2015@localhost:3306/facebook?charset=utf8

u'Connected: root@facebook'

In [None]:
# A little bit of setup code, just to make the plots look better
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)

Now let's run a query to get the political views of Facebook users, broken down by gender.

In [None]:
# Count profiles for every (Sex, Political Views) pair, skipping rows where
# either field is NULL, and keep the result set in `polviews_by_gender`.
# The aliases PoliticalViews and cnt become the column names used downstream.
polviews_by_gender = %%sql \
SELECT Sex, `Political Views` AS PoliticalViews, COUNT(*) AS cnt \
FROM Profiles \
WHERE Sex IS NOT NULL AND `Political Views` IS NOT NULL \
GROUP BY Sex, `Political Views`  \
ORDER BY  `Political Views`, Sex

And let's get the dataframe:

In [None]:
# The sql magic extension offers its own shortcut for this conversion:
#   df = polviews_by_gender.DataFrame()
#   df
# (see https://github.com/catherinedevlin/ipython-sql)

# Here we build the frame explicitly instead, labelling the columns with
# the keys of the result set.
df = pd.DataFrame(
    data=polviews_by_gender,
    columns=polviews_by_gender.keys,
)
df

In [None]:
# First attempt at a bar chart, straight from the long-format frame.
# The result is not very readable, which motivates the pivot that follows.
df.plot.bar()

In [None]:
# Reshape from long to wide format: one row per political view, one column
# per sex, with the `cnt` values as the cells.
dfp = df.pivot(index='PoliticalViews', columns='Sex', values='cnt')
dfp

In [None]:
# Bar chart of the pivoted counts: one group of bars per political view.
dfp.plot.bar()

In [None]:
# Normalize each gender column so it sums to 1. The raw counts are not
# comparable because the data contains more women than men.
# Series.sum() skips missing cells, whereas the builtin sum() would return NaN
# (and blank the whole column) if any political view had no rows for a gender.
dfp["Female"] = dfp["Female"] / dfp["Female"].sum()
dfp["Male"] = dfp["Male"] / dfp["Male"].sum()
dfp

In [None]:
# Bar chart of the current contents of dfp.
dfp.plot.bar()

In [None]:
# Re-order the rows so they follow the logical political spectrum rather than
# alphabetical order.
neworder = ['Very Liberal', 'Liberal', 'Moderate', 'Conservative', 'Very Conservative', 'Libertarian', 'Apathetic', 'Other']
# Map each label to its position once instead of scanning the list per key.
rank = {view: position for position, view in enumerate(neworder)}
# Labels missing from `neworder` are placed after the known ones (keeping their
# existing relative order, since sorted() is stable) instead of raising ValueError.
newindex = sorted(dfp.index, key=lambda view: rank.get(view, len(neworder)))
dfp = dfp.reindex(newindex)
dfp

In [None]:
# Bar chart of dfp, drawn in its current row order.
dfp.plot.bar()

### Facebook, Favorite Books, and Political Views

In [8]:
# Count, for every (favorite book, political view) pair, how many profiles list
# that book, skipping rows where either field is NULL. Profiles and
# FavoriteBooks are joined on the profile id; the result set is kept in `books`.
books = %%sql \
SELECT F.FavoriteBook, P.`Political Views` AS PoliticalViews, COUNT(*) AS cnt \
FROM Profiles P JOIN FavoriteBooks F ON F.ProfileID = P.ProfileId  \
WHERE `Political Views` IS NOT NULL AND F.FavoriteBook IS NOT NULL \
GROUP BY F.FavoriteBook, P.`Political Views`

32387 rows affected.


In [17]:
# Convert the result set into a DataFrame (columns named after the result
# keys) and preview the first ten rows.
df_books = pd.DataFrame(
    data=books,
    columns=books.keys,
)
df_books.head(10)

Unnamed: 0,FavoriteBook,PoliticalViews,cnt
0,000 In College Loans And A Meaningless Diploma...,Liberal,1
1,000 Leagues Under The Sea,Liberal,2
2,000 Leagues Under The Sea,Very Liberal,2
3,000 Places To See Before You Die,Apathetic,1
4,000 Things To Be Happy About,Liberal,1
5,1,Apathetic,1
6,1,Liberal,1
7,1 3 Of Whatever Is Currently On Each Class Syl...,Moderate,1
8,1 Bachelorette,Liberal,1
9,1 Fish 2 Fish Red Fish Blue Fish,Moderate,1


In [18]:
# Optional (currently disabled): keep only the books with enough fans.
# Uncomment the next line to apply the filter before pivoting.
# df_books = df_books[ df_books['cnt'] > 5]

In [19]:
# Reshape from long to wide format: one row per book, one column per
# political view, with the `cnt` values as the cells. Combinations that do
# not occur in df_books show up as NaN.
dfp = df_books.pivot(index='FavoriteBook', columns='PoliticalViews', values='cnt')
dfp.head(10)

PoliticalViews,Apathetic,Conservative,Liberal,Libertarian,Moderate,Other,Very Conservative,Very Liberal
FavoriteBook,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
000 In College Loans And A Meaningless Diploma Quot,,,1.0,,,,,
000 Leagues Under The Sea,,,2.0,,,,,2.0
000 Places To See Before You Die,1.0,,,,,,,
000 Things To Be Happy About,,,1.0,,,,,
1,1.0,,1.0,,,,,
1 3 Of Whatever Is Currently On Each Class Syllabus,,,,,1.0,,,
1 Bachelorette,,,1.0,,,,,
1 Fish 2 Fish Red Fish Blue Fish,,,,,1.0,,,
1 Timothy And 2 Timothy,,,,,1.0,,,
1 Trainspotting Irvine Welsh 2 Fight Club Chuck Palahniuk 3 Requiem For A Dream Herbert Selby Jr 4 High Fidelity Nick Hornby 5 Songbook Nick Hornby,1.0,,,,,,,


In [20]:
# Normalize every political-view column so that it sums to 1, making the
# columns comparable despite very different group sizes.
import numpy as np

for political_view in dfp.columns:
    # Total of the column, ignoring the NaN cells left by the pivot.
    # Computed once and reused for both the report and the division.
    view_total = np.nansum(dfp[political_view])
    # String formatting keeps the printed line identical on Python 2 and 3.
    print("%s %s" % (political_view, view_total))
    dfp[political_view] = dfp[political_view] / view_total

dfp

Apathetic 2881.0
Conservative 3318.0
Liberal 31253.0
Libertarian 1593.0
Moderate 12173.0
Other 3772.0
Very Conservative 627.0
Very Liberal 12630.0


PoliticalViews,Apathetic,Conservative,Liberal,Libertarian,Moderate,Other,Very Conservative,Very Liberal
FavoriteBook,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
000 In College Loans And A Meaningless Diploma Quot,,,0.000032,,,,,
000 Leagues Under The Sea,,,0.000064,,,,,0.000158
000 Places To See Before You Die,0.000347,,,,,,,
000 Things To Be Happy About,,,0.000032,,,,,
1,0.000347,,0.000032,,,,,
1 3 Of Whatever Is Currently On Each Class Syllabus,,,,,0.000082,,,
1 Bachelorette,,,0.000032,,,,,
1 Fish 2 Fish Red Fish Blue Fish,,,,,0.000082,,,
1 Timothy And 2 Timothy,,,,,0.000082,,,
1 Trainspotting Irvine Welsh 2 Fight Club Chuck Palahniuk 3 Requiem For A Dream Herbert Selby Jr 4 High Fidelity Nick Hornby 5 Songbook Nick Hornby,0.000347,,,,,,,


In [29]:
# Combined (normalized) popularity of each book on either side of the spectrum.
# NOTE(review): adding two columns yields NaN whenever either cell is NaN, so a
# book missing from any of the four columns ends up with NaN ratios.
liberal_share = dfp["Liberal"] + dfp["Very Liberal"]
conservative_share = dfp["Conservative"] + dfp["Very Conservative"]

# Relative popularity in both directions.
dfp["Liberal_To_Conservative"] = liberal_share / conservative_share
dfp["Conservative_To_Liberal"] = conservative_share / liberal_share

In [30]:
# Top 50 books by liberal-to-conservative ratio, showing only that column.
# (Sorting the full frame instead would display every column.)
ratio_column = "Liberal_To_Conservative"
dfp[[ratio_column]].sort_values(ratio_column, ascending=False).head(50)

PoliticalViews,Liberal_To_Conservative
FavoriteBook,Unnamed: 1_level_1
One Hundred Years Of Solitude,2.831502
The Bell Jar,2.635832
On The Road,2.124912
Catch 22,1.86695
The Perks Of Being A Wallflower,1.786482
Lolita,1.763747
Franny And Zooey,1.749649
America The Book,1.556607
The Stranger,1.552559
Me Talk Pretty One Day,1.529344


In [31]:
# Top 50 books by conservative-to-liberal ratio, showing only that column.
ratio_column = "Conservative_To_Liberal"
dfp[[ratio_column]].sort_values(ratio_column, ascending=False).head(50)

PoliticalViews,Conservative_To_Liberal
FavoriteBook,Unnamed: 1_level_1
Liar S Poker,42.689381
War And Peace,17.873188
The Apology,17.056968
C S Lewis,15.350022
The Idiot,14.91117
Mere Christianity,14.229794
Republic,13.244934
All The Kings Men,13.244934
Exodus,12.594917
The Bible,12.469576
