In [2]:
import numpy as np # imports a fast numerical programming library
import scipy as sp #imports stats functions, amongst other things
import matplotlib as mpl # this actually imports matplotlib
import matplotlib.cm as cm #allows us easy access to colormaps
import matplotlib.pyplot as plt #sets up plotting under plt
import pandas as pd #lets us handle data as dataframes
# Configure how pandas renders tables: a wide console width, up to 100 visible
# columns, and HTML table output inside the notebook. set_option accepts
# several (option, value) pairs in one call.
pd.set_option(
    'display.width', 500,
    'display.max_columns', 100,
    'display.notebook_repr_html', True,
)
import seaborn as sns #sets up styles and gives us more plotting options


# The CSV has no header row, so the column labels are supplied explicitly,
# in the order the fields appear in the file.
column_names = ["rating", 'review_count', 'isbn', 'booktype', 'author_url',
                'year', 'genre_urls', 'dir', 'rating_count', 'name']
df = pd.read_csv("all.csv", header=None, names=column_names)
# Preview the first five rows.
df.head()

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.4,136455,439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
3,4.23,47906,61120081,good_reads:book,https://www.goodreads.com/author/show/1825.Har...,1960,/genres/classics|/genres/fiction|/genres/histo...,dir01/2657.To_Kill_a_Mockingbird.html,2078123,To Kill a Mockingbird
4,4.23,34772,679783261,good_reads:book,https://www.goodreads.com/author/show/1265.Jan...,1813,/genres/classics|/genres/fiction|/genres/roman...,dir01/1885.Pride_and_Prejudice.html,1388992,Pride and Prejudice


# The basics

In [3]:
# Data type of each column in the DataFrame; `object` usually means the
# column holds strings or mixed values rather than pure numbers.
df.dtypes 

rating          float64
review_count     object
isbn             object
booktype         object
author_url       object
year            float64
genre_urls       object
dir              object
rating_count     object
name             object
dtype: object

In [4]:
# Dimensions of the DataFrame as a (number of rows, number of columns) tuple
df.shape 

(6000, 10)

6000 rows times 10 columns. A spreadsheet is a table is a matrix.

In [5]:
# Unpack the shape tuple into its two components: row count, then column count
n_rows, n_cols = df.shape
n_rows, n_cols

(6000, 10)

In [6]:
# Column labels of the DataFrame, returned as a pandas Index
df.columns 

Index([u'rating', u'review_count', u'isbn', u'booktype', u'author_url', u'year', u'genre_urls', u'dir', u'rating_count', u'name'], dtype='object')

In [7]:
# A DataFrame is a set of Series (one per column) placed side by side:
# selecting a single column gives a Series, the whole table is a DataFrame.
type(df['rating']), type(df)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

# Query

In [8]:
# Element-wise comparison: one boolean per book, True where the rating is below 3
df['rating'].lt(3)

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
5970    False
5971    False
5972    False
5973    False
5974    False
5975    False
5976    False
5977    False
5978    False
5979     True
5980    False
5981    False
5982    False
5983    False
5984    False
5985    False
5986    False
5987    False
5988    False
5989    False
5990    False
5991    False
5992    False
5993    False
5994    False
5995    False
5996    False
5997    False
5998    False
5999    False
Name: rating, dtype: bool

In [9]:
# Summing booleans counts the Trues (True is 1, False is 0)
(df.rating < 3).sum()

4

In [10]:
# Fraction of books rated below 3: the count of Trues over the number of rows.
# float() forces true division even under Python 2 integer semantics.
low_rated_count = np.sum(df.rating < 3)
low_rated_count / float(len(df))

0.00066666666666666664

In [11]:
# The mean of a boolean mask is the fraction of Trues, so this gives the
# share of ratings below 3 in a single step.
np.mean(df['rating'] < 3.0)

0.00066666666666666664

In [12]:
# Same fraction again, this time using the Series' own .mean() method
df.rating.lt(3).mean()

0.00066666666666666664

# Filtering

In [13]:
# filter with a query expression string: keep only the rows whose rating is above 4.5
df.query("rating > 4.5") 

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
17,4.58,1314,0345538374,good_reads:book,https://www.goodreads.com/author/show/656983.J...,1973,/genres/fantasy|/genres/classics|/genres/scien...,dir01/30.J_R_R_Tolkien_4_Book_Boxed_Set.html,68495,J.R.R. Tolkien 4-Book Boxed Set
162,4.55,15777,075640407X,good_reads:book,https://www.goodreads.com/author/show/108424.P...,2007,/genres/fantasy|/genres/fiction,dir02/186074.The_Name_of_the_Wind.html,210018,The Name of the Wind (The Kingkiller Chronicle...
222,4.53,15256,055357342X,good_reads:book,https://www.goodreads.com/author/show/346732.G...,2000,/genres/fantasy|/genres/fiction|/genres/fantas...,dir03/62291.A_Storm_of_Swords.html,327992,"A Storm of Swords (A Song of Ice and Fire, #3)"
242,4.53,5404,0545265355,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2010,/genres/young-adult|/genres/fiction|/genres/fa...,dir03/7938275-the-hunger-games-trilogy-boxset....,102330,The Hunger Games Trilogy Boxset (The Hunger Ga...
249,4.80,644,0740748475,good_reads:book,https://www.goodreads.com/author/show/13778.Bi...,2005,/genres/sequential-art|/genres/comics|/genres/...,dir03/24812.The_Complete_Calvin_and_Hobbes.html,22674,The Complete Calvin and Hobbes
284,4.58,15195,1406321346,good_reads:book,https://www.goodreads.com/author/show/150038.C...,2013,/genres/fantasy|/genres/young-adult|/genres/fa...,dir03/18335634-clockwork-princess.html,130161,"Clockwork Princess (The Infernal Devices, #3)"
304,4.54,572,0140259449,good_reads:book,https://www.goodreads.com/author/show/1265.Jan...,1933,/genres/classics|/genres/fiction|/genres/roman...,dir04/14905.The_Complete_Novels.html,17539,The Complete Novels
386,4.55,8820,0756404738,good_reads:book,https://www.goodreads.com/author/show/108424.P...,2011,/genres/fantasy|/genres/fantasy|/genres/epic-f...,dir04/1215032.The_Wise_Man_s_Fear.html,142499,"The Wise Man's Fear (The Kingkiller Chronicle,..."
400,4.53,9292,1423140605,good_reads:book,https://www.goodreads.com/author/show/15872.Ri...,2012,/genres/fantasy|/genres/young-adult|/genres/fa...,dir05/12127750-the-mark-of-athena.html,128412,"The Mark of Athena (The Heroes of Olympus, #3)"
475,4.57,824,1416997857,good_reads:book,https://www.goodreads.com/author/show/150038.C...,2009,/genres/fantasy|/genres/young-adult|/genres/fa...,dir05/6485421-the-mortal-instruments-boxed-set...,39720,The Mortal Instruments Boxed Set (The Mortal I...


In [14]:
# Another way to filter: build boolean masks, combine them with &,
# and index the DataFrame with the combined mask.
negative_year = df.year < 0
rated_above_4 = df.rating > 4
df[negative_year & rated_above_4]

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
246,4.01,365,147712556,good_reads:book,https://www.goodreads.com/author/show/903.Homer,-800,/genres/classics|/genres/fantasy|/genres/mytho...,dir03/1375.The_Iliad_The_Odyssey.html,35123,The Iliad/The Odyssey
746,4.06,1087,140449183,good_reads:book,https://www.goodreads.com/author/show/5158478....,-500,/genres/classics|/genres/spirituality|/genres/...,dir08/99944.The_Bhagavad_Gita.html,31634,The Bhagavad Gita
1397,4.03,890,192840509,good_reads:book,https://www.goodreads.com/author/show/12452.Aesop,-560,/genres/classics|/genres/childrens|/genres/lit...,dir14/21348.Aesop_s_Fables.html,71259,Aesop's Fables
1882,4.02,377,872205541,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-400,/genres/philosophy|/genres/classics|/genres/no...,dir19/22632.The_Trial_and_Death_of_Socrates.html,18712,The Trial and Death of Socrates
3133,4.3,131,872203492,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-400,/genres/philosophy|/genres/classics|/genres/no...,dir32/9462.Complete_Works.html,7454,Complete Works
4475,4.11,281,865163480,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-390,/genres/philosophy|/genres/classics|/genres/no...,dir45/73945.Apology.html,11478,Apology
5367,4.07,133,872206335,good_reads:book,https://www.goodreads.com/author/show/879.Plato,-360,/genres/philosophy|/genres/classics|/genres/no...,dir54/30292.Five_Dialogues.html,9964,Five Dialogues


# Cleaning

First check the datatypes. Notice that review_count and rating_count are of type object (which means they are either strings or pandas couldn't figure out what they are), while year is a float.

In [15]:
# Column dtypes before any cleaning has been applied
df.dtypes

rating          float64
review_count     object
isbn             object
booktype         object
author_url       object
year            float64
genre_urls       object
dir              object
rating_count     object
name             object
dtype: object

If we try to convert these columns directly to int, we will get an exception, because some of the cells contain the string "None".

In [16]:
# Try the integer conversion directly. This is expected to fail because some
# cells hold the string 'None'. The error is caught and printed so that the
# demonstration is still visible while "Restart & Run All" can carry on past
# this cell instead of stopping at the exception.
try:
    df['rating_count'] = df.rating_count.astype(int)
    df['review_count'] = df.review_count.astype(int)
    df['year'] = df.year.astype(int)
except ValueError as err:
    # %-formatting keeps the output identical under Python 2 and Python 3
    print("ValueError: %s" % err)

ValueError: invalid literal for long() with base 10: 'None'

In [17]:
# Show the rows whose year is missing
year_missing = df['year'].isnull()
df[year_missing]

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
2442,4.23,526.0,,good_reads:book,https://www.goodreads.com/author/show/623606.A...,,/genres/religion|/genres/islam|/genres/non-fic...,dir25/1301625.La_Tahzan.html,4134.0,La Tahzan
2869,4.61,2.0,,good_reads:book,https://www.goodreads.com/author/show/8182217....,,,dir29/22031070-my-death-experiences---a-preach...,23.0,My Death Experiences - A Preacherâs 18 Apoca...
3643,,,,,,,,dir37/9658936-harry-potter.html,,
5282,,,,,,,,dir53/113138.The_Winner.html,,
5572,3.71,35.0,8423336603.0,good_reads:book,https://www.goodreads.com/author/show/285658.E...,,/genres/fiction,dir56/890680._rase_una_vez_el_amor_pero_tuve_q...,403.0,Ãrase una vez el amor pero tuve que matarlo. ...
5658,4.32,44.0,,good_reads:book,https://www.goodreads.com/author/show/25307.Ro...,,/genres/fantasy|/genres/fantasy|/genres/epic-f...,dir57/5533041-assassin-s-apprentice-royal-assa...,3850.0,Assassin's Apprentice / Royal Assassin (Farsee...
5683,4.56,204.0,,good_reads:book,https://www.goodreads.com/author/show/3097905....,,/genres/fantasy|/genres/young-adult|/genres/ro...,dir57/12474623-tiger-s-dream.html,895.0,"Tiger's Dream (The Tiger Saga, #5)"


In [18]:
# Get rid of the null data: keep only the rows that have a year.
# The explicit .copy() makes the result an independent DataFrame rather than
# a slice of the original, so assigning to its columns afterwards does not
# trigger pandas' SettingWithCopyWarning.
df = df[df.year.notnull()].copy()
df.shape

(5993, 10)

In [19]:
# Retry the conversion now that the rows with a missing year are gone:
# cast each of the three numeric columns to int, then inspect the dtypes.
for column in ('rating_count', 'review_count', 'year'):
    df[column] = df[column].astype(int)
df.dtypes

rating          float64
review_count      int64
isbn             object
booktype         object
author_url       object
year              int64
genre_urls       object
dir              object
rating_count      int64
name             object
dtype: object

# Visualizing

In [20]:
# Histogram of the rating column (the default bin count is used)
df['rating'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x105dfeb50>