In [1]:
import numpy as np # imports a fast numerical programming library
import scipy as sp #imports stats functions, amongst other things
import matplotlib as mpl # this actually imports matplotlib
import matplotlib.cm as cm #allows us easy access to colormaps
import matplotlib.pyplot as plt #sets up plotting under plt
import pandas as pd #lets us handle data as dataframes
# Configure how pandas renders tables: a wide display, up to 100 columns shown,
# and HTML table output in the notebook. set_option accepts several
# (option, value) pairs in a single call.
pd.set_option(
    'display.width', 500,
    'display.max_columns', 100,
    'display.notebook_repr_html', True,
)
import seaborn as sns #sets up styles and gives us more plotting options


# Read all.csv into a DataFrame. header=None tells pandas that no row of the
# file is a header, so the column labels are supplied explicitly via `names`.
df = pd.read_csv(
    "all.csv",
    header=None,
    names=[
        "rating",
        "review_count",
        "isbn",
        "booktype",
        "author_url",
        "year",
        "genre_urls",
        "dir",
        "rating_count",
        "name",
    ],
)
# Preview the first five rows.
df.head()

Unnamed: 0,rating,review_count,isbn,booktype,author_url,year,genre_urls,dir,rating_count,name
0,4.4,136455,439023483,good_reads:book,https://www.goodreads.com/author/show/153394.S...,2008,/genres/young-adult|/genres/science-fiction|/g...,dir01/2767052-the-hunger-games.html,2958974,"The Hunger Games (The Hunger Games, #1)"
1,4.41,16648,439358078,good_reads:book,https://www.goodreads.com/author/show/1077326....,2003,/genres/fantasy|/genres/young-adult|/genres/fi...,dir01/2.Harry_Potter_and_the_Order_of_the_Phoe...,1284478,Harry Potter and the Order of the Phoenix (Har...
2,3.56,85746,316015849,good_reads:book,https://www.goodreads.com/author/show/941441.S...,2005,/genres/young-adult|/genres/fantasy|/genres/ro...,dir01/41865.Twilight.html,2579564,"Twilight (Twilight, #1)"
3,4.23,47906,61120081,good_reads:book,https://www.goodreads.com/author/show/1825.Har...,1960,/genres/classics|/genres/fiction|/genres/histo...,dir01/2657.To_Kill_a_Mockingbird.html,2078123,To Kill a Mockingbird
4,4.23,34772,679783261,good_reads:book,https://www.goodreads.com/author/show/1265.Jan...,1813,/genres/classics|/genres/fiction|/genres/roman...,dir01/1885.Pride_and_Prejudice.html,1388992,Pride and Prejudice


# The basics

In [None]:
# Show the dtype pandas assigned to each column of the DataFrame
# (the result has one entry per column).
df.dtypes 

In [None]:
# The shape of the DataFrame as a (number of rows, number of columns) tuple
df.shape 

6000 rows by 10 columns. A spreadsheet is a table is a matrix.

In [None]:
# number of rows (length of the row index), number of columns (length of the column index)
len(df.index), len(df.columns)

In [None]:
# These are the column names (the column labels of the DataFrame)
df.columns 

In [None]:
# A DataFrame is a set of "series" pasted together side by side:
# selecting a single column yields a Series, while the whole table is a DataFrame.
type(df["rating"]), type(df)

# Query

In [None]:
# Build a boolean mask: True for every book whose rating is below 3, False otherwise
df["rating"].lt(3)

In [None]:
# Summing a boolean mask counts its True entries, i.e. the number of books rated below 3
(df["rating"] < 3).sum()

In [None]:
# Fraction of books rated below 3: count of True entries divided by the number of rows
(df["rating"] < 3).sum() / float(len(df))

In [None]:
# The mean of a boolean mask is the fraction of True entries,
# so this is again the share of books rated below 3 (not the mean rating).
np.mean(df["rating"].lt(3.0))

In [None]:
# Same fraction, computed with the Series' own .mean() method instead of np.mean
df["rating"].lt(3).mean()

# Filtering

In [None]:
# Filter with a query string: keep only the rows whose rating is greater than 4.5
df.query(expr="rating > 4.5")

In [None]:
# Another way to filter: combine two boolean masks with & and use the result
# to select rows (books with a negative year and a rating above 4).
df.loc[df["year"].lt(0) & df["rating"].gt(4)]

# Cleaning

First check the datatypes. Notice that review_count and rating_count are of type object (which means they are either strings or pandas couldn't figure out what they are), while year is a float.

In [None]:
# Show the dtype of every column before attempting any conversion
df.dtypes

If we try to convert these columns directly to int, we will get exceptions, because some of the cells may contain "None".

In [None]:
# Demonstration: a direct cast to int is expected to fail here (the notebook
# text notes that some cells may contain "None").
# All three columns are cast in one astype() call and the result is assigned
# back only on success, so a failure leaves df untouched. The error is caught
# and printed so that "Restart Kernel -> Run All" does not stop at this cell.
try:
    df = df.astype({"rating_count": int, "review_count": int, "year": int})
except (ValueError, TypeError) as err:
    print("Conversion to int failed: {}".format(err))

In [None]:
# Show the rows whose year is missing
df.loc[df["year"].isnull()]

In [None]:
# Get rid of the rows with a missing year. The explicit .copy() makes the
# result an independent DataFrame rather than a slice of the original, so
# later column assignments on df do not trigger SettingWithCopyWarning.
df = df[df.year.notnull()].copy()
df.shape

In [None]:
# Let's try again: cast the two count columns and year to int.
# astype() with a {column: dtype} mapping returns a new DataFrame, so nothing
# is assigned into a filtered slice (which would raise SettingWithCopyWarning)
# and the three casts either all succeed or df is left unchanged.
df = df.astype({"rating_count": int, "review_count": int, "year": int})
df.dtypes

# Visualizing

In [3]:
# Histogram of the rating column
df["rating"].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x105e16c10>