In [1]:
import json

import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let pandas use up to 500 characters of line width and show up to 30 columns
# before it starts wrapping or eliding the printed frame.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# These colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' is deprecated (it triggers an mplDeprecation warning);
# its replacement 'axes.prop_cycle' takes a Cycler object instead of a bare
# list of colors.
rcParams['axes.prop_cycle'] = plt.cycler('color', dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Reduce chartjunk by hiding plot borders (spines) and their tick marks.

    axes is the Axes to modify; when it is omitted (or falsy) the current
    axes from plt.gca() is used.

    Each of the top/right/left/bottom keywords says whether that side's
    border is drawn. Ticks are first switched off on both axes and then
    turned back on only for the sides whose keyword is true.
    """
    target = axes if axes else plt.gca()

    # show or hide each spine according to its keyword
    spine_flags = (('top', top), ('right', right),
                   ('left', left), ('bottom', bottom))
    for side, shown in spine_flags:
        target.spines[side].set_visible(shown)

    # start from a clean slate: no ticks on either axis
    for axis in (target.yaxis, target.xaxis):
        axis.set_ticks_position('none')

    # restore ticks only on the sides that are still drawn; the order
    # within each axis (top before bottom, left before right) is kept
    tick_enablers = ((top, target.xaxis.tick_top),
                     (bottom, target.xaxis.tick_bottom),
                     (left, target.yaxis.tick_left),
                     (right, target.yaxis.tick_right))
    for wanted, enable_ticks in tick_enablers:
        if wanted:
            enable_ticks()

  mplDeprecation)


In [10]:
from io import StringIO

# Download the tab-separated movie table. raise_for_status() makes an HTTP
# error fail here instead of handing an error page to the CSV parser.
response = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat')
response.raise_for_status()
movie_txt = response.text
movie_file = StringIO(movie_txt)  # treat a string like a file
movies = pd.read_csv(movie_file, delimiter='\t')

# print the first row (parenthesized single-argument print works on both
# Python 2 and Python 3)
print(movies[['id', 'title', 'imdbID', 'year']].loc[0])
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print would only add a stray "None" line.
movies.info()

id                1
title     Toy story
imdbID       114709
year           1995
Name: 0, dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9423 entries, 0 to 9422
Data columns (total 21 columns):
id                        9423 non-null int64
title                     9423 non-null object
imdbID                    9423 non-null int64
spanishTitle              9423 non-null object
imdbPictureURL            9247 non-null object
year                      9423 non-null int64
rtID                      9126 non-null object
rtAllCriticsRating        9423 non-null object
rtAllCriticsNumReviews    9423 non-null object
rtAllCriticsNumFresh      9423 non-null object
rtAllCriticsNumRotten     9423 non-null object
rtAllCriticsScore         9423 non-null object
rtTopCriticsRating        9423 non-null object
rtTopCriticsNumReviews    9423 non-null object
rtTopCriticsNumFresh      9423 non-null object
rtTopCriticsNumRotten     9423 non-null object
rtTopCriticsScore         9423 non-null o