## Exploring a dataset in the notebook

### Provenance of the data

### How we cleaned up the data

### Downloading and loading a dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!wget https://github.com/ipython-books/minibook-2nd-data/blob/master/nyc_taxi.zip?raw=true -O nyc_taxi.zip

In [3]:
!unzip nyc_taxi.zip

In [4]:
# List the contents of the data/ directory (the CSV files loaded below live here).
%ls data

nyc_data.csv  nyc_fare.csv  [...]

In [5]:
# Relative paths of the two CSV files read by the following cells.
data_filename, fare_filename = 'data/nyc_data.csv', 'data/nyc_fare.csv'

In [6]:
# Read the main CSV file, asking pandas to parse both timestamp columns as datetimes.
data = pd.read_csv(
    data_filename,
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)

In [7]:
# Read the fare CSV file, parsing its pickup timestamp column as datetimes.
fare = pd.read_csv(
    fare_filename,
    parse_dates=['pickup_datetime'],
)

In [8]:
# Preview the first three rows of the table.
data.head(n=3)

### Making plots with matplotlib

In [9]:
# Show the column labels of the loaded table.
data.columns

Index(['medallion',
       ...
       'pickup_datetime',
       'dropoff_datetime',
       'passenger_count',
       'trip_time_in_secs',
       'trip_distance',
       'pickup_longitude',
       'pickup_latitude',
       'dropoff_longitude',
       'dropoff_latitude'], dtype='object')

In [10]:
# Pull out the pickup (px, py) and dropoff (dx, dy) coordinate columns.
px, py = data['pickup_longitude'], data['pickup_latitude']
dx, dy = data['dropoff_longitude'], data['dropoff_latitude']

In [11]:
# Display the pickup longitude column.
px

0        -73.955925
1        -74.005501
...
846943   -73.978477
846944   -73.987206
Name: pickup_longitude, Length: 846945, dtype: float64

In [12]:
# First attempt: scatter plot of every pickup position with matplotlib's defaults.
plt.scatter(x=px, y=py)

In [13]:
# Same scatter plot, drawn on an explicit 8x6 figure with tiny, almost transparent
# markers so that dense areas remain readable.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(px, py, linewidths=1e-2, s=1, alpha=.03)
# Restrict the view to a fixed longitude/latitude window.
ax.set_xlim(-74.1, -73.7)
ax.set_ylim(40.6, 40.9)
# Hide the axis lines, ticks and labels.
ax.axis('off')

### Descriptive statistics with pandas and seaborn

In [14]:
# Number of non-missing values, minimum and maximum of the pickup longitude.
px.count(), px.min(), px.max()

(846945, -74.098305000000025, -73.028472999999977)

In [15]:
# Mean, median and standard deviation of the pickup longitude.
px.mean(), px.median(), px.std()

(-73.975155092033091, -73.98209399999998, 0.03514209949581662)

In [16]:
!conda install seaborn -q -y

In [17]:
# Import seaborn (kept after the install cell so the import can succeed) and
# display the version that was loaded.
import seaborn as sns
sns.__version__

'0.5.1'

In [18]:
# Histogram of trip distances, using 100 evenly spaced bin edges between 0 and 10.
distance_bin_edges = np.linspace(0., 10., 100)
data['trip_distance'].hist(bins=distance_bin_edges)