# Selecting one week of taxi data (Oct. 6, 2014 - Oct. 12, 2014)

**Import libraries: pandas & NumPy**

In [1]:
import pandas as pd
import numpy as np

**Load taxi data for the full month (October 2014)**

In [2]:
# Read the full October 2014 yellow-cab trip file into memory.
# NOTE: every header name after 'vendor_id' is parsed with a leading space
# (e.g. ' pickup_datetime'), so later cells must index columns with that space.
taxiData = pd.read_csv(filepath_or_buffer='../data/yellow_tripdata_2014-10.csv', sep=',')

**Explore taxi data: `head()`, `shape`, `dtypes` & `columns`**

In [3]:
# Preview the first five trips to sanity-check the parsed columns.
taxiData.head(n=5)

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2014-10-01 08:55:07,2014-10-01 09:11:03,1,2.2,-74.005867,40.73757,1,Y,-74.015534,40.708277,CRD,12.0,0.0,0.5,1.0,0.0,13.5
1,CMT,2014-10-01 10:51:17,2014-10-01 11:26:11,1,15.7,-73.873193,40.774056,1,Y,-73.999846,40.631132,CRD,45.5,0.0,0.5,9.2,0.0,55.2
2,CMT,2014-10-01 02:03:03,2014-10-01 02:06:55,1,1.0,0.0,0.0,1,N,0.0,0.0,CRD,5.0,0.5,0.5,1.0,0.0,7.0
3,CMT,2014-10-01 00:06:35,2014-10-01 00:17:05,2,2.5,-73.987151,40.732922,1,N,-73.991831,40.758148,CRD,10.0,0.5,0.5,2.2,0.0,13.2
4,CMT,2014-10-01 01:34:13,2014-10-01 01:47:02,1,4.2,-73.983267,40.726577,1,N,-73.937556,40.71638,CRD,15.0,0.5,0.5,3.2,0.0,19.2


In [35]:
# (rows, columns) of the full-month frame; one row per recorded trip.
taxiData.shape

(14232487, 18)

In [6]:
# Per-column dtypes as inferred by read_csv; the two datetime columns come in
# as plain strings (object), not as datetime64.
taxiData.dtypes

vendor_id               object
 pickup_datetime        object
 dropoff_datetime       object
 passenger_count         int64
 trip_distance         float64
 pickup_longitude      float64
 pickup_latitude       float64
 rate_code               int64
 store_and_fwd_flag     object
 dropoff_longitude     float64
 dropoff_latitude      float64
 payment_type           object
 fare_amount           float64
 surcharge             float64
 mta_tax               float64
 tip_amount            float64
 tolls_amount          float64
 total_amount          float64
dtype: object

In [15]:
# Show the column labels exactly as parsed, which exposes the leading space
# carried by every name except 'vendor_id'.
taxiData.columns

Index(['vendor_id', ' pickup_datetime', ' dropoff_datetime',
       ' passenger_count', ' trip_distance', ' pickup_longitude',
       ' pickup_latitude', ' rate_code', ' store_and_fwd_flag',
       ' dropoff_longitude', ' dropoff_latitude', ' payment_type',
       ' fare_amount', ' surcharge', ' mta_tax', ' tip_amount',
       ' tolls_amount', ' total_amount'],
      dtype='object')

**Test conversion to `datetime` data type**

In [20]:
# Dry run: parse the whole pickup column to datetime64 (nothing is stored)
# and display the first five converted values.
pd.to_datetime(taxiData.loc[:, ' pickup_datetime']).head(n=5)

0   2014-10-01 08:55:07
1   2014-10-01 10:51:17
2   2014-10-01 02:03:03
3   2014-10-01 00:06:35
4   2014-10-01 01:34:13
Name:  pickup_datetime, dtype: datetime64[ns]

**Create a new dataframe with just the trips that started between Oct. 6, 2014 and Oct. 12, 2014 (both days inclusive)**

In [34]:
# Parse the pickup timestamps once; the original expression ran the same
# string -> datetime conversion twice over the full month of rows.
pickupTimes = pd.to_datetime(taxiData[' pickup_datetime'])

# Half-open window [2014-10-06 00:00:00, 2014-10-13 00:00:00): using a strict
# upper bound on the following Monday keeps every trip from Oct. 12 up to
# 23:59:59 without having to spell out an end-of-day timestamp.
inSelectedWeek = (pickupTimes >= '2014-10-06') & (pickupTimes < '2014-10-13')

# Boolean-mask the original frame; column dtypes are left as loaded, so
# ' pickup_datetime' in selectedData is still a string column.
selectedData = taxiData[inSelectedWeek]

**Explore new dataframe: `shape`, `max()` & `min()`**

In [36]:
# (rows, columns) of the one-week subset; the column count should match the
# full-month frame since rows were only filtered.
selectedData.shape

(3217091, 18)

In [37]:
# Latest pickup in the subset. The column still holds strings, so this is a
# lexicographic max; it matches chronological order only because the
# timestamps use the fixed-width 'YYYY-MM-DD HH:MM:SS' layout.
selectedData[' pickup_datetime'].max()

'2014-10-12 23:59:59'

In [38]:
# Earliest pickup in the subset. As with max(), this compares strings, which
# is safe here because of the fixed-width 'YYYY-MM-DD HH:MM:SS' layout.
selectedData[' pickup_datetime'].min()

'2014-10-06 00:00:00'

**Export selected trips to `.csv` file**

In [39]:
# Persist the one-week subset. index=True is the pandas default and is spelled
# out on purpose: the original (non-contiguous) row labels are written as a
# leading unnamed column, so the output has one more column than the input.
# NOTE(review): switch to index=False if no downstream reader relies on it.
selectedData.to_csv(path_or_buf='../data/yellow_tripdata_141006_141012.csv', index=True)