#### Basic Library Tools

In [1]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
#Notebook Specifics

pd.set_option("max_rows", 20)
np.set_printoptions(suppress = True)
#Pretty Graphs

from seaborn import set_style
set_style("darkgrid")
import seaborn as sns


import time
from datetime import datetime
import calendar

# This package will output the execution time of each cell. Pretty neat!
warnings.filterwarnings('ignore')
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
warnings.filterwarnings('default')
%load_ext autotime

Installed autotime.py. To use it, type:
  %load_ext autotime


### Loading Data into a Test and a Training set
#### Testing Data = Summer 2016 

In [4]:
testing_csv_files = ["201606-citibike-tripdata.csv", "201608-citibike-tripdata.csv", "201607-citibike-tripdata.csv"]
testing = pd.DataFrame()
for filename in testing_csv_files:
    testing = testing.append(pd.read_csv(filename))

#### Training Data = Summer 2014 and Summer 2015

In [7]:
training_csv_files = ["2014-06 - Citi Bike trip data.csv",  "201506-citibike-tripdata.csv", "2014-07 - Citi Bike trip data.csv", "201507-citibike-tripdata.csv", "2014-08 - Citi Bike trip data.csv", "201508-citibike-tripdata.csv"]
training = pd.DataFrame()
for filename in training_csv_files:
    training = training.append(pd.read_csv(filename))

#### Sanity checks

In [12]:
training.columns.difference(testing.columns) #To check that the heading of our test Data and Training Data are the same
                                             # Since object return is empty, this implies that they have the same headings

Index([], dtype='object')

In [11]:
training.columns.equals(testing.columns)      #Object returned is true, so both have the same headings. 

True

## Some Data Mangling

### Step 1 Take only the Subscribers

##### First lets get the ratio of people who are subscribers  vs people who are customers 

In [8]:
usertype_ratio = pd.Categorical(training["usertype"])
usertype_ratio.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Customer,904882,0.148948
Subscriber,5170268,0.851052


##### Thus the number of Subscribers is 85% of the original data we had. 
##### This leaves us with 5.17 million trips

In [9]:
# To clean out the customers detail
training[training.usertype == "Subscriber"]
del training['usertype']

#### Now to Drop NAN entries

In [24]:
training = training.dropna(how='any')

#### This drops another 370k entries.  Reducing our total data size to 

### Step 2. Calculate age of User 

In [10]:
training['birth year'] = pd.to_numeric(training['birth year'], errors='coerce')

In [11]:
training = training.dropna(how='any')

In [18]:
training['Age'] = 2015 - training['birth year']

In [20]:
del training['birth year'] 

In [23]:
del training['bikeid']

In [24]:
training.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,gender,Age
0,520,2014-06-01 00:00:02,2014-06-01 00:08:42,358,Christopher St & Greenwich St,40.732916,-74.007114,426,West St & Chambers St,40.717548,-74.013221,1,36.0
2,414,2014-06-01 00:00:32,2014-06-01 00:07:26,439,E 4 St & 2 Ave,40.726281,-73.98978,368,Carmine St & 6 Ave,40.730386,-74.00215,1,35.0
3,310,2014-06-01 00:00:34,2014-06-01 00:05:44,463,9 Ave & W 16 St,40.742065,-74.004432,380,W 4 St & 7 Ave S,40.734011,-74.002939,1,31.0
4,457,2014-06-01 00:00:35,2014-06-01 00:08:12,352,W 56 St & 6 Ave,40.763406,-73.977225,305,E 58 St & 3 Ave,40.760958,-73.967245,1,46.0
5,399,2014-06-01 00:00:43,2014-06-01 00:07:22,293,Lafayette St & E 8 St,40.730287,-73.990765,247,Perry St & Bleecker St,40.735354,-74.004831,1,45.0


### Step 3. Add day of week 

In [48]:
training.to_csv('partiall_cleaned_training_data.csv', sep=',', encoding='utf-8')

In [5]:
training = pd.read_csv('partiall_cleaned_training_data.csv')

time: 34.1 s


###### Since our data follows multiple date formats, we need to clean it to get data out of it

In [6]:
weekend = []

def try_parsing_date(text):
    for fmt in ('%Y-%m-%d %H:%M:%S', '%m/%d/%Y %H:%M:%S', '%m/%d/%Y %H:%M'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            pass
    raise ValueError('no valid date format found')

time: 8.31 ms


In [7]:
for i in range(len(training)):
    weekend.append(calendar.day_name[try_parsing_date(training["starttime"].values[i]).weekday()])
training["day of week"] = weekend

time: 12min 54s


In [8]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170103 entries, 0 to 5170102
Data columns (total 15 columns):
Unnamed: 0                 int64
tripduration               int64
starttime                  object
stoptime                   object
start station id           int64
start station name         object
start station latitude     float64
start station longitude    float64
end station id             int64
end station name           object
end station latitude       float64
end station longitude      float64
gender                     int64
Age                        float64
day of week                object
dtypes: float64(5), int64(5), object(5)
memory usage: 591.7+ MB
time: 1.39 s


In [9]:
training.head()

  def _singleton_printers_default(self):


Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,gender,Age,day of week
0,0,520,2014-06-01 00:00:02,2014-06-01 00:08:42,358,Christopher St & Greenwich St,40.732916,-74.007114,426,West St & Chambers St,40.717548,-74.013221,1,36.0,Sunday
1,2,414,2014-06-01 00:00:32,2014-06-01 00:07:26,439,E 4 St & 2 Ave,40.726281,-73.98978,368,Carmine St & 6 Ave,40.730386,-74.00215,1,35.0,Sunday
2,3,310,2014-06-01 00:00:34,2014-06-01 00:05:44,463,9 Ave & W 16 St,40.742065,-74.004432,380,W 4 St & 7 Ave S,40.734011,-74.002939,1,31.0,Sunday
3,4,457,2014-06-01 00:00:35,2014-06-01 00:08:12,352,W 56 St & 6 Ave,40.763406,-73.977225,305,E 58 St & 3 Ave,40.760958,-73.967245,1,46.0,Sunday
4,5,399,2014-06-01 00:00:43,2014-06-01 00:07:22,293,Lafayette St & E 8 St,40.730287,-73.990765,247,Perry St & Bleecker St,40.735354,-74.004831,1,45.0,Sunday


time: 74.4 ms


###### Now to change our day of the week into an integer

In [10]:
training['day of week'].replace({"Sunday": 0, "Monday":1, "Tuesday": 2, "Wednesday":3, "Thursday": 4, "Friday":5, "Saturday": 6 }, inplace=True)

time: 9.71 s


In [11]:
training.head()

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,gender,Age,day of week
0,0,520,2014-06-01 00:00:02,2014-06-01 00:08:42,358,Christopher St & Greenwich St,40.732916,-74.007114,426,West St & Chambers St,40.717548,-74.013221,1,36.0,0
1,2,414,2014-06-01 00:00:32,2014-06-01 00:07:26,439,E 4 St & 2 Ave,40.726281,-73.98978,368,Carmine St & 6 Ave,40.730386,-74.00215,1,35.0,0
2,3,310,2014-06-01 00:00:34,2014-06-01 00:05:44,463,9 Ave & W 16 St,40.742065,-74.004432,380,W 4 St & 7 Ave S,40.734011,-74.002939,1,31.0,0
3,4,457,2014-06-01 00:00:35,2014-06-01 00:08:12,352,W 56 St & 6 Ave,40.763406,-73.977225,305,E 58 St & 3 Ave,40.760958,-73.967245,1,46.0,0
4,5,399,2014-06-01 00:00:43,2014-06-01 00:07:22,293,Lafayette St & E 8 St,40.730287,-73.990765,247,Perry St & Bleecker St,40.735354,-74.004831,1,45.0,0


time: 61.7 ms


In [14]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170103 entries, 0 to 5170102
Data columns (total 15 columns):
Unnamed: 0                 int64
tripduration               int64
starttime                  object
stoptime                   object
start station id           int64
start station name         object
start station latitude     float64
start station longitude    float64
end station id             int64
end station name           object
end station latitude       float64
end station longitude      float64
gender                     int64
Age                        float64
day of week                int64
dtypes: float64(5), int64(6), object(4)
memory usage: 591.7+ MB
time: 363 ms


###### Now That we were able to add the weekday to the data. The next component is to remove the dates that we have and keep just the start time and stop time. 

In [15]:
training.to_csv('semi_cleaned_training_data.csv', sep=',', encoding='utf-8')

time: 5min 48s
