# Hubway project - python
To better familiarize myself with Lasso in python, I reimplemented the core features.
### Import libraries

In [117]:
import pandas as pd
from sklearn.linear_model import LassoCV
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [121]:
import warnings
warnings.filterwarnings('ignore')

### Load trip data, and parse the date

In [6]:
trips_df = pd.read_csv('../data/hubway_trips.csv', parse_dates=['start_date'])
trips_df.set_index('seq_id', inplace=True)
trips_df.head(2)

Unnamed: 0_level_0,hubway_id,status,duration,start_date,strt_statn,end_date,end_statn,bike_nr,subsc_type,zip_code,birth_date,gender
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,8,Closed,9,2011-07-28 10:12:00,23.0,7/28/2011 10:12:00,23.0,B00468,Registered,'97217,1976.0,Male
2,9,Closed,220,2011-07-28 10:21:00,23.0,7/28/2011 10:25:00,23.0,B00554,Registered,'02215,1966.0,Male


In [7]:
parsed_dates = pd.to_datetime(trips_df['start_date'], format = '%m/%d/%Y %H:%M:%S')
parsed_dates = parsed_dates.apply(lambda x: x.replace(minute=0))

In [8]:
trips_df.start_date = parsed_dates
trips_df.head(2)

Unnamed: 0_level_0,hubway_id,status,duration,start_date,strt_statn,end_date,end_statn,bike_nr,subsc_type,zip_code,birth_date,gender
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,8,Closed,9,2011-07-28 10:00:00,23.0,7/28/2011 10:12:00,23.0,B00468,Registered,'97217,1976.0,Male
2,9,Closed,220,2011-07-28 10:00:00,23.0,7/28/2011 10:25:00,23.0,B00554,Registered,'02215,1966.0,Male


In [9]:
group_trips_df= trips_df.groupby('start_date')['status'].count()
group_trips_df.index.name = 'date_time'
group_trips_df.name = 'trip_counts'
group_trips_df.head(3)

date_time
2011-07-28 10:00:00     8
2011-07-28 11:00:00     4
2011-07-28 12:00:00    66
Name: trip_counts, dtype: int64

In [123]:
group_trips_df= trips_df.groupby(['strt_statn', 'end_statn'])['status'].count()
group_trips_df.index.name = 'start_stn'
group_trips_df.name = 'trip_counts'
group_trips_df.head(3)

strt_statn  end_statn
3.0         3.0          472
            4.0          165
            5.0          289
Name: trip_counts, dtype: int64

### Load the weather data, and merge the dataframes

In [37]:
weather_df = pd.read_csv('../data/basel_weather.csv', sep = ';', parse_dates=['date_time'])
weather_df.set_index('date_time', inplace=True)
weather_df = weather_df.drop(weather_df.columns[[0, 1, 2, 3, 4]], axis=1)

In [38]:
weather_df.describe()

Unnamed: 0,temp,humidity,pressure,precipitation,snowfall,wind,wind_dir
count,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0,20568.0
mean,12.078217,72.3891,1016.812048,0.091336,0.003961,10.816914,204.00948
std,7.833957,14.560318,7.982376,0.334826,0.045475,8.086336,95.950205
min,-12.52,22.0,983.6,0.0,0.0,0.0,0.67
25%,6.15,63.0,1012.3,0.0,0.0,5.04,116.57
50%,12.1,74.0,1017.0,0.0,0.0,8.53,229.4
75%,17.71,84.0,1021.8,0.0,0.0,14.66,279.46
max,36.22,100.0,1040.5,8.1,1.54,74.34,360.0


In [97]:
merged_df = weather_df.join(pd.DataFrame(group_trips_df),how='inner')
merged_df.drop(merged_df.columns[0], axis = 1, inplace=True)
merged_df['precipitation_bin'] = np.where(merged_df['precipitation']>0, 1, 0)
merged_df.head(3)

Unnamed: 0_level_0,humidity,pressure,precipitation,snowfall,wind,wind_dir,trip_counts,precipitation_bin
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-07-28 10:00:00,78.0,1018.7,0.0,0.0,3.76,286.7,8,0
2011-07-28 11:00:00,77.0,1018.5,0.0,0.0,4.35,294.44,4,0
2011-07-28 12:00:00,73.0,1018.7,0.1,0.0,7.2,323.13,66,1


In [98]:
merged_df_dummies = pd.get_dummies(merged_df)
merged_df_dummies.head()

Unnamed: 0_level_0,humidity,pressure,precipitation,snowfall,wind,wind_dir,trip_counts,precipitation_bin
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-07-28 10:00:00,78.0,1018.7,0.0,0.0,3.76,286.7,8,0
2011-07-28 11:00:00,77.0,1018.5,0.0,0.0,4.35,294.44,4,0
2011-07-28 12:00:00,73.0,1018.7,0.1,0.0,7.2,323.13,66,1
2011-07-28 13:00:00,71.0,1018.7,0.2,0.0,8.4,329.04,36,1
2011-07-28 14:00:00,70.0,1018.9,0.3,0.0,9.59,325.71,8,1


### Linear Regression

In [99]:
labels = merged_df['precipitation_bin'].values

13900

In [102]:
features = merged_df[['humidity', 'pressure', 'snowfall', 'wind', 'wind_dir', 'trip_counts']]

83400

In [124]:
data = merged_df.values

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer


train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, random_state=0)

imp = Imputer()
imp.fit(train_data)
train_data_finite = imp.transform(train_data)
test_data_finite = imp.transform(test_data)

In [126]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(train_data_finite, train_labels)
print("logistic regression score: %f" % lr.score(test_data_finite, test_labels))

logistic regression score: 1.000000


In [127]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500, random_state=0).fit(train_data_finite, train_labels)
print("random forest score: %f" % rf.score(test_data_finite, test_labels))

random forest score: 1.000000
