# Machine Learning Algorithms

In [1]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide display options.

# Use the fully qualified option key: the bare "max_rows" pattern is ambiguous in
# newer pandas releases (it matches more than one option) and raises OptionError.
pd.set_option("display.max_rows", 20)
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress = True)
# Plot styling: apply seaborn's "darkgrid" style to the figures drawn in this notebook.

from seaborn import set_style
set_style("darkgrid")
import seaborn as sns
# `warnings` is used in the next cell to adjust the warning filters.
import warnings

In [2]:
# This package will output the execution time of each cell. Pretty neat!
warnings.filterwarnings('ignore')
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
warnings.filterwarnings('default')
%load_ext autotime

Installed autotime.py. To use it, type:
  %load_ext autotime


In [7]:
# Read the semi-cleaned training set into memory (about 40 s in the recorded run).
TRAINING_CSV = 'semi_cleaned_training_data.csv'
data = pd.read_csv(TRAINING_CSV)

time: 39.4 s


In [8]:
from sklearn.decomposition import PCA 

time: 644 ms


2014-06 - Citi Bike trip data.csv  201508-citibike-tripdata.csv
2014-07 - Citi Bike trip data.csv  data.zip
2014-08 - Citi Bike trip data.csv  partiall_cleaned_training_data.csv
201506-citibike-tripdata.csv       semi_cleaned_training_data.csv
201507-citibike-tripdata.csv       test.py
time: 129 ms


In [21]:
# Per-weekday summary (count / unique / top / freq) of the two station-name columns.
# Select the columns with a list: indexing a GroupBy with a bare tuple of labels
# was deprecated in pandas 1.x and raises in pandas 2.x.
station_columns = ['start station name', 'end station name']
data.groupby('day of week')[station_columns].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,start station name,end station name
day of week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,count,557480,557480
0,unique,429,429
0,top,W 21 St & 6 Ave,W 21 St & 6 Ave
0,freq,5985,6205
1,count,805557,805557
1,unique,431,432
1,top,8 Ave & W 31 St,8 Ave & W 31 St
1,freq,13399,9259
2,count,795996,795996
2,unique,411,412


time: 4.75 s


### Now let's see what happens when we scale the data before we apply PCA

In [28]:
# Keep only the numeric features for PCA. The recorded `data.head()` outputs that
# follow show that the timestamp, station-id and station-name columns were all
# gone by the time PCA was fitted, but only the "end station name" drop survived
# in the notebook; a fresh "Restart & Run All" would hand string columns to PCA.fit.
# `errors="ignore"` keeps the cell safe to re-run once the columns are gone.
NON_FEATURE_COLUMNS = [
    "starttime",
    "stoptime",
    "start station id",
    "start station name",
    "end station id",
    "end station name",
]
data = data.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")

time: 3.55 ms


In [18]:
# Preview the first rows of the frame.
# NOTE(review): the recorded output still lists the timestamp and station columns,
# so this cell was presumably last executed before the column drop above — confirm.
data.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,gender,Age,day of week
0,520,2014-06-01 00:00:02,2014-06-01 00:08:42,358,Christopher St & Greenwich St,40.732916,-74.007114,426,West St & Chambers St,40.717548,-74.013221,1,36.0,0
1,414,2014-06-01 00:00:32,2014-06-01 00:07:26,439,E 4 St & 2 Ave,40.726281,-73.98978,368,Carmine St & 6 Ave,40.730386,-74.00215,1,35.0,0
2,310,2014-06-01 00:00:34,2014-06-01 00:05:44,463,9 Ave & W 16 St,40.742065,-74.004432,380,W 4 St & 7 Ave S,40.734011,-74.002939,1,31.0,0
3,457,2014-06-01 00:00:35,2014-06-01 00:08:12,352,W 56 St & 6 Ave,40.763406,-73.977225,305,E 58 St & 3 Ave,40.760958,-73.967245,1,46.0,0
4,399,2014-06-01 00:00:43,2014-06-01 00:07:22,293,Lafayette St & E 8 St,40.730287,-73.990765,247,Perry St & Bleecker St,40.735354,-74.004831,1,45.0,0


time: 60.9 ms


In [29]:
# Preview the frame again; the recorded output shows only the eight numeric columns.
data.head()

Unnamed: 0,tripduration,start station latitude,start station longitude,end station latitude,end station longitude,gender,Age,day of week
0,520,40.732916,-74.007114,40.717548,-74.013221,1,36.0,0
1,414,40.726281,-73.98978,40.730386,-74.00215,1,35.0,0
2,310,40.742065,-74.004432,40.734011,-74.002939,1,31.0,0
3,457,40.763406,-73.977225,40.760958,-73.967245,1,46.0,0
4,399,40.730287,-73.990765,40.735354,-74.004831,1,45.0,0


time: 39.6 ms


In [30]:
# Fit a two-component PCA on the unscaled feature frame.
pca = PCA(n_components = 2)
# Left as the last expression so the fitted estimator's repr is displayed.
pca.fit(data)

PCA(copy=True, n_components=2, whiten=False)

time: 16.9 s


In [31]:
# Share of the total variance captured by each of the two unscaled components.
# In the recorded run the first component carries ~99.999% of it.
print(pca.explained_variance_ratio_)

[ 0.99998955  0.00001016]
time: 287 ms


In [32]:
# Loadings of each original column on the two unscaled components.
unscaled_loadings = pd.DataFrame(
    pca.components_,
    index=['PC-1', 'PC-2'],
    columns=data.columns,
)
print(unscaled_loadings)

      tripduration  start station latitude  start station longitude  \
PC-1       1.00000           -4.130538e-08            -4.844547e-09   
PC-2       0.00002           -6.460746e-05             4.255993e-05   

      end station latitude  end station longitude    gender       Age  \
PC-1         -5.602724e-08           1.129247e-08  0.000002  0.000020   
PC-2         -7.205947e-05           4.090499e-05  0.002493 -0.999997   

       day of week  
PC-1  3.575752e-07  
PC-2 -8.024887e-04  
time: 365 ms


In [33]:
from sklearn import preprocessing
# Scale every column with sklearn's preprocessing.scale, then re-wrap the
# resulting array in a DataFrame that carries the original column labels.
standardized_values = preprocessing.scale(data)
data_scaled = pd.DataFrame(standardized_values, columns=data.columns)

time: 2.63 s


In [34]:
# Second two-component PCA, this time fitted on the scaled frame.
pca2 = PCA(n_components=2)
# fit_transform fits pca2 and returns the projected rows; the array is only
# displayed here, not stored.
pca2.fit_transform(data_scaled)

array([[ 1.71770815,  1.43155981],
       [ 0.63277439,  0.22711191],
       [ 0.54082203,  1.19452061],
       ..., 
       [-0.42646629, -0.00952201],
       [ 1.41904019, -1.38994751],
       [-1.27658075, -0.39438163]])

time: 16 s


In [35]:
# Print both models' explained-variance ratios one after the other for comparison.
for label, fitted_model in (("Without scaling:", pca), ("with scaling: ", pca2)):
    print(label, fitted_model.explained_variance_ratio_)

Without scaling: [ 0.99998955  0.00001016]
with scaling:  [ 0.22873072  0.16174658]
time: 15.7 ms


In [36]:
# Loadings of each scaled column on the two components of pca2.
scaled_loadings = pd.DataFrame(
    pca2.components_,
    index=['PC-1', 'PC-2'],
    columns=data_scaled.columns,
)
print(scaled_loadings)

      tripduration  start station latitude  start station longitude  \
PC-1      0.014199               -0.582465                -0.393439   
PC-2     -0.029010                0.353142                -0.564578   

      end station latitude  end station longitude    gender       Age  \
PC-1             -0.580876               -0.38879  0.127082 -0.030848   
PC-2              0.356737               -0.56790 -0.165142  0.280414   

      day of week  
PC-1    -0.007958  
PC-2    -0.005080  
time: 185 ms


##### For Plotting

In [37]:
# Project the scaled rows onto pca2's two components and keep the result for plotting.
transformed_scaled_data = pca2.transform(data_scaled)

time: 920 ms


In [38]:
# Principal axes for the plot. Take them from pca2 (fitted on the scaled data):
# the projections in `transformed_scaled_data` come from pca2, so pairing them
# with the axes of the unscaled `pca` model would mix two different bases.
first_pc = pca2.components_[0]
second_pc = pca2.components_[1]

time: 35.9 ms


In [39]:
# Disabled plotting sketch, kept for reference; this cell currently does nothing.
# NOTE(review): iterating `data_scaled` (a DataFrame) yields its column labels, not
# its rows, so `jj` would need e.g. `data_scaled.values` before this is re-enabled.
# Calling plt.scatter once per row may also be very slow on the full data set.
#for ii, jj in zip(transformed_scaled_data, data_scaled):
#    plt.scatter(first_pc[0]*ii[0], first_pc[1]*ii[0], color = "r")
#    plt.scatter(second_pc[0]*ii[1], second_pc[1]*ii[1], color = "c")
#    plt.scatter(jj[0], jj[1], color = "b")

#plt.xlabel("X-axis")
#plt.ylabel("Y-axis")
#plt.show()


time: 3.63 ms
