In [1]:
import pylab as pl
import pandas as pd
import numpy as np
import os
import scipy.stats as stats
from geopy.geocoders import Nominatim

### Reading in Data

In [2]:
##Taken from Prof. Bianco's getCitiBikeCSV.py file

def getCitiBikeCSV(datestring):
    print ("Downloading", datestring)
    ### First I will check that it is not already there
    if not os.path.isfile(os.getenv("PUIDATA") + "/" + datestring + "-citibike-tripdata.csv"):
        if os.path.isfile(datestring + "-citibike-tripdata.csv"):
            # if in the current dir just move it
            if os.system("mv " + datestring + "-citibike-tripdata.csv " + os.getenv("PUIDATA")):
                print ("Error moving file!, Please check!")
        #otherwise start looking for the zip file
        else:
            if not os.path.isfile(os.getenv("PUIDATA") + "/" + datestring + "-citibike-tripdata.zip"):
                if not os.path.isfile(datestring + "-citibike-tripdata.zip"):
                    os.system("curl -O https://s3.amazonaws.com/tripdata/" + datestring + "-citibike-tripdata.zip")
                ###  To move it I use the os.system() functions to run bash commands with arguments
                os.system("mv " + datestring + "-citibike-tripdata.zip " + os.getenv("PUIDATA"))
            ### unzip the csv 
            os.system("unzip " + os.getenv("PUIDATA") + "/" + datestring + "-citibike-tripdata.zip")
            ## NOTE: old csv citibike data had a different name structure. 
            if '2014' in datestring:
                os.system("mv " + datestring[:4] + '-' +  datestring[4:] + 
                          "\ -\ Citi\ Bike\ trip\ data.csv " + datestring + "-citibike-tripdata.csv")
            os.system("mv " + datestring + "-citibike-tripdata.csv " + os.getenv("PUIDATA"))
    ### One final check:
    if not os.path.isfile(os.getenv("PUIDATA") + "/" + datestring + "-citibike-tripdata.csv"):
        print ("WARNING!!! something is wrong: the file is not there!")

    else:
        print ("file in place, you can continue")

In [3]:
datestring1 = '201501'
getCitiBikeCSV(datestring1)
datestring2 = '201502'
getCitiBikeCSV(datestring2)

Downloading 201501
file in place, you can continue
Downloading 201502
file in place, you can continue


In [4]:
df1 = pd.read_csv(os.getenv("PUIDATA") + "/" + datestring1 + '-citibike-tripdata.csv')
df2 = pd.read_csv(os.getenv("PUIDATA") + "/" + datestring2 + '-citibike-tripdata.csv')
df = df1.append(df2)

In [5]:
df['date'] = pd.to_datetime(df['starttime'])

### Splitting by Category

**H0:** There is no statistical relationship between the trip duration of day time and night time trips, $$ \alpha = 0.05 $$

**H0:** There is no statistical relationship between age of bikers in brooklyn vs. manhattan, $$ \alpha = 0.05 $$

#### Part 1: Splitting by Day time/night time trip durations

day time: 7am to 7pm (7 - 19)
night time: 8pm to 6am(20 - 23, 0-6)

In [6]:
df['hour'] = pd.to_datetime(df['starttime']).dt.hour

In [7]:
df = df.sort_values("hour").reset_index()

In [8]:
df["Daytime?"] = 0

In [9]:
df[(df['hour'] == 7)].head(1)

Unnamed: 0,index,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,date,hour,Daytime?
26250,44654,472,1/6/2015 7:35,1/6/2015 7:43,510,W 51 St & 6 Ave,40.76066,-73.98042,72,W 52 St & 11 Ave,40.767272,-73.993929,17512,Subscriber,1957.0,1,2015-01-06 07:35:00,7,0


In [10]:
df[df['hour'] == 20].head(1)

Unnamed: 0,index,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,date,hour,Daytime?
432356,51328,242,1/6/2015 20:04,1/6/2015 20:08,324,DeKalb Ave & Hudson Ave,40.689888,-73.981013,241,DeKalb Ave & S Portland Ave,40.68981,-73.974931,20809,Subscriber,1984.0,1,2015-01-06 20:04:00,20,0


In [94]:
pd.options.mode.chained_assignment = None  # default='warn'
day_index1 = 26250  #the first index at hour 7
day_index2 = 432356 - 1 #the last index at hour 19
df["Daytime?"].iloc[day_index1:day_index2] = 1 #setting all daytime rows to 1

#### Part 2: Splitting by Manhattan/Brooklyn, accounting for ages

In [112]:
lat_min = 40.700402
lat_max = 40.810445
long_min = -74.027210
long_max = -73.971527

In [113]:
def borough_sort(lat_long): #bounding "Manhattan" into a rough box, assuming everything outside "Manhattan" is "BKLYN"
    lat, long = lat_long
    if (lat >= lat_min) & (lat <= lat_max) & (long >= long_min) & (long <= long_max):
        return 1
    else:
        return 0

In [114]:
df["Manhattan?"] = df[['start station latitude' , 'start station longitude']].apply(borough_sort, axis = 1)

In [115]:
df["Age"] = 2015 - df['birth year']

### Data Wrangling

**(a) with Manhattan/Brooklyn Ages (same size and sorted)**

In [116]:
#Creating an array to do pearson and spearman, dropping nan values

manhattan = df.Age[df["Manhattan?"] == 1]
manhattan = manhattan.dropna()
bklyn = df.Age[df["Manhattan?"] == 0]
bklyn = bklyn.dropna()

In [117]:
#Turning into an np array

manhattan = np.asarray(manhattan)
bklyn = np.asarray(bklyn)

In [118]:
print(len(manhattan), len(bklyn))

434270 40317


In [119]:
#Taking a sampling from manhattan that is the same size as the bklyn dataset
manhattan_sample = np.random.choice(manhattan, size = 40317, replace = False)

In [120]:
#Sorting the arrays
manhattan_sample = sorted(manhattan_sample)
bklyn = sorted(bklyn)

**(b) with Day + Night Trip Durations (same size and sorted)**

In [121]:
#Creating an array to do pearson and spearman, dropping nan values

day = df.tripduration[df["Daytime?"] == 1]
night = df.tripduration[df["Daytime?"] == 0]

In [122]:
#Turning into an np array

day = np.asarray(day)
night = np.asarray(night)

In [123]:
print(len(day), len(night))

406105 76377


In [124]:
#Taking a sampling from manhattan that is the same size as the bklyn dataset

day_sample = np.random.choice(day, size = 76377, replace = False)

In [125]:
#Sorting the arrays

day_sample = sorted(day_sample)
night = sorted(night)

### KS Test

In [126]:
#For Manhattan/Brooklyn Ages, Day + Night Trip Durations

ks2 = stats.ks_2samp(df.Age[df["Manhattan?"] == 0].dropna(), df.Age[df["Manhattan?"] == 1].dropna())
ks1 = stats.ks_2samp(df.tripduration[df["Daytime?"] == 0], df.tripduration[df["Daytime?"] == 1])
print(ks1, ks2)

Ks_2sampResult(statistic=0.054503158602050683, pvalue=1.8609701709516374e-166) Ks_2sampResult(statistic=0.0564987187433732, pvalue=7.6700793684706055e-103)


Because both of our KS statistics are less than 1.36 (c(alpha = .05)) (which is less than 1.36 * squareroot(sum diff/product diff)), we cannot reject our null hypotheses.

### Pearson's Test

In [127]:
#Test
pt1 = stats.pearsonr(day_sample, night)
pt2 = stats.pearsonr(manhattan_sample, bklyn)
print(pt1, pt2)

(0.93761232642674908, 0.0) (0.99506626400172016, 0.0)


The high pearson's coefficients and low p value suggest a linear relationship between day-time/night-time durations and manhattan/brooklyn ages. This suggests that they may be of the same distribution, meaning that their is likely some statistical relationship between our two pairs. While we cannot reject the null hypothesis outright it would seem to point that way for both our tests.

### Spearman's Test

In [128]:
#Test
st1 = stats.spearmanr(day_sample,night) 
st2 = stats.spearmanr(manhattan_sample, bklyn)
print(st1,st2)

SpearmanrResult(correlation=0.99999860750938596, pvalue=0.0) SpearmanrResult(correlation=0.99922398508105226, pvalue=0.0)


The high pearson's coefficients and low p value suggest a linear relationship between day-time/night-time durations and manhattan/brooklyn ages. This suggests that they may be of the same distribution, meaning that their is likely some statistical relationship between our two pairs. While we cannot reject the null hypothesis outright it would seem to point that way for both our tests.