- Reference: https://www.kaggle.com/javalex/nyc-bike-data-analysis-subscribers-and-customers

In [2]:

import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon

import os 
import sys

import calendar
import glob
import math

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the combined 2015-2017 trip export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='NYC-BikeShare-2015-2017-combined.csv')

In [4]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Trip_Duration_in_min
0,0,376,2015-10-01 00:16:26,2015-10-01 00:22:42,3212,Christ Hospital,40.734786,-74.050444,3207,Oakland Ave,40.737604,-74.052478,24470,Subscriber,1960.0,1,6
1,1,739,2015-10-01 00:27:12,2015-10-01 00:39:32,3207,Oakland Ave,40.737604,-74.052478,3212,Christ Hospital,40.734786,-74.050444,24481,Subscriber,1960.0,1,12
2,2,2714,2015-10-01 00:32:46,2015-10-01 01:18:01,3193,Lincoln Park,40.724605,-74.078406,3193,Lincoln Park,40.724605,-74.078406,24628,Subscriber,1983.0,1,45
3,3,275,2015-10-01 00:34:31,2015-10-01 00:39:06,3199,Newport Pkwy,40.728745,-74.032108,3187,Warren St,40.721124,-74.038051,24613,Subscriber,1975.0,1,5
4,4,561,2015-10-01 00:40:12,2015-10-01 00:49:33,3183,Exchange Place,40.716247,-74.033459,3192,Liberty Light Rail,40.711242,-74.055701,24668,Customer,1984.0,0,9


## Parameters

In [None]:
# Notebook-wide configuration. Constants use UPPER_SNAKE_CASE because the
# validation cells read them under those names (YEAR, AGE_MIN, AGE_MAX,
# DURATION_MIN, DURATION_MAX).

# Reference year that "Birth Year" is subtracted from to obtain an age.
YEAR = 2017

# Age buckets: display labels and the bin edges that delimit them
# (one more edge than labels; the last bucket is open-ended).
AGE_RANGES = ['<20', '20-29', '30-39', '40-49', '50-59', '60+']
AGE_RANGES_LIMITS = [0, 20, 30, 40, 50, 60, np.inf]
# Ages outside [AGE_MIN, AGE_MAX] are flagged as implausible.
AGE_MIN = 0
AGE_MAX = 100

# Trip duration bounds, in seconds.
DURATION_MIN = 2  # a trip must last at least 2 seconds to count as real use
DURATION_MAX = 30 * 24 * 60 * 60  # 30 days

# User groups analysed: everyone, subscribers only, customers only.
USERTYPES = ['all', 'subscriber', 'customer']

# Plotting
FONT_SCALE = 1.5

# Lower-case aliases kept so any cell still using the old names keeps working.
year = YEAR
age_ranges = AGE_RANGES
age_ranges_limits = AGE_RANGES_LIMITS
age_min = AGE_MIN
age_max = AGE_MAX
duration_min = DURATION_MIN
duration_max = DURATION_MAX
usertypes = USERTYPES
font_scale = FONT_SCALE


## Data validation

### Load data

In [None]:
# Reload the raw CSV so the validation steps below start from an unmodified
# frame, then show summary statistics for the numeric columns.
df = pd.read_csv(filepath_or_buffer='NYC-BikeShare-2015-2017-combined.csv')
df.describe()

In [None]:
# Convert the time-related string columns to datetime values.
# Each result is assigned back to the SAME column name so both timestamps
# really become datetimes; a mis-cased target such as 'Start time' would
# silently add a new column and leave 'Start Time' as strings.
for time_col in ('Start Time', 'Stop Time'):
    df[time_col] = pd.to_datetime(df[time_col])

# Make birth years numeric; pandas downcasts them to a compact integer dtype
# when it can do so losslessly, otherwise the dtype is left unchanged.
df['Birth Year'] = pd.to_numeric(df['Birth Year'], downcast='integer')

# Drop the unneeded leftover column. errors='ignore' keeps this cell
# re-runnable after the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()


In [None]:
# Create the bookkeeping columns up front: a flag for rows that should be
# ignored and a text field that accumulates the reasons.
df = df.assign(ignore=False, ignore_reason="")

In [None]:
# Flag rows that repeat an earlier row; keep='first' leaves the first
# occurrence unflagged and marks only the later copies as True.
#
# Compare trip data only. Leftover "Unnamed: ..." columns look like saved row
# indices (0, 1, 2, ... in df.head()) -- TODO confirm; if they are unique per
# row, including them means no two rows can ever compare equal. The
# bookkeeping columns are excluded so re-running the cell gives the same answer.
bookkeeping_cols = {"ignore", "ignore_reason", "duplicate"}
compare_cols = [
    col for col in df.columns
    if not str(col).startswith("Unnamed") and col not in bookkeeping_cols
]
duplicates = df.duplicated(subset=compare_cols, keep='first')
# `duplicates` is a boolean Series aligned with df (True = duplicate row).

# Plain assignment appends the column on the first run and overwrites it on a
# re-run, so the frame never ends up with two "duplicate" columns.
df["duplicate"] = duplicates

# Summing a boolean Series counts its True values.
print("Found {} duplicate rows".format(int(duplicates.sum())))


In [None]:
# Append a reason to "ignore_reason" for every row whose bike ID, start
# station ID or end station ID is missing.
missing_id_reasons = {
    "Bike ID": "Bike ID empty; ",
    "Start Station ID": "Start Station empty; ",
    "End Station ID": "End Station empty; ",
}
for id_column, reason_text in missing_id_reasons.items():
    df.loc[df[id_column].isna(), "ignore_reason"] += reason_text

# Any user type other than the two known categories is flagged as invalid.
known_user_types = ["Subscriber", "Customer"]
has_known_type = df["User Type"].isin(known_user_types)
df.loc[~has_known_type, "ignore_reason"] += "User Type invalid; "


In [None]:
# NOTE(review): this cell reads YEAR, AGE_MIN, AGE_MAX, DURATION_MIN and
# DURATION_MAX; they must be defined under exactly these upper-case names
# before it runs.

# Age is the reference year minus the birth year.
df["age"] = YEAR - df["Birth Year"]
rider_age = df["age"]
print("Max age: {}.".format(rider_age.max()))
# Ages above the maximum or below the minimum get the same reason text.
df.loc[rider_age.gt(AGE_MAX), "ignore_reason"] += "implausible age; "
df.loc[rider_age.lt(AGE_MIN), "ignore_reason"] += "implausible age; "

# Trips shorter than the minimum or longer than the maximum are flagged.
trip_duration = df["Trip Duration"]
print("Min duration: {}.".format(trip_duration.min()))
df.loc[trip_duration.lt(DURATION_MIN), "ignore_reason"] += "Trip Duration implausible; "

print("Max duration: {}.".format(trip_duration.max()))
df.loc[trip_duration.gt(DURATION_MAX), "ignore_reason"] += "Trip Duration implausible; "

# Rows previously marked in the "duplicate" column are flagged as well.
df.loc[df["duplicate"].eq(True), "ignore_reason"] += "duplicate; "

In [None]:
# Split the trips by rider category. DATAFRAMES lists the full frame first,
# then subscribers, then customers.
user_type = df["User Type"]
df_subscribers = df.loc[user_type.eq("Subscriber")]
df_customers = df.loc[user_type.eq("Customer")]
DATAFRAMES = [df, df_subscribers, df_customers]