In [None]:
import pandas as pd
import numpy as np

# Directory prefix that is prepended to every file name listed in links.txt.
PATH = 'data/'

# Read the list of CSV file names, one per line. The context manager closes
# the file handle as soon as the names are read, and blank lines are skipped
# so that an empty name is never joined onto PATH.
with open('links.txt') as links_file:
    data = [name for name in links_file.read().splitlines() if name.strip()]
data



In [None]:
# Load every listed CSV from PATH, then stack them into one frame with a
# fresh 0..n-1 index.
dfs = []
for csv_name in data:
    dfs.append(pd.read_csv(PATH + csv_name))
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Print a summary of the combined frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Normalize the header: remove leading/trailing whitespace from every column label.
df.columns = [label.strip() for label in df.columns]

df.columns

In [None]:
# Join the DATE and TIME text columns and parse the result into one timestamp column.
timestamp_text = df['DATE'] + " " + df['TIME']
df['DATETIME'] = pd.to_datetime(timestamp_text, format='%m/%d/%Y %H:%M:%S')
df.head(5)

In [None]:
# Keep a single row per (C/A, UNIT, SCP, STATION, DATETIME) combination;
# later duplicates of the same combination are discarded in place.
reading_key = ["C/A", "UNIT", "SCP", "STATION", "DATETIME"]
df.drop_duplicates(subset=reading_key, inplace=True)

In [None]:
# For every row, attach the DATETIME / ENTRIES / EXITS of the preceding row
# that shares the same C/A + UNIT + SCP + STATION combination.
turnstile_keys = ["C/A", "UNIT", "SCP", "STATION"]
reading_cols = ["DATETIME", "ENTRIES", "EXITS"]

# The shift is computed on a chronologically sorted copy, so "previous" means
# the preceding reading in time even if the input files were not concatenated
# in date order. df itself keeps its current row order.
# The column subset is passed as a list: selecting GroupBy columns with a bare
# tuple was deprecated in pandas 1.0 and is rejected by pandas 2.
previous_readings = (
    df.sort_values(turnstile_keys + ["DATETIME"])
      .groupby(turnstile_keys)[reading_cols]
      .shift(1)
)

# Column assignment aligns on the index, so each value lands on its own row.
# NOTE(review): relies on df having a unique index - confirm if the loading
# step is changed.
for col in reading_cols:
    df["PREV_" + col] = previous_readings[col]
df.head(5)

In [None]:
# Discard, in place, the rows that have no PREV_DATETIME value.
df.dropna(axis="index", subset=["PREV_DATETIME"], inplace=True)

In [None]:
# Time elapsed between each row's DATETIME and its PREV_DATETIME.
df['TIME_INTERVAL'] = df['DATETIME'] - df['PREV_DATETIME']

# Drop rows whose interval is longer than 5 hours or shorter than 3 hours.
# The two bounds must be combined with `|`: an interval can never be both
# > 5h and < 3h, so the previous `&` mask was always empty and removed nothing.
longest_interval = pd.Timedelta(hours=5)
shortest_interval = pd.Timedelta(hours=3)
irregular = (df['TIME_INTERVAL'] > longest_interval) | (df['TIME_INTERVAL'] < shortest_interval)
df = df.drop(df[irregular].index)
df.head(5)

In [None]:
# Entries per interval: difference between the current and previous ENTRIES
# values. Negative differences are replaced by their absolute value.
# Series.abs() replaces the former `df.ix[...]` assignment; the `.ix` indexer
# was removed in pandas 1.0.
df['ENTRY_COUNT'] = (df['ENTRIES'] - df['PREV_ENTRIES']).abs()

# Remove rows whose count exceeds the cut-off used to filter outrageous values.
max_entry_count = 1000000
df = df.drop(df[df['ENTRY_COUNT'] > max_entry_count].index)

In [None]:
# Exits per interval: difference between the current and previous EXITS
# values. Negative differences are replaced by their absolute value.
# Series.abs() replaces the former `df.ix[...]` assignment; the `.ix` indexer
# was removed in pandas 1.0.
df['EXIT_COUNT'] = (df['EXITS'] - df['PREV_EXITS']).abs()

# Remove rows whose count exceeds the cut-off used to filter outrageous values.
max_exit_count = 1000000
df = df.drop(df[df['EXIT_COUNT'] > max_exit_count].index)
df.head(5)

In [None]:
# Parse the DATE and TIME text columns, then derive the day of week
# (Monday=0 ... Sunday=6) and the hour of day from the parsed values.
parsed_date = pd.to_datetime(df['DATE'], format='%m/%d/%Y')
df['DATE'] = parsed_date
df['WEEKDAY'] = parsed_date.dt.dayofweek

parsed_time = pd.to_datetime(df['TIME'], format='%H:%M:%S')
df['TIME'] = parsed_time
df['HOUR'] = parsed_time.dt.hour
df.head(5)

In [None]:
# Restrict the data to the half-open window [period_start, period_end).
period_start = '05/01/2016'
period_end = '07/01/2016'
in_period = (df['DATE'] >= period_start) & (df['DATE'] < period_end)
df = df[in_period]
df.head(5)

In [None]:

# Split by day of week: WEEKDAY values below 5 go to df_weekday,
# values above 4 go to df_weekend.
on_weekday = df['WEEKDAY'].lt(5)
on_weekend = df['WEEKDAY'].gt(4)
df_weekday = df[on_weekday]
df_weekend = df[on_weekend]

df_weekend.head(5)

In [None]:

# Show the first five rows of the weekend subset.
df_weekend.head(5)

In [None]:
# Narrow the weekend frame to the columns used by the analysis below.
weekend_columns = [
    'STATION',
    'LINENAME',
    'WEEKDAY',
    'HOUR',
    'ENTRY_COUNT',
    'EXIT_COUNT',
    'DATE',
]
df_weekend = df_weekend[weekend_columns]
df_weekend.head(5)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image

%matplotlib inline
import seaborn as sns

# Mean ENTRY_COUNT of the weekend rows for each HOUR value.
df_weekend_top_times = df_weekend.groupby('HOUR', as_index=False)['ENTRY_COUNT'].mean()

# Draw the bars on an explicitly created axes so the label is set on the same axes.
ax = plt.axes()
sns.barplot(data=df_weekend_top_times, x='HOUR', y='ENTRY_COUNT', ax=ax)
ax.set_ylabel('4 hour entry counts')

In [None]:
# Step 1: total weekend entries per station/line for each date.
station_keys = ['STATION', 'LINENAME']
entries_per_date = (
    df_weekend
    .groupby(station_keys + ['DATE'], as_index=False)['ENTRY_COUNT']
    .sum()
)

# Step 2: average those per-date totals for each station/line.
df_weekend_daily_entries = (
    entries_per_date
    .groupby(station_keys, as_index=False)['ENTRY_COUNT']
    .mean()
)
df_weekend_daily_entries.head(10)

In [None]:
# Sort ascending by ENTRY_COUNT and keep the last ten rows (the ten largest).
ranked_stations = df_weekend_daily_entries.sort_values(by='ENTRY_COUNT')
df_weekend_top_stations = ranked_stations.iloc[-10:]
df_weekend_top_stations

In [None]:
# Bar chart of the top weekend stations.
# Use the Axes returned by seaborn rather than calling plt.axes() afterwards:
# on current matplotlib a bare plt.axes() adds a new, empty axes on top of the
# chart, so the title, label and tick rotation would not apply to the bars.
ax = sns.barplot(x='STATION', y='ENTRY_COUNT', data=df_weekend_top_stations)
ax.set_title('Top stations on the weekends')
ax.set_ylabel('Daily entries')
# Rotate the station names so they do not overlap.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

In [None]:
# Total weekend entries for each station / date / weekday / hour combination.
slot_keys = ['STATION', 'DATE', 'WEEKDAY', 'HOUR']
df_weekend_top_times = (
    df_weekend
    .groupby(slot_keys, as_index=False)['ENTRY_COUNT']
    .sum()
)

In [None]:
# Average the per-date totals for each station / weekday / hour, then keep the
# twenty combinations with the highest average.
df_weekend_top_times = df_weekend_top_times.groupby(['STATION', 'WEEKDAY', 'HOUR'], as_index=False)['ENTRY_COUNT'].mean()
df_weekend_top_times = df_weekend_top_times.sort_values(by=['ENTRY_COUNT']).tail(20)

# Replace the numeric weekday codes with day names for the axis labels.
weekend_dict = {5: "Saturday", 6: "Sunday"}
df_weekend_top_times['WEEKDAY'] = df_weekend_top_times['WEEKDAY'].map(weekend_dict)

# One readable label per bar: "<station> <day> at <hour>".
df_weekend_top_times['DESCRIPTION'] = (
    df_weekend_top_times.STATION + " "
    + df_weekend_top_times.WEEKDAY + " at "
    + df_weekend_top_times.HOUR.map(str)
)

# Use the Axes returned by seaborn rather than calling plt.axes() afterwards:
# on current matplotlib a bare plt.axes() adds a new, empty axes on top of the
# chart, so the title, label and tick rotation would not apply to the bars.
ax = sns.barplot(x='DESCRIPTION', y='ENTRY_COUNT', data=df_weekend_top_times)
ax.set_title('Top stations on the weekends and its time')
# NOTE(review): the plotted values are averages per station/weekday/hour, not
# per day - confirm whether this label should be reworded.
ax.set_ylabel('Daily entries')
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

In [None]:
# Rows for the '34 ST-HERALD SQ' station, restricted to date, time and entry count.
is_herald_sq = df['STATION'] == '34 ST-HERALD SQ'
df_herald_sq = df.loc[is_herald_sq, ['DATE', 'TIME', 'ENTRY_COUNT']]
df_herald_sq.head(10)

In [None]:
# Pivot to a DATE x TIME grid of ENTRY_COUNT (pivot_table's default aggregation
# is the mean) and draw the first twenty dates as a heatmap.
table = df_herald_sq.pivot_table(values='ENTRY_COUNT', index='DATE', columns='TIME')
first_dates = table.head(20)
sns.heatmap(first_dates)