In [2]:
import pandas as pd
import numpy as np

import datetime 
from datetime import timedelta


import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
filename = 'datasets/JC-201907-citibike-tripdata.csv'


year_datafiles = {"May'20": 'datasets/JC-202005-citibike-tripdata.csv',
               "Apr'20": 'datasets/JC-202004-citibike-tripdata.csv',
               "Mar'20": 'datasets/JC-202003-citibike-tripdata.csv',
               "Feb'20": 'datasets/JC-202002-citibike-tripdata.csv',
               "Jan'20": 'datasets/JC-202001-citibike-tripdata.csv',
               "Dec'19": 'datasets/JC-201912-citibike-tripdata.csv',
               "Nov'19": 'datasets/JC-201911-citibike-tripdata.csv',
               "Oct'19": 'datasets/JC-201910-citibike-tripdata.csv',
               "Sep'19": 'datasets/JC-201909-citibike-tripdata.csv',
               "Aug'19": 'datasets/JC-201908-citibike-tripdata.csv',
               "Jul'19": 'datasets/JC-201907-citibike-tripdata.csv',
               "Jun'19": 'datasets/JC-201906-citibike-tripdata.csv'}

In [4]:
def prepare_data(filename):
    df = pd.read_csv(filename)

    df = df.drop(df.index[(df['tripduration'] > 7200)])
    df = df.drop(df.index[(df['start station id'] == df['end station id'])])
    df = df[(df['birth year'] < 2000) & (df['birth year'] > 1970)]

    df["tripduration"]= df["tripduration"].astype(int) 
    df["start station id"]= df["start station id"].astype(int) 
    df["end station id"]= df["end station id"].astype(int) 
    df["birth year"]= df["birth year"].astype(int) 
    df["gender"]= df["gender"].astype(int) 
    df["starttime"]= pd.to_datetime(df["starttime"], format="%Y-%m-%d %H:%M:%S.%f")
    df["stoptime"] = pd.to_datetime(df["stoptime"], format="%Y-%m-%d %H:%M:%S.%f")


    df.drop(['tripduration', 'usertype', 'start station latitude', 'start station longitude', 'end station longitude', 'end station latitude', 'birth year', 'gender', 'bikeid'], axis=1, inplace=True)
    return df 

In [5]:
def detect_groups(df):
    # start time id

    sorted_by_time = df.sort_values(by=['starttime'])

    sorted_by_time.insert(1, "diff_start", sorted_by_time['starttime'].diff(), True)
    sorted_by_time.insert(1, "id_start", None, True) 


    group_id = 0
    for idx, r in sorted_by_time.iterrows():
        if r.diff_start > datetime.timedelta(minutes=5):
            group_id+=1
        sorted_by_time['id_start'][idx] = group_id


    # end time id

    sorted_by_time = sorted_by_time.sort_values(by=['stoptime'])

    sorted_by_time.insert(1, "diff_end", sorted_by_time['stoptime'].diff(), True) 
    sorted_by_time.insert(1, "id_end", None, True) 


    group_id = 0
    for idx, r in sorted_by_time.iterrows():
        if r.diff_end > datetime.timedelta(minutes=5):
            group_id+=1
        sorted_by_time['id_end'][idx] = group_id


    sorted_by_time.insert(1, 'group_id', sorted_by_time.groupby(['start station name', 'end station name', 'id_start', 'id_end']).ngroup(), True)

    #dropping all groups with less than 2 people
    groups = sorted_by_time[sorted_by_time.duplicated(subset=['group_id'], keep=False)]
    
    return groups

In [122]:
year_report = pd.DataFrame(columns = ['mounth', 'group_trips'])

for n, f in year_datafiles.items():
    df = prepare_data(f)
    groups = detect_groups(df)
    year_report = year_report.append({'mounth':n, 'group_trips': groups.group_id.nunique()}, ignore_index=True)
        
year_report

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sorted_by_time['id_start'][idx] = group_id
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sorted_by_time['id_end'][idx] = group_id


Unnamed: 0,mounth,group_trips
0,May'20,1613
1,Apr'20,348
2,Mar'20,1260
3,Feb'20,1875
4,Jan'20,2324
5,Dec'19,1451
6,Nov'19,2852
7,Oct'19,4439
8,Sep'19,5501
9,Aug'19,5330


In [7]:
df = prepare_data(filename)
groups = detect_groups(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sorted_by_time['id_start'][idx] = group_id
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sorted_by_time['id_end'][idx] = group_id


In [8]:
#popular routes
popular_routes = pd.DataFrame({'routes': groups['start station name'] + ' -> ' +  groups['end station name']})
popular_routes = popular_routes.groupby(['routes']).size().reset_index(name='counts')

In [None]:
ax = sns.barplot(popular_routes['counts'], popular_routes['routes'], palette="GnBu_d")
ax.set_title('The most popular routes for group trips', fontsize = 12)
ax.set(xlabel='station name', ylabel='number of group trips')
# rcParams['figure.figsize'] = 12,7
# # padding = 0.1 * meanpacesbyhour.max()
# for index, pace in enumerate(popular_routes):
#     ax.text(index,pace - padding, round(pace, 1), 
#             color='white', ha="center", fontsize = 10)
# plt.show()

[Text(0, 0.5, 'number of group trips'), Text(0.5, 0, 'station name')]

In [79]:
# popular end destination
popular_destination = groups.groupby(['end station name']).size().reset_index(name='counts')