In [373]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

# displaying options for data frames
pd.set_option("display.max_columns", 5400)
pd.set_option("display.max_colwidth", 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## Preparation of the dataset

In [374]:
# reading clean data sets into dataframes
bike_data = pd.read_csv("../data_clean/london_merged_clean.csv", encoding = "ISO-8859-1")
terrorism_data = pd.read_csv("../data_clean/global_terrorism_clean_uk_only.csv" , encoding = 'utf-8') #"ISO-8859-1")

In [375]:
# dropping duplicate index column from terrorism_data
terrorism_data = terrorism_data.drop(["Unnamed: 0"], axis=1)

In [376]:
# merging terrorism data set with London bike sharing data set
df = bike_data.merge(terrorism_data, how = 'left', on = 'date')

In [377]:
# renaming event_id column to have it represent if there has been an incident on the day or not
df.rename(columns = {"event_id" : "attack(y/n)"}, inplace = True)

# converting event_id column to boolean
df["attack(y/n)"]  = df["attack(y/n)"].replace(np.nan, 'NAN')
df["attack(y/n)"] = np.where(df["attack(y/n)"] != "NAN", True, False)

In [378]:
# converting date column to datetime object
df['date'] = pd.to_datetime(df['date'])

In [379]:
# setting the date column as index column
df = df.set_index("date")

In [380]:
# creating a first working dataframe containing only the date as an index, the number of bike rents per day
# as well as if there has been an incident on these days or not
df_base = df[[ "bike_rents", 'attack(y/n)']].copy()

In [381]:
# dropping duplicate date index rows
df_base = df_base.loc[~df_base.index.duplicated(keep='first')]

## Basic analysis

### Calculating the mean difference (before/after)

In [382]:
# defining function to get all days where terrorism incident occured from giving dataframe
def get_dates_of_attack_lst(df):
    dates_of_attack = list(df.loc[df["attack(y/n)"] == True].index)
    return list(dict.fromkeys(dates_of_attack))

In [383]:
# defining function to get all dates before and after the terrorism attack
# takes a single (incident) date and the number of days that should be retuned before and after
# returns a list of dates
def get_date_index(date, days = 8):
    date_indexes = []
    week_before = list(pd.date_range(end = date, periods = days))
    week_after = list(pd.date_range(start = date, periods = days))
    for entry in week_before:
        date_indexes.append(entry)
    for entry in week_after:
        date_indexes.append(entry)
    return pd.to_datetime(list(dict.fromkeys(date_indexes)))

In [384]:
# defining function to get all bike rent count values for a time series
# takes a dataframe, a list of dates and the number of days that should be returned before and after the attack
# returns a Series containing dates and corresponding bike rent count values
def get_rents_counts(df, date_index, days):
    time_ser = get_date_index(date_index , days)
    return df['bike_rents'][time_ser]

In [385]:
# defining function to calculate the difference between the sum of bike rent count values before an incident 
# and after an incident
# takes a Series containing bike rent count values
# returns the difference as an integer
def get_diff(ser):
    chunk_size = len(ser)//2
    before_sum = sum(ser[0:chunk_size])
    after_sum =  sum(ser[(chunk_size+1):len(ser)])
    return after_sum - before_sum 

In [386]:
# defining a function which processes a dataframe to get the mean difference between before and after incident 
# bike rents
# takes a dataframe and the number of days to look at before and after an incident
# returns the mean difference of before and after incident bike rent counts as an integer
def get_diff_df(df, days = 8):
    lst_diff = [] 
    # get list of terrorism attack dates from df
    lst_of_attacks = get_dates_of_attack_lst(df)
    # for each attack date get list of surrounding dates depending on days passed to the function
    # and from that list get a a series of bike rent counts for that time period
    for elem in lst_of_attacks:
        time_series = get_rents_counts(df, elem, days)
        # dealing with missing values at the beginning and the end of the date series by skipping them
        if time_series.isnull().sum() != 0:
            continue
        lst_diff.append(get_diff(time_series))
    return sum(lst_diff)/len(lst_diff)

In [387]:
# testing mean difference values for different time spans
print("Mean difference for 14 days: ", get_diff_df(df_base, 14))
print("Mean difference for 12 days: ", get_diff_df(df_base, 12))
print("Mean difference for 8 days: ", get_diff_df(df_base, 8))
print("Mean difference for 4 days: ", get_diff_df(df_base, 4))
print("Mean difference for 3 days: ", get_diff_df(df_base, 3))
print("Mean difference for 2 days: ", get_diff_df(df_base, 2))

Mean difference for 14 days:  415.07738095238096
Mean difference for 12 days:  -226.20118343195267
Mean difference for 8 days:  -1360.7882352941176
Mean difference for 4 days:  -1310.3139534883721
Mean difference for 3 days:  181.27167630057804
Mean difference for 2 days:  504.7586206896552


### Attempt for a pearson correlation

In [388]:
# defining function to get bike rent count values for before and after an incident as a tuple
# takes a Series of bike rent count values
# returns a tuple holding before and after count values as integers
def get_before_after_tup(ser):
    lst_b_a = []
    chunk_size = len(ser)//2
    lst_b_a.append(sum(ser[0:chunk_size]))
    lst_b_a.append(sum(ser[(chunk_size+1):len(ser)]))
    return tuple(lst_b_a) 

In [389]:
# defining a function which processes a dataframe to get the bike rent count values for before and after 
# an incident occured
# takes a dataframe and the number of days to look at before and after an incident
# returns a dataframe containing columns for before count, after count and date (index)
def get_before_after_df(df, days = 8):
    lst_before_after = [] 
    # get list of terrorism attack dates from df
    lst_of_attacks = get_dates_of_attack_lst(df)
    # for each attack date get list of surrounding dates depending on days passed to the function
    # and from that list get a a series of bike rent counts for that time period
    for elem in lst_of_attacks:
        time_series = get_rents_counts(df, elem, days)
        # dealing with missing values at the beginning and the end of the date series by skipping them
        if time_series.isnull().sum() != 0:
            continue
        lst_before_after.append(get_before_after_tup(time_series))
    return pd.DataFrame(lst_before_after, columns = ["before_sum", "after_sum"])

In [390]:
# defining a dataframe holding values for bike rent count before and after an incident for the given amount of
# days
df_corr = get_before_after_df(df_base, 3)

In [391]:
# trying to visualize a pearson correlation matrix to see if we can draw conclusions from this
df_corr.corr(method="pearson")

Unnamed: 0,before_sum,after_sum
before_sum,1.0,0.574
after_sum,0.574,1.0


## Excluding incidents happened in Northern Ireland

In [392]:
# defining new dataframe containing date (index), bike rent counts, whether there has been an incident and
# state/area where the incident took place
df_wo_ni = df[[ "bike_rents", 'attack(y/n)', 'state']].copy()

In [393]:
# dropping duplicate index rows
df_wo_ni = df_wo_ni.loc[~df_wo_ni.index.duplicated(keep='first')]

In [394]:
# defining a function which processes a dataframe to get the mean difference between before and after incident 
# bike rents, filtering by incident, excluding incidents happend in Northern Ireland
# takes a dataframe and the number of days to look at before and after an incident
# returns the mean difference of before and after incident bike rent counts as an integer
def get_diff_df_wo_ni(df, days = 8):
    lst_diff = [] 
    # get list of terrorism attack dates from df
    lst_of_attacks = list(df.loc[(df["attack(y/n)"] == True) & (df["state"] != "Northern Ireland")].index)
    lst_of_attacks = list(dict.fromkeys(lst_of_attacks))
    # for each attack date get list of surrounding dates depending on days passed to the function
    # and from that list get a a series of bike rent counts for that time period
    for elem in lst_of_attacks:
        time_series = get_rents_counts(df, elem, days)
        # dealing with missing values at the beginning and the end of the date series by skipping them
        if time_series.isnull().sum() != 0:
            continue
        lst_diff.append(get_diff(time_series))
    return sum(lst_diff)/len(lst_diff)

In [395]:
# testing mean difference values for different time spans
print("Mean difference for 14 days: ", get_diff_df_wo_ni(df_wo_ni, 14))
print("Mean difference for 12 days: ", get_diff_df_wo_ni(df_wo_ni, 12))
print("Mean difference for 8 days: ", get_diff_df_wo_ni(df_wo_ni, 8))
print("Mean difference for 4 days: ", get_diff_df_wo_ni(df_wo_ni, 4))
print("Mean difference for 3 days: ", get_diff_df_wo_ni(df_wo_ni, 3))
print("Mean difference for 2 days: ", get_diff_df_wo_ni(df_wo_ni, 2))

Mean difference for 14 days:  3956.923076923077
Mean difference for 12 days:  282.11538461538464
Mean difference for 8 days:  919.5
Mean difference for 4 days:  -1929.423076923077
Mean difference for 3 days:  -569.1538461538462
Mean difference for 2 days:  197.03846153846155


## Attempt to calculate a factor representing the weather conditions

In [396]:
# the mean bike rent count value
mean_rent = int(df["bike_rents"].mean())

In [397]:
# the mean bike rent count value when weather condition is rain
df_rain = df[df["weather_condition"] == "rain"]
mean_rain = int(df_rain["bike_rents"].mean())

In [398]:
# difference between the mean bike rent count value and the mean bike rent count value when weather condition
# is rain
diff = mean_rain - mean_rent

In [399]:
# finding the mean bike rent count for every weather condition and calculating the difference to the general
# mean bike rent count value
weather_values = list(dict(df["weather_condition"].value_counts()).keys())
diffrence_from_mean = {} 

for weather_condition in weather_values:
    df_WC = df[df["weather_condition"] == weather_condition].copy()
    diff_WC_from_mean = int(df_WC["bike_rents"].mean()) - mean_rent
    diffrence_from_mean[weather_condition] = diff_WC_from_mean
    
print(diffrence_from_mean)

{'clear': 3224, 'few clouds': 1153, 'partly cloudy': -2069, 'rain': -8690, 'cloudy': -4514, 'snow': -15424}


In [400]:
# creating new dataframe holding date (index), bike rent count, attack(True/False) and weather condition
df_WC = df[[ "bike_rents", 'attack(y/n)', "weather_condition"]].copy()

# dropping duplicate index rows
df_WC = df_WC.loc[~df_WC.index.duplicated(keep='first')]

In [401]:
# applying the calculated weather factor to the dataframe
df_WC["weather_condition"] = df_WC["weather_condition"].map(diffrence_from_mean)
df_WC = df_WC.assign(WC_factored_rents = lambda x : x["bike_rents"] + x["weather_condition"])

In [402]:
# defining function to get all bike rent count values for a time series
# takes a dataframe, a list of dates and the number of days that should be returned before and after the attack
# returns a Series containing dates and corresponding weather factorized bike rent count
def get_rents_counts_WC_fatored(df, date_index, days):
    time_ser = get_date_index(date_index , days)
    return df['WC_factored_rents'][time_ser]

In [410]:
# defining a function which processes a dataframe to get the bike rent count values for before and after 
# an incident occured factorized with the weather conditions
# takes a dataframe and the number of days to look at before and after an incident
# returns a dataframe containing columns for before count, after count and date (index)
def get_before_after_df_WC_factored(df, days = 8):
    lst_before_after = [] 
    # get list of terrorism attack dates from df
    lst_of_attacks = get_dates_of_attack_lst(df)
    # for each attack date get list of surrounding dates depending on days passed to the function
    # and from that list get a a series of bike rent counts for that time period
    for elem in lst_of_attacks:
        time_series = get_rents_counts_WC_fatored(df, elem, days)
        # dealing with missing values at the beginning and the end of the date series by skipping them
        if time_series.isnull().sum() != 0:
            continue
        lst_before_after.append(get_before_after_tup(time_series))
    return pd.DataFrame(lst_before_after, columns = ["before_sum", "after_sum"])

In [405]:
# defining a function which processes a dataframe to get the mean difference between before and after incident 
# weather factorized bike rents
# takes a dataframe and the number of days to look at before and after an incident
# returns the mean difference of before and after incident bike rent counts as an integer
def get_diff_factored_WC(df, days = 8):
    lst_diff = [] 
    # get list of terrorism attack dates from df
    lst_of_attacks = get_dates_of_attack_lst(df)
    # for each attack date get list of surrounding dates depending on days passed to the function
    # and from that list get a a series of bike rent counts for that time period
    for elem in lst_of_attacks:
        time_series = get_rents_counts_WC_fatored(df, elem, days)
        # dealing with missing values at the beginning and the end of the date series by skipping them
        if time_series.isnull().sum() != 0:
            continue
        lst_diff.append(get_diff(time_series))
    return sum(lst_diff)/len(lst_diff)


In [412]:
# testing mean difference values for different time spans
print("Mean difference for 14 days: ", get_diff_factored_WC(df_WC, 14))
print("Mean difference for 12 days: ", get_diff_factored_WC(df_WC, 12))
print("Mean difference for 8 days: ", get_diff_factored_WC(df_WC, 8))
print("Mean difference for 4 days: ", get_diff_factored_WC(df_WC, 4))
print("Mean difference for 3 days: ", get_diff_factored_WC(df_WC, 3))
print("Mean difference for 2 days: ", get_diff_factored_WC(df_WC, 2))

Mean difference for 14 days:  867.4642857142857
Mean difference for 12 days:  -148.92899408284023
Mean difference for 8 days:  -2083.1235294117646
Mean difference for 4 days:  -1588.4709302325582
Mean difference for 3 days:  452.53757225433526
Mean difference for 2 days:  1080.8333333333333
