In [188]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

# Display options for data frames: raise the column and cell-width limits and
# render every float with three decimal places.
pd.options.display.max_columns = 5400
pd.options.display.max_colwidth = 500
pd.options.display.float_format = lambda value: '%.3f' % value

In [189]:
# Read the two CSV inputs from ../data_clean. The bike file is decoded as
# ISO-8859-1 and the terrorism file as UTF-8.
bike_data = pd.read_csv(
    "../data_clean/london_merged_clean.csv",
    encoding="ISO-8859-1",
)
terrorism_data = pd.read_csv(
    "../data_clean/global_terrorism_clean_uk_only.csv",
    encoding='utf-8',  # "ISO-8859-1" was the alternative tried here
)

In [190]:
# Remove the "Unnamed: 0" column (presumably an index column written out when
# the CSV was saved -- TODO confirm). errors="ignore" keeps this cell
# re-runnable: a second run no longer raises KeyError once the column is gone.
terrorism_data = terrorism_data.drop(columns=["Unnamed: 0"], errors="ignore")

In [191]:
# Left join on 'date': every bike row is kept, and rows without a matching
# terrorism record get NaN in the terrorism columns.
df = pd.merge(bike_data, terrorism_data, how='left', on='date')

In [192]:
# Replace missing event ids with the sentinel string 'NAN' so the next cell
# can turn the column into a boolean attack flag.
df = df.assign(event_id=df["event_id"].replace(np.nan, 'NAN'))

In [193]:
# True where the row carries an event id, False where it holds the 'NAN' sentinel.
df["event_id"] = df["event_id"].ne("NAN").to_numpy()

In [194]:
# The column now holds a boolean flag rather than an id, so name it accordingly.
df = df.rename(columns={"event_id": "attack(y/n)"})

In [195]:
# Parse the 'date' column into pandas datetimes.
df = df.assign(date=pd.to_datetime(df['date']))

In [196]:
# Index the frame by date so the helper functions below can look rows up by day.
df = df.set_index(keys="date")

In [197]:
# Independent copy holding only the rent counts and the attack flag.
df_base = df.loc[:, ["bike_rents", 'attack(y/n)']].copy()

In [198]:
# Bare reference to the frame; it is not the cell's last expression, so
# nothing is displayed for it.
df_base

# Keep only the first row for every date label, dropping later rows that
# repeat an index value.
df_base = df_base[~df_base.index.duplicated(keep='first')]

In [199]:
def get_dates_of_attack_lst(df):
    """Return the distinct index labels of the rows flagged as attacks.

    Rows are selected where the ``attack(y/n)`` column equals ``True``. Their
    index labels are returned as a list in order of first appearance, with
    repeated labels collapsed into a single entry.
    """
    flagged_rows = df[df["attack(y/n)"] == True]
    first_seen = {}
    for label in flagged_rows.index:
        # dicts preserve insertion order, so the first occurrence fixes the position
        first_seen.setdefault(label, None)
    return list(first_seen)
    

In [200]:
def get_date_index(date, days = 8):
    """Build the window of consecutive dates centred on ``date``.

    Two daily ranges of ``days`` entries each are generated, one ending at
    ``date`` and one starting at ``date``. They are concatenated and
    de-duplicated while keeping order, so ``date`` appears once in the middle.
    With the default ``days=8`` the result holds 15 dates: 7 before, the date
    itself, and 7 after.

    Returns the window converted with ``pd.to_datetime``.
    """
    leading = pd.date_range(end=date, periods=days)
    trailing = pd.date_range(start=date, periods=days)
    # dict keys keep insertion order and drop the repeated centre date
    ordered_unique = dict.fromkeys([*leading, *trailing])
    return pd.to_datetime(list(ordered_unique))

In [201]:
def get_rents_counts(df, date_index, days = 8):
    """Return the ``bike_rents`` values for the window around one date.

    Parameters
    ----------
    df : DataFrame indexed by date, with a ``bike_rents`` column. The index
        must not contain duplicate labels (``reindex`` rejects them).
    date_index : the single date the window is centred on.
    days : window size forwarded to ``get_date_index``; defaults to 8, the
        value that helper uses when called without it.

    Returns
    -------
    Series of rent counts labelled by every date of the window. Dates that
    are absent from ``df`` are present in the result with NaN.
    """
    time_ser = get_date_index(date_index, days)
    # reindex instead of ``series[labels]``: list-style label lookup raises
    # KeyError on pandas >= 1.0 when any label is missing, whereas callers
    # rely on missing dates coming back as NaN.
    return df['bike_rents'].reindex(time_ser)

In [202]:
def get_diff(ser):
    """Return (sum of the trailing half) - (sum of the leading half) of ``ser``.

    ``ser`` is any sliceable sequence of numbers (list, Series, ...). Both
    halves hold ``len(ser) // 2`` elements:

    * odd length  -- the middle element (the attack day in this notebook) is
      left out of both sums;
    * even length -- the sequence is split into two equal halves.

    An empty or single-element input yields 0.
    """
    chunk_size = len(ser) // 2
    before_sum = sum(ser[:chunk_size])
    # Anchor the trailing half from the end so both halves always have the
    # same size. ``len(ser) - chunk_size`` is used rather than
    # ``-chunk_size`` because ``ser[-0:]`` would select everything.
    after_sum = sum(ser[len(ser) - chunk_size:])
    return after_sum - before_sum

In [203]:
def get_diff_df(df, days = 8):
    """Average change in bike rents after vs. before each attack date.

    For every distinct attack date in ``df`` a window of surrounding dates is
    built with ``get_date_index(date, days)``. The rent counts of that window
    are looked up and ``get_diff`` computes (rents after) - (rents before).
    Windows that contain a date missing from ``df`` (typically at the start or
    end of the data) are skipped.

    Parameters
    ----------
    df : DataFrame indexed by date with ``bike_rents`` and ``attack(y/n)``
        columns. The index must not contain duplicate labels.
    days : window size handed to ``get_date_index``. The default 8 gives
        7 days before and 7 days after each attack.

    Returns
    -------
    Mean of the per-attack differences, or NaN when no attack has a complete
    window.
    """
    lst_diff = []
    rents = df['bike_rents']
    for attack_date in get_dates_of_attack_lst(df):
        # ``days`` is forwarded here; previously the argument was accepted
        # but the window was always built with the default size.
        window = get_date_index(attack_date, days)
        # reindex yields NaN for dates absent from the data instead of
        # raising KeyError, which is what the completeness check below needs.
        time_series = rents.reindex(window)
        if time_series.isnull().any():
            continue
        lst_diff.append(get_diff(time_series))
    if not lst_diff:
        # nothing to average -- avoid ZeroDivisionError
        return float("nan")
    return sum(lst_diff) / len(lst_diff)


In [204]:
# Average after-minus-before rent difference across all attacks, using the 8-day window setting.
get_diff_df(df_base, days=8)

-1360.7882352941176