In [None]:
import numpy as np
import pandas as pd
import pandas_profiling as pp

import warnings
warnings.filterwarnings('ignore')

from geopy.distance import geodesic

# Load the ride log: a pipe-separated file indexed by ride id, with the
# creation timestamp parsed into a datetime column.
df = pd.read_csv(
    "data_hackathon_v04.csv",
    sep="|",
    parse_dates=["created_at"],
    index_col="ride_id",
)

In [None]:
from h3 import h3

# Tag every ride with the H3 cell (resolution 7) that contains its pickup point.
df["cell_id"] = df[["pickup_lat", "pickup_lng"]].apply(
    lambda row: h3.geo_to_h3(row["pickup_lat"], row["pickup_lng"], 7),
    axis=1,
)

In [None]:
def make_df_time(df):
    """Derive calendar and time-of-day features from the ``created_at`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``created_at`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame on ``df``'s index with the columns ``day``, ``month``,
        ``year``, ``is_year_start``, ``is_year_end``, ``dayofyear``,
        ``dayofweek``, ``week``, ``hour``, ``minute``, ``final_time`` and
        ``is_weekend`` (in that order). ``created_at`` itself is not included.
    """
    stamps = df["created_at"]
    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's frame.
    df_time = df[["created_at"]].copy()

    df_time["day"] = stamps.dt.day
    df_time["month"] = stamps.dt.month
    df_time["year"] = stamps.dt.year
    df_time["is_year_start"] = stamps.dt.is_year_start.astype(np.int8)
    df_time["is_year_end"] = stamps.dt.is_year_end.astype(np.int8)
    df_time["dayofyear"] = stamps.dt.dayofyear
    df_time["dayofweek"] = stamps.dt.dayofweek

    # ``Series.dt.week`` was removed in pandas 2.0; ``isocalendar().week`` is
    # its replacement. Fall back to the old accessor on pandas versions that
    # predate ``isocalendar``.
    if hasattr(stamps.dt, "isocalendar"):
        iso_week = stamps.dt.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert to the plain
        # dtypes ``.dt.week`` produced (int64, or float with NaN for NaT rows).
        if iso_week.isna().any():
            df_time["week"] = iso_week.astype("float64")
        else:
            df_time["week"] = iso_week.astype("int64")
    else:
        df_time["week"] = stamps.dt.week

    df_time["hour"] = stamps.dt.hour
    df_time["minute"] = stamps.dt.minute
    # Round to the nearest hour: minutes >= 30 bump the hour, wrapping 24 -> 0.
    df_time["final_time"] = (df_time["hour"] + df_time["minute"] // 30) % 24
    # dayofweek is 0..6 with Monday == 0, so integer division by 5 gives 1
    # only for Saturday (5) and Sunday (6).
    df_time["is_weekend"] = df_time["dayofweek"] // 5

    return df_time.drop(["created_at"], axis=1)

def in_square(x, y):
    """Return whether the point (x, y) lies inside the fixed bounding box.

    ``x`` is tested against the box's first-coordinate range and ``y`` against
    its second-coordinate range; both ranges include their end points.
    """
    x_max, y_min, x_min, y_max = 49.895468, 23.907734, 49.769849, 24.121698
    return (x_min <= x <= x_max) and (y_min <= y <= y_max)

def clear_df(df):
    """Keep only rides inside the bounding box that have a plausible distance.

    A ride is kept when both its pickup and its drop-off point pass
    ``in_square`` and its recorded ``ride_distance`` differs from the geodesic
    distance between the two points (in km) by less than 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``pickup_lat``, ``pickup_lng``, ``dropoff_lat``,
        ``dropoff_lng`` and ``ride_distance`` (presumably kilometres, since it
        is compared with a km value -- confirm against the data source).
        The frame is left untouched: no helper columns are written into it.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows with the same columns as
        ``df``.
    """
    pickup_ok = np.array(
        [in_square(lat, lng) for lat, lng in zip(df["pickup_lat"], df["pickup_lng"])],
        dtype=bool,
    )
    dropoff_ok = np.array(
        [in_square(lat, lng) for lat, lng in zip(df["dropoff_lat"], df["dropoff_lng"])],
        dtype=bool,
    )
    df_city = df[pickup_ok & dropoff_ok]

    # Geodesic distance is only computed for rows that passed the box filter.
    coords = df_city[["pickup_lat", "pickup_lng", "dropoff_lat", "dropoff_lng"]].values
    straight_km = np.array(
        [geodesic(row[:2], row[2:]).km for row in coords],
        dtype=float,
    )
    plausible = np.abs(straight_km - df_city["ride_distance"].values) < 10

    # Return a real copy so callers can add columns without touching a slice.
    return df_city[plausible].copy()

def add_agg(df_in, df_out, group_by, name, aggs, add_name):
    """Left-join per-group statistics of one column onto ``df_out``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Source rows; must contain the columns ``group_by`` and ``name``.
    df_out : pandas.DataFrame
        Frame indexed by group key that receives the new columns.
    group_by : str
        Column of ``df_in`` to group on.
    name : str
        Column of ``df_in`` to aggregate.
    aggs : sequence of str
        Aggregation names understood by pandas, plus the special value
        ``"mode"``, which is computed with ``np.bincount`` and therefore
        needs non-negative integer data. The sequence is not modified.
    add_name : str
        Prefix for the generated column names.

    Returns
    -------
    pandas.DataFrame
        ``df_out`` with one extra column per aggregation, named
        ``"<add_name>_<group_by>_<name>_<agg>"``. The ``mode`` column, when
        requested, always comes last. Groups absent from ``df_in`` get NaN.
    """
    grouped = df_in.groupby(group_by)[name]
    # Filter into a new list instead of calling aggs.remove(): the list
    # belongs to the caller and may be reused for further calls.
    plain_aggs = [agg for agg in aggs if agg != "mode"]
    if plain_aggs:
        stats = grouped.agg(plain_aggs)
    else:
        # Only "mode" was requested; .agg([]) would raise, so start from an
        # empty frame keyed by the group labels.
        stats = pd.DataFrame(index=grouped.size().index)
    if "mode" in aggs:
        stats["mode"] = grouped.apply(lambda values: np.bincount(values).argmax())
    stats.columns = ["_".join([add_name, group_by, name, column]) for column in stats.columns]
    return df_out.merge(stats, left_index=True, right_index=True, how="left")

# (source column, statistics) pairs computed for every time window, listed in
# the order their columns appear in the result.
_WINDOW_STATS = (
    ("canceled_by_client", ("mean", "median", "std", "count")),
    ("canceled_by_driver", ("mean", "median", "std")),
    ("ride_to_suburb", ("mean", "std")),
    ("ride_distance", ("mean", "std", "median", "min", "max")),
    ("dayofweek", ("median", "mode")),
    ("final_time", ("median", "mode")),
    ("is_weekend", ("median", "std", "mean", "mode")),
)

# (window length in days, column-name prefix) of the recent-activity windows.
# NOTE(review): the 8-day and 6-day windows share the "3_month" prefix, so
# DataFrame.merge disambiguates their columns with "_x"/"_y" suffixes, and the
# prefixes do not match the window lengths. They are kept byte-for-byte so the
# returned schema is unchanged -- confirm the intended labels.
_RECENT_WINDOWS = ((8, "3_month"), (6, "3_month"), (4, "month"), (3, "week"), (2, "day"))

# Window whose membership defines the "<name>_active" flag.
_ACTIVE_WINDOW_DAYS = 4


def _add_window_stats(df_window, df_feats, name, label, stats=_WINDOW_STATS):
    """Join every (column, statistics) pair of ``stats`` for one window onto ``df_feats``."""
    for column, aggs in stats:
        # Pass a fresh list each time: the statistics table is shared between
        # windows and must survive callees that edit their ``aggs`` argument.
        df_feats = add_agg(df_window, df_feats, name, column, list(aggs), label)
    return df_feats


def make_aggs(df, name, days, LAST_DAY):
    """Build per-group ride statistics over several look-back windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rides with a datetime ``created_at`` column, the grouping column
        ``name`` and the columns listed in ``_WINDOW_STATS``.
    name : str
        Column to group on (e.g. ``"cell_id"``).
    days : int or float
        Threshold on the group's ``canceled_by_client`` count in the "all"
        window; groups whose count is not greater than ``days`` are dropped.
    LAST_DAY : timestamp-like
        Reference moment; a row's age is ``(LAST_DAY - created_at).dt.days``.

    Returns
    -------
    pandas.DataFrame
        One row per retained group. Columns are the "all" window statistics
        (rows with age > 0), followed by the statistics of each window in
        ``_RECENT_WINDOWS`` (rows with ``0 <= age < length``), plus a boolean
        ``"<name>_active"`` column placed after the 4-day window's statistics.
        Missing values are replaced by 0.0. The shape is printed as a side
        effect.
    """
    # Age of every ride in whole days, computed once for all windows.
    age_days = (LAST_DAY - df.created_at).dt.days
    feats = pd.DataFrame(index=df[name].unique())

    history = df[age_days > 0]
    # The count column must exist before the low-volume groups can be dropped.
    feats = _add_window_stats(history, feats, name, "all", _WINDOW_STATS[:1])
    feats = feats[feats[f"all_{name}_canceled_by_client_count"] > days]
    feats = _add_window_stats(history, feats, name, "all", _WINDOW_STATS[1:])

    for span, label in _RECENT_WINDOWS:
        # Floor division is 0 exactly for ages in [0, span).
        recent = df[age_days // span == 0]
        feats = _add_window_stats(recent, feats, name, label)
        if span == _ACTIVE_WINDOW_DAYS:
            feats[f"{name}_active"] = feats.index.isin(recent[name])

    print(feats.shape)
    return feats.fillna(0.0)

In [None]:
%%time
# Keep only rides that pass clear_df's bounding-box and distance checks.
df = clear_df(df)

# Latest ride timestamp in the cleaned data; make_aggs measures row age
# relative to the reference moment it is given.
LAST_DAY = df.created_at.max()
LAST_DAY

In [None]:
%%time
# The time features share df's index, so they are joined back index-to-index.
df_time = make_df_time(df)
df = df.merge(df_time, how="left", left_index=True, right_index=True)

In [None]:
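# NOTE: disabled experiment -- per-driver and per-user aggregates are not
# merged into `df`. make_aggs() takes the reference timestamp as its fourth
# argument, so LAST_DAY must be passed when these calls are re-enabled.
# %%time
# df_driver = make_aggs(df, "driver_id", 30, LAST_DAY)
# %%time
# df_user = make_aggs(df, "user_id", 10, LAST_DAY)
# df = df.merge(df_driver, how="left", left_on="driver_id", right_index=True)
# df = df.merge(df_user, how="left", left_on="user_id", right_index=True)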

In [None]:
# Discard rows with any missing value before aggregating. `axis` is passed by
# keyword because pandas 2.0 made the arguments of DataFrame.dropna
# keyword-only; the positional form raises TypeError there.
df.dropna(axis=0, inplace=True)
# Per-cell statistics; make_aggs drops cells whose history count is not above 26.
df_cell = make_aggs(df, "cell_id", 26, LAST_DAY)
# Keep only the rides that belong to a retained cell and persist them.
df = df[df.cell_id.isin(df_cell.index)]
df.to_csv("new_csv.csv")

In [None]:
# Per-cell count of canceled_by_client values in the "all" window.
df_cell["all_cell_id_canceled_by_client_count"]

In [None]:
# Size of the ride table after restricting it to the retained cells.
df.shape

In [None]:
# Retained cell ids in df_cell's index order, and how many there are.
center = list(df_cell.index)
center_size = len(center)

In [None]:
# Rides whose cell id is one of the retained cells.
df_ = df[df["cell_id"].isin(center)]

In [None]:
from datetime import timedelta

from keras.utils import to_categorical

In [None]:
# Calendar date of the latest ride; the timedelta(0) offset leaves it unchanged.
day = LAST_DAY.date() - timedelta(0)

In [None]:
day_nums = 30
# Time-of-day columns carry no information for a whole-day sample.
redundant_cols = ["hour", "minute", "final_time"]
# Position of every retained cell along the last axis of `y`.
cell_pos = {cell: j for j, cell in enumerate(center)}

X = []
# y[final_time, sample, cell] = number of rides created in that slot and cell.
y = np.zeros((24, day_nums, center_size))
for k in range(0, day_nums):
    # NOTE(review): samples start 22 days before LAST_DAY; the reason for the
    # offset is not visible in this notebook -- confirm.
    day = LAST_DAY.date() - timedelta(k + 22)
    # Timestamp at midnight of `day`: make_aggs subtracts the datetime column
    # from its reference moment, which a plain datetime.date does not support.
    day_ts = pd.Timestamp(day)

    df__time = make_df_time(pd.DataFrame({"created_at": [day_ts]}))
    df__time.drop(redundant_cols, axis=1, inplace=True)
    # to_categorical expects class ids in 0..num_classes-1, while month is
    # 1..12; without the shift December raises IndexError and slot 0 is unused.
    df__time = np.int16(np.concatenate((to_categorical(df__time["month"] - 1, 12)[0],
                   to_categorical(df__time["dayofweek"], 7)[0],
                   df__time.drop(["month", "dayofweek"], axis=1).values[0])))

    # Flatten the per-cell statistics as of `day` into one feature vector.
    res = make_aggs(df_, "cell_id", 0, day_ts).values.reshape(1, -1)[0]
    X.append(np.concatenate((df__time, res)).reshape(1, -1))

    group = df_[df_.created_at.dt.date == day].groupby(["final_time", "cell_id"]).created_at.count()
    # Single pass over the observed (slot, cell) pairs instead of probing all
    # 24 x center_size combinations; unobserved pairs keep their zero.
    for (slot, cell), rides in group.items():
        j = cell_pos.get(cell)
        if j is not None:
            y[int(slot)][k][j] = rides

In [None]:
# Stack the per-day feature rows into one array; the last line shows its shape.
X = np.array(X)
X.shape

In [None]:
# Targets were allocated as (24, day_nums, center_size).
y.shape

In [None]:
from keras.layers import Dense, Flatten, LSTM, Input, BatchNormalization, Dropout
from keras.models import Model
import keras.backend as K 

In [None]:
# The input shape is one sample of X, i.e. X's shape without the leading axis.
inp = Input(tuple(X.shape[1:]))
lstm1 = LSTM(units=128, activation="relu", return_sequences=True)(inp)
# lstm2 = LSTM(units=128, activation="relu", return_sequences=True)(lstm1)


def _count_head(features):
    """Build one Flatten -> Dense(128) -> BatchNorm -> Dense(center_size) output branch."""
    flat = Flatten()(features)
    hidden = Dense(units=128, activation="relu")(flat)
    normed = BatchNormalization()(hidden)
    return Dense(center_size, activation="relu")(normed)


# One independent output branch per entry along y's first axis.
outs = [_count_head(lstm1) for _ in range(y.shape[0])]
model = Model(inp, outs)

In [None]:
# Print the layer-by-layer description of the model.
model.summary()

In [None]:
from keras.metrics import mean_squared_logarithmic_error

In [None]:
# Rebuild the model from the same input and output tensors before compiling.
model = Model(inp, outs)


def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error, reduced over the last axis."""
    log_gap = K.log(y_pred + 1.) - K.log(y_true + 1.)
    return K.sqrt(K.mean(K.square(log_gap), axis=-1))


# `rmsle` is not passed to compile(); the loss used is Keras'
# mean_squared_logarithmic_error.
model.compile(
    optimizer="adam",
    loss=mean_squared_logarithmic_error,
    metrics=["accuracy"],
)

In [None]:
# list(y) splits the targets along their first axis into one array per model output.
model.fit(X, list(y), batch_size=2, epochs=10000)