In [17]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

In [18]:
# Locations of the competition files, relative to this notebook.
train_path = '../../assets/data/Train.csv'
sample_submission_path = '../../assets/data/SampleSubmission.csv'

df = pd.read_csv(train_path)  # training data
submission = pd.read_csv(sample_submission_path)  # sample submission, filled in at the end of the notebook

In [19]:
# Keep only the rows that have a clicks value, then renumber the rows from 0.
df = df.dropna(subset=['clicks']).reset_index(drop=True)


In [20]:
# Only these three columns are used from here on; order the rows by ID, then date.
kept_columns = ['ID', 'date', 'clicks']
df = (
    df.loc[:, kept_columns]
    .sort_values(by=['ID', 'date'])
    .reset_index(drop=True)
)


In [21]:
# Sum the remaining columns per (date, ID) pair, then turn the group keys back into columns.
daily_totals = df.groupby(['date', 'ID']).sum()
grouped_df = daily_totals.reset_index()


In [22]:
# Convert the date column to datetime64.
grouped_df = grouped_df.assign(date=pd.to_datetime(grouped_df['date']))


Use the window size as the number of nearest neighbours (`n_neighbors = window`).

In [23]:
# Columns of each per-ID frame that get forecasted.
forecast_columns = ['clicks']


def knn_forecast(series, window, forecast_horizon, n_neighbors):
    """Forecast the next ``forecast_horizon`` values of ``series`` with a KNN regressor.

    The model's only feature is the position of each observation (0, 1, 2, ...).
    Every forecasted position lies past the end of the series, so its nearest
    neighbours are always the last ``n_neighbors`` observations. With
    scikit-learn's default uniform weights, every forecast step is therefore
    the average of those last values.

    Parameters
    ----------
    series : pandas.Series
        Observed values in chronological order.
    window : int
        Not used by the model; kept so existing callers keep working.
    forecast_horizon : int
        Number of future steps to predict.
    n_neighbors : int
        Number of neighbours to average. Values larger than ``len(series)``
        are clamped to ``len(series)``.

    Returns
    -------
    numpy.ndarray
        ``forecast_horizon`` predicted values.

    Raises
    ------
    ValueError
        If ``series`` is empty.
    """
    n_obs = len(series)
    if n_obs == 0:
        raise ValueError("knn_forecast needs at least one observation to forecast from")

    # Training data: position in the series -> observed value.
    X_train = np.arange(n_obs).reshape(-1, 1)
    y_train = series.values

    # scikit-learn raises at predict time when asked for more neighbours than
    # fitted samples, so short series must use fewer neighbours.
    effective_neighbors = min(n_neighbors, n_obs)
    knn = KNeighborsRegressor(n_neighbors=effective_neighbors)
    knn.fit(X_train, y_train)

    # Positions immediately after the last observation.
    X_forecast = np.arange(n_obs, n_obs + forecast_horizon).reshape(-1, 1)
    return knn.predict(X_forecast)


In [24]:
def add_knn_forecasts(group, forecast_horizon, window):
    """Build the forecast rows for one ID.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single ID, with a ``date`` column, an ``ID`` column and
        every column listed in ``forecast_columns``.
    forecast_horizon : int
        Number of days to forecast after the last observed date.
    window : int
        Passed to ``knn_forecast`` both as ``window`` and as ``n_neighbors``.

    Returns
    -------
    pandas.DataFrame
        One row per forecasted day with columns ``date``, the forecasted
        columns, ``ID`` and ``is_forecast`` (always True).
    """
    # Index by date so the series can be put on a regular daily grid.
    group = group.set_index('date')
    group.index = pd.to_datetime(group.index)
    # asfreq with a fill method needs a monotonic index, so sort first.
    group = group.sort_index()
    # Days missing between the first and last date are inserted and take the
    # values of the previous available day.
    group = group.asfreq('D', method='ffill')

    # The forecast covers the `forecast_horizon` days right after the last observed date.
    last_date = group.index.max()
    forecast_dates = pd.date_range(
        start=last_date + pd.Timedelta(days=1), periods=forecast_horizon, freq='D'
    )

    forecast_data = {
        col: knn_forecast(group[col], window, forecast_horizon, n_neighbors=window)
        for col in forecast_columns
    }

    forecast_df = pd.DataFrame(forecast_data, index=forecast_dates)
    forecast_df['ID'] = group['ID'].iloc[0]  # the whole group belongs to a single ID
    forecast_df['is_forecast'] = True  # marks these rows as predictions rather than observations

    # Move the dates back into a regular `date` column, like the input frame.
    return forecast_df.rename_axis('date').reset_index()


In [25]:
all_data = []  # one frame per ID: observed rows followed by forecast rows
window_size = 1  # also used as the number of neighbours by add_knn_forecasts
forecast_horizon = 16  # number of days to forecast per ID

# Re-running this cell must not forecast on top of earlier forecasts:
# if the frame already carries forecast rows, go back to the observed rows.
if 'is_forecast' in grouped_df.columns:
    observed_rows = ~grouped_df['is_forecast'].astype(bool)
    grouped_df = grouped_df.loc[observed_rows].drop(columns='is_forecast')

for _, group in grouped_df.groupby('ID'):
    forecast_df = add_knn_forecasts(group, forecast_horizon, window_size)
    # Flag observed rows explicitly so `is_forecast` is a proper boolean column
    # after the concat, with no NaN fill-in step needed afterwards.
    history_df = group.reset_index(drop=True).assign(is_forecast=False)
    all_data.append(pd.concat([history_df, forecast_df]))

# Stack every ID's frame and order the rows by ID, then date.
grouped_df = pd.concat(all_data).sort_values(by=['ID', 'date'])


  grouped_df['is_forecast'] = grouped_df['is_forecast'].fillna(False) # set missing values in is_forecast to False (to show that these rows were not forecasted)


In [26]:
# Split on the forecast flag: observed rows go to train, forecast rows to test.
forecast_flag = grouped_df['is_forecast']
train = grouped_df[forecast_flag.eq(False)]
test = grouped_df[forecast_flag.eq(True)]


In [27]:
sub = submission.copy()

# Submission IDs end in `_YYYY_MM_DD`: pull the date parts out into their own
# columns, strip that suffix from the ID, and build a datetime column.
date_parts = sub['ID'].str.extract(r'_(\d{4})_(\d{2})_(\d{2})')
sub[['year', 'month', 'day']] = date_parts
sub['ID'] = sub['ID'].str.replace(r'(_\d{4}_\d{2}_\d{2})$', '', regex=True)
sub['date'] = pd.to_datetime(sub[['year', 'month', 'day']])


In [28]:
# Inner join: keep only the forecast rows whose (ID, date) appears in the submission.
submission_keys = sub[['ID', 'date']]
filtered_test = test.merge(submission_keys, on=['ID', 'date'])

# Left join: every submission row is kept; rows without a forecast get NaN.
merge_df = sub.merge(filtered_test, on=['ID', 'date'], how='left')


In [29]:
# Preview: clicks_x comes from the submission frame, clicks_y from the forecast frame.
merge_df.head(n=5)

Unnamed: 0,ID,clicks_x,year,month,day,date,clicks_y,is_forecast
0,ID_5da86e71bf5dee4cf5047046,0,2024,1,22,2024-01-22,114.0,True
1,ID_5da86e71bf5dee4cf5047046,0,2024,1,29,2024-01-29,114.0,True
2,ID_5e43c29e6279884e2827d894,0,2024,2,21,2024-02-21,6.0,True
3,ID_5e43c29e6279884e2827d894,0,2024,2,28,2024-02-28,6.0,True
4,ID_5e4e7b480e374330ee151305,0,2023,12,4,2023-12-04,2.0,True


In [30]:
# Total forecasted clicks per (ID, date), stored under the name `sum_clicks`.
click_sums = (
    merge_df.groupby(['ID', 'date'])['clicks_y']
    .sum()
    .reset_index()
    .rename(columns={'clicks_y': 'sum_clicks'})
)


In [31]:
# Attach the per-(ID, date) totals, overwrite `clicks` with them rounded to
# the nearest whole number, and drop the helper column.
sub = (
    sub.merge(click_sums, on=['ID', 'date'], how='left')
    .assign(clicks=lambda frame: frame['sum_clicks'].round())
    .drop(columns='sum_clicks')
)

# Copy the predictions into the submission frame (aligned on the row index).
submission['clicks'] = sub['clicks']


In [32]:
# Write the filled-in submission next to the notebook, without the index column.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
