In [21]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

In [22]:
df = pd.read_csv('Competition_data.csv')
sub = pd.read_csv('sub.csv')

In [23]:
df.head()

Unnamed: 0,Date,Close
0,1/1/2023 23:58:00,448.08
1,1/2/2023 23:58:00,448.08
2,1/3/2023 23:58:00,448.55
3,1/4/2023 23:58:00,449.01
4,1/5/2023 23:58:00,449.53


In [24]:
# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Extract date and time into separate columns
df['Only_Date'] = df['Date'].dt.date
df['Only_Time'] = df['Date'].dt.time

# Sort the DataFrame by 'Only_Date' and 'Only_Time'
df = df.sort_values(by=['Only_Date', 'Only_Time'])

# Reset index if desired
df.reset_index(drop=True, inplace=True)

In [25]:
df.head()

Unnamed: 0,Date,Close,Only_Date,Only_Time
0,2023-01-01 23:58:00,448.08,2023-01-01,23:58:00
1,2023-01-02 23:58:00,448.08,2023-01-02,23:58:00
2,2023-01-03 23:58:00,448.55,2023-01-03,23:58:00
3,2023-01-04 23:58:00,449.01,2023-01-04,23:58:00
4,2023-01-05 23:58:00,449.53,2023-01-05,23:58:00


In [26]:
df.Date.max()

Timestamp('2024-05-06 23:58:00')

In [27]:
forecast_columns = ['Close']
# This function uses KNN to forecast future values of a time series using the index as input.
def knn_forecast(series, window, forecast_horizon, n_neighbors): 
# it takes in a series(the series is the 'clicks' column in this case), 
# window size(num of past data in the series to consider - in this case I used 13 (approx 2 weeks in the past)), 
# forecast horizon (num of days to forecast into the future - in this case I used 16 (approx 2 weeks into the future)),
# and number of neighbors (number of neighbours to consider in the KNN model - in this case I used 1)
    # Prepare training data for KNN
    X_train = np.arange(len(series)).reshape(-1, 1) # index of the series
    y_train = series.values # values of the series

    # Initialize KNN model
    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train) # input is index, output is value

    # Predict the next values using the trained KNN model
    last_index = len(series) - 1 # index of the last value in the series
    X_forecast = np.arange(last_index + 1, last_index + 1 + forecast_horizon).reshape(-1, 1) # index of the values to forecast so as to predict the values for that index
    forecast = knn.predict(X_forecast)

    return forecast # return the forecasted values
    

In [28]:
# this fn takes in group (a df with data for one specific ID), 
# and returns a lookalike df with forecasted clicks for the next forecast horizon(16days/2weeks) 
# based on the past window(13days/2weeks) of clicks from the group
def add_knn_forecasts(group, forecast_horizon, window):
    # group: A DataFrame containing data for one specific ID
    group = group.set_index('Date') # set the Date column as the index, these are dates for one specific ID
    group = group.asfreq('D', method='ffill') # set the frequency of the index to daily and forward fill the missing values

    last_date = group.index.max() # get the last Date in the index (last Date for that specific ID)
    forecast_dates = pd.date_range(start=last_date, periods=forecast_horizon + 1, freq='D')[1:] # a range of dates to be forecasted (from after the last Date to the forecast horizon)
    forecast_data = {} # for storing forecasted values for each column in forecast_columns (in this case, clicks)

    for col in forecast_columns: # in this case, for clicks:
        forecast_data[col] = knn_forecast(group[col], window, forecast_horizon, n_neighbors= window) # clicks = (the fn takes in d clicks column for each Date(day) for one specific ID, then returrns the future clicks for the forecast horizon (16days/2weeks))

    forecast_df = pd.DataFrame(forecast_data, index=forecast_dates) # a df containing forcasted clicks with forecast_dates as index
    forecast_df['is_forecast'] = True # indicates thst these rows have been forecasted

    group = group.reset_index() # turning group back to original form
    forecast_df = forecast_df.reset_index().rename(columns={'index': 'Date'}) # making forecast_df look exactly like group

    return forecast_df # returns the forecasted df with columns: Date, ID, clicks, is_forecast=True)


In [29]:
import math
window_size = int(math.sqrt(len(df))) # window size is the square root of the length of the df
forecast_horizon = 14
forecast_df = add_knn_forecasts(df, forecast_horizon, window_size) # get the forecasted df for that specific ID
df = pd.concat([df.reset_index(drop=True), forecast_df]) # concat the df for that ID with its forecasted df and append to empty list of all_data

df['is_forecast'] = df['is_forecast'].fillna(False) # set missing values in is_forecast to False (to show that these rows were not forecasted)


  df['is_forecast'] = df['is_forecast'].fillna(False) # set missing values in is_forecast to False (to show that these rows were not forecasted)


In [30]:
df.head()

Unnamed: 0,Date,Close,Only_Date,Only_Time,is_forecast
0,2023-01-01 23:58:00,448.08,2023-01-01,23:58:00,False
1,2023-01-02 23:58:00,448.08,2023-01-02,23:58:00,False
2,2023-01-03 23:58:00,448.55,2023-01-03,23:58:00,False
3,2023-01-04 23:58:00,449.01,2023-01-04,23:58:00,False
4,2023-01-05 23:58:00,449.53,2023-01-05,23:58:00,False


In [31]:
train = df[df['is_forecast'] == False]
test = df[df['is_forecast'] == True] # test = a df containing only forecasted rows

In [32]:
test.head()

Unnamed: 0,Date,Close,Only_Date,Only_Time,is_forecast
0,2024-05-07 23:58:00,1248.522727,,,True
1,2024-05-08 23:58:00,1248.522727,,,True
2,2024-05-09 23:58:00,1248.522727,,,True
3,2024-05-10 23:58:00,1248.522727,,,True
4,2024-05-11 23:58:00,1248.522727,,,True


In [33]:
# sub = test[['Date', 'Close']] # sub = a df containing only Date and close columns
# sub['Date'] = pd.date_range(start='2024-05-22 23:58:00', periods=len(sub), freq='D') # set the Date column to a range of dates from 2021-07-01 to 16 days after
# # Format 'Date' column to match the desired format
# sub['Date'] = sub['Date'].dt.strftime('%-m/%-d/%Y %H:%M:%S')

In [34]:
# # Set the start_date with four-digit year including century
# start_date = pd.to_datetime('2024-05-22 23:58:00')

# # Generate date list with desired length
# date_list = pd.date_range(start=start_date, periods=len(sub), freq='D')

# # Assign the formatted date list to the 'Date' column
# sub['Date'] = date_list.strftime('%m/%d/%Y %H:%M:%S')

In [35]:
# # Convert 'Date' column to datetime
# sub['Date'] = pd.to_datetime(sub['Date'])

# # Format dates to remove leading zeros in month and day
# sub['Date'] = sub['Date'].dt.strftime('%-m/%-d/%Y %H:%M:%S')


In [36]:
# data = {
#     'Date': [
#         '5/22/2024 23:58:00', '5/23/2024 23:58:00', '5/24/2024 23:58:00',
#         '5/25/2024 23:58:00', '5/26/2024 23:58:00', '5/27/2024 23:58:00',
#         '5/28/2024 23:58:00', '5/29/2024 23:58:00', '5/30/2024 23:58:00',
#         '5/31/2024 23:58:00', '6/1/2024 23:58:00', '6/2/2024 23:58:00',
#         '6/3/2024 23:58:00', '6/4/2024 23:58:00'
#     ],
#     'Close': [1248.522727] * 14
# }
# df = pd.DataFrame(data)

In [37]:
# # Define the data
# data = {
#     'Date': [
#         '1/1/2023 23:58:00', '1/2/2023 23:58:00', '1/3/2023 23:58:00',
#         '1/4/2023 23:58:00', '1/5/2023 23:58:00', '1/6/2023 23:58:00',
#         '1/7/2023 23:58:00', '1/8/2023 23:58:00', '1/9/2023 23:58:00',
#         '1/10/2023 23:58:00', '1/11/2023 23:58:00', '1/12/2023 23:58:00',
#         '1/13/2023 23:58:00', '1/14/2023 23:58:00'
#     ],
#     'Close': [1248.522727] * 14
# }

# # Create DataFrame
# df = pd.DataFrame(data)

In [38]:
sub['Close'] = test['Close']

In [39]:
sub.head()

Unnamed: 0,Date,Close
0,2024-05-22,1248.522727
1,2024-05-23,1248.522727
2,2024-05-24,1248.522727
3,2024-05-25,1248.522727
4,2024-05-26,1248.522727


In [40]:
sub.to_csv('submission.csv', index=False)