In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

In [2]:
df = pd.read_csv('Competition_data.csv')
sub = pd.read_csv('sub.csv')

In [3]:
df.head()

Unnamed: 0,Date,Close
0,1/1/2023 23:58:00,448.08
1,1/2/2023 23:58:00,448.08
2,1/3/2023 23:58:00,448.55
3,1/4/2023 23:58:00,449.01
4,1/5/2023 23:58:00,449.53


In [4]:
# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Extract date and time into separate columns
df['Only_Date'] = df['Date'].dt.date
df['Only_Time'] = df['Date'].dt.time

# Sort the DataFrame by 'Only_Date' and 'Only_Time'
df = df.sort_values(by=['Only_Date', 'Only_Time'])

df['Date'] = df['Only_Date']
df['Date'] = pd.to_datetime(df['Date'])
df.drop(['Only_Date', 'Only_Time'], axis=1, inplace=True)

# Reset index if desired
df.reset_index(drop=True, inplace=True)

In [5]:
df.head()

Unnamed: 0,Date,Close
0,2023-01-01,448.08
1,2023-01-02,448.08
2,2023-01-03,448.55
3,2023-01-04,449.01
4,2023-01-05,449.53


In [6]:
df.Date.max()

Timestamp('2024-05-06 00:00:00')

In [7]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# This function uses ARIMA to forecast future values of a time series using the index as input.
def knn_forecast(series, window, forecast_horizon, n_neighbors):
    """
    series: The time series data (e.g., historical_close).
    window: Not used in ARIMA, included for compatibility with original function signature.
    forecast_horizon: The number of future time steps to forecast.
    order: Tuple indicating the order of the ARIMA model (p, d, q).
    """
    # Fit the ARIMA model
    model = ARIMA(series, order=(1, 1, 1))
    model_fit = model.fit()

    # Forecast future values
    forecast = model_fit.forecast(steps=forecast_horizon)

    return forecast

# Example usage:
# Assuming `historical_close` is your time series data (a pandas Series).
forecast_columns = ['Close'] # This can be used as needed in your broader code context.


# Call the function
# forecast = knn_forecast(historical_close, window=13, forecast_horizon=forecast_horizon, order=order)

    

In [8]:
# this fn takes in group (a df with data for one specific ID), 
# and returns a lookalike df with forecasted clicks for the next forecast horizon(16days/2weeks) 
# based on the past window(13days/2weeks) of clicks from the group
def add_knn_forecasts(group, forecast_horizon, window):
    # group: A DataFrame containing data for one specific ID
    group = group.set_index('Date') # set the Date column as the index, these are dates for one specific ID
    group = group.asfreq('D', method='ffill') # set the frequency of the index to daily and forward fill the missing values

    last_date = group.index.max() # get the last Date in the index (last Date for that specific ID)
    forecast_dates = pd.date_range(start=last_date, periods=forecast_horizon + 1, freq='D')[1:] # a range of dates to be forecasted (from after the last Date to the forecast horizon)
    forecast_data = {} # for storing forecasted values for each column in forecast_columns (in this case, clicks)

    for col in forecast_columns: # in this case, for clicks:
        forecast_data[col] = knn_forecast(group[col], window, forecast_horizon, n_neighbors= window) # clicks = (the fn takes in d clicks column for each Date(day) for one specific ID, then returrns the future clicks for the forecast horizon (16days/2weeks))

    forecast_df = pd.DataFrame(forecast_data, index=forecast_dates) # a df containing forcasted clicks with forecast_dates as index
    forecast_df['is_forecast'] = True # indicates thst these rows have been forecasted

    group = group.reset_index() # turning group back to original form
    forecast_df = forecast_df.reset_index().rename(columns={'index': 'Date'}) # making forecast_df look exactly like group

    return forecast_df # returns the forecasted df with columns: Date, ID, clicks, is_forecast=True)


In [9]:
import math
window_size = int(math.sqrt(len(df))) # window size is the square root of the length of the df
forecast_horizon = 14
forecast_df = add_knn_forecasts(df, forecast_horizon, window_size) # get the forecasted df for that specific ID
df = pd.concat([df.reset_index(drop=True), forecast_df]) # concat the df for that ID with its forecasted df and append to empty list of all_data

df['is_forecast'] = df['is_forecast'].fillna(False) # set missing values in is_forecast to False (to show that these rows were not forecasted)


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


  df['is_forecast'] = df['is_forecast'].fillna(False) # set missing values in is_forecast to False (to show that these rows were not forecasted)


In [10]:
df.head()

Unnamed: 0,Date,Close,is_forecast
0,2023-01-01,448.08,False
1,2023-01-02,448.08,False
2,2023-01-03,448.55,False
3,2023-01-04,449.01,False
4,2023-01-05,449.53,False


In [11]:
train = df[df['is_forecast'] == False]
test = df[df['is_forecast'] == True] # test = a df containing only forecasted rows

In [12]:
test.head()

Unnamed: 0,Date,Close,is_forecast
0,2024-05-07,1383.96751,True
1,2024-05-08,1383.967398,True
2,2024-05-09,1383.967398,True
3,2024-05-10,1383.967398,True
4,2024-05-11,1383.967398,True


In [13]:
# sub = test[['Date', 'Close']] # sub = a df containing only Date and close columns
# sub['Date'] = pd.date_range(start='2024-05-22 23:58:00', periods=len(sub), freq='D') # set the Date column to a range of dates from 2021-07-01 to 16 days after
# # Format 'Date' column to match the desired format
# sub['Date'] = sub['Date'].dt.strftime('%-m/%-d/%Y %H:%M:%S')

In [14]:
# # Set the start_date with four-digit year including century
# start_date = pd.to_datetime('2024-05-22 23:58:00')

# # Generate date list with desired length
# date_list = pd.date_range(start=start_date, periods=len(sub), freq='D')

# # Assign the formatted date list to the 'Date' column
# sub['Date'] = date_list.strftime('%m/%d/%Y %H:%M:%S')

In [15]:
# # Convert 'Date' column to datetime
# sub['Date'] = pd.to_datetime(sub['Date'])

# # Format dates to remove leading zeros in month and day
# sub['Date'] = sub['Date'].dt.strftime('%-m/%-d/%Y %H:%M:%S')


In [16]:
# data = {
#     'Date': [
#         '5/22/2024 23:58:00', '5/23/2024 23:58:00', '5/24/2024 23:58:00',
#         '5/25/2024 23:58:00', '5/26/2024 23:58:00', '5/27/2024 23:58:00',
#         '5/28/2024 23:58:00', '5/29/2024 23:58:00', '5/30/2024 23:58:00',
#         '5/31/2024 23:58:00', '6/1/2024 23:58:00', '6/2/2024 23:58:00',
#         '6/3/2024 23:58:00', '6/4/2024 23:58:00'
#     ],
#     'Close': [1248.522727] * 14
# }
# df = pd.DataFrame(data)

In [17]:
# # Define the data
# data = {
#     'Date': [
#         '1/1/2023 23:58:00', '1/2/2023 23:58:00', '1/3/2023 23:58:00',
#         '1/4/2023 23:58:00', '1/5/2023 23:58:00', '1/6/2023 23:58:00',
#         '1/7/2023 23:58:00', '1/8/2023 23:58:00', '1/9/2023 23:58:00',
#         '1/10/2023 23:58:00', '1/11/2023 23:58:00', '1/12/2023 23:58:00',
#         '1/13/2023 23:58:00', '1/14/2023 23:58:00'
#     ],
#     'Close': [1248.522727] * 14
# }

# # Create DataFrame
# df = pd.DataFrame(data)

In [18]:
sub['Close'] = test['Close']

In [19]:
sub.head()

Unnamed: 0,Date,Close
0,2024-05-22,1383.96751
1,2024-05-23,1383.967398
2,2024-05-24,1383.967398
3,2024-05-25,1383.967398
4,2024-05-26,1383.967398


In [20]:
sub.to_csv('submission.csv', index=False)