In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

from config import API_KEY


In [None]:
# Data Collection: download daily price history for one ticker from Tiingo.
from tiingo import TiingoClient
from datetime import datetime, timedelta

TICKER = 'MSFT'
LOOKBACK_YEARS = 20  # history window, counted in 365-day years (leap days ignored)

config = {
    'api_key': API_KEY,
    'session': True  # Use requests.Session
}
client = TiingoClient(config)

# Request window: from roughly LOOKBACK_YEARS years ago up to now.
end_date = datetime.now()
start_date = end_date - timedelta(days=LOOKBACK_YEARS * 365)

# The API call below is given the dates as 'YYYY-MM-DD' strings.
DATE_FORMAT = '%Y-%m-%d'
start_date_str = start_date.strftime(DATE_FORMAT)
end_date_str = end_date.strftime(DATE_FORMAT)

df = client.get_dataframe(
    TICKER,
    frequency='daily',
    startDate=start_date_str,
    endDate=end_date_str,
)

In [None]:
# Keep an independent snapshot of the downloaded frame. A plain `df2 = df`
# would only alias the same object, so later in-place changes to `df`
# (such as assigning a new index) would silently change `df2` as well.
df2 = df.copy()
df

In [None]:
# Convert the index to business-day ('B') periods so it can be compared with,
# and reindexed onto, a complete business-day period range.
business_day_periods = pd.to_datetime(df.index).to_period('B')
df.index = business_day_periods

In [None]:
# Display the frame's index to inspect its type and values.
df.index



In [None]:
# Full period range spanning the first to the last label in the data;
# the frequency is taken from the period labels themselves.
first_period = df.index.min()
last_period = df.index.max()
idx = pd.period_range(start=first_period, end=last_period)
idx



In [None]:
# Labels that appear in only one of the two indexes, i.e. the periods of the
# full range that have no row in the data (presumably days the market was
# closed -- confirm).
missing_days = df.index.symmetric_difference(idx)
missing_days

In [None]:
# Reindex onto the full period range. Labels that were missing from the data
# become rows of NaN, which is reindex's default fill, so no explicit
# fill_value (and no numpy reference) is needed.
df = df.reindex(idx)
df.head()

In [None]:
# Forward-fill the gaps: each missing row takes the values of the most recent
# earlier row. `DataFrame.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling.
df = df.ffill()
df.head()

In [None]:
# Plot every column of the frame against the index on a single set of axes.
fig, ax = plt.subplots()
df.plot(ax=ax)
plt.show()

In [None]:
# Check if the index is a RangeIndex


In [None]:
# The statements below are commented out and therefore do NOT run; they sketch
# how a 'date' column could be derived from the index.
# df.reset_index(inplace=True)
# df.rename(columns={'index': 'new_index'}, inplace=True)  # Rename the old index column
# df['date'] = df['new_index'].to_timestamp('D').date
# df['date'] = df['date'].dt.date  # Extract the date part from the datetime index
# # 
# # Convert the 'date' column to datetime format


# Display the current state of the frame.
df

In [None]:
#  extract month and year from dates
# df['Month'] = [i.month for i in df['date']]
# df['Year'] = [i.year for i in df['date']]
# df['Day'] = [i.day for i in df['date']]

In [None]:
 
# # create a sequence of numbers
# df['Series'] = np.arange(1,len(df)+1)

In [None]:
# Keep the adjusted price columns and add calendar columns derived from the
# index. 'Year', 'Month', 'Day' and 'date' are created here because no earlier
# active cell creates them; selecting them directly raised a KeyError.
# Selecting the wanted columns also makes the former in-place drop
# unnecessary, and this cell can be re-run safely.
model_columns = ['Year', 'Month', 'Day', 'adjClose', 'adjHigh', 'adjLow', 'adjOpen', 'date']

# Work with timestamps whether the index holds periods or datetimes.
if isinstance(df.index, pd.PeriodIndex):
    row_dates = df.index.to_timestamp()
else:
    row_dates = pd.to_datetime(df.index)

df = df.assign(
    Year=row_dates.year,
    Month=row_dates.month,
    Day=row_dates.day,
    date=row_dates,
)[model_columns]
df.head()




In [None]:

# Lag features: column lag_k holds the adjClose value from k rows earlier.
# Rows are assumed to be in chronological order -- TODO confirm.
N_LAGS = 21  # lag_1 .. lag_21
lagged_close = {f'lag_{k}': df['adjClose'].shift(k) for k in range(1, N_LAGS + 1)}
df = df.assign(**lagged_close)

In [None]:
# Remove every row that contains a NaN. This includes the leading rows whose
# lag columns are empty because there is not enough earlier history to shift from.
df = df.dropna(axis='index', how='any')

In [None]:
# Independent (deep) copy of the feature frame, then display it.
df_copy = df.copy(deep=True)
df_copy

In [None]:
# Remove the 'date' column from the copy; `df` itself keeps it.
df_copy = df_copy.drop(columns=['date'])

In [None]:
# Preview the first rows of the copy.
df_copy.head()

In [None]:
# Positional 80/20 split without shuffling: the first 80% of the rows form the
# training set, the remaining rows form the test set.
TRAIN_FRACTION = 0.8
pos = int(round(len(df) * TRAIN_FRACTION))
train, test = df.iloc[:pos], df.iloc[pos:]
train.shape, test.shape


In [None]:
# Gather the names of the lag columns and print them as a double-quoted,
# comma-separated list.
lag_columns = [column for column in df.columns if 'lag_' in column]
lag_features = ', '.join(f'"{column}"' for column in lag_columns)
print(lag_features)




In [None]:
# Set up the PyCaret *regression* experiment with time-series cross-validation.
# The wildcard import is kept because later cells call compare_models,
# evaluate_model, predict_model and finalize_model unqualified.
from pycaret.regression import *

features = lag_features

# `feature_selection` is an on/off flag, not a list of columns, so passing the
# lag-name string to it did not restrict the predictors. To train on the lag
# columns only, every other non-target column is listed in `ignore_features`.
# NOTE(review): this also excludes the same-day adjHigh/adjLow/adjOpen values
# and the calendar columns from the predictors -- confirm this is the intent.
TARGET = 'adjClose'
lag_columns = [column for column in train.columns if 'lag_' in column]
non_lag_columns = [
    column for column in train.columns
    if column != TARGET and column not in lag_columns
]

close_s = setup(
    data=train,
    test_data=test,
    target=TARGET,
    fold_strategy='timeseries',
    ignore_features=non_lag_columns,
    data_split_shuffle=False,
    fold_shuffle=False,
    session_id=123,
)

In [None]:
# Compare the candidate models, ranked by MAE, and keep the result as `close_best`.
close_best = compare_models(sort = 'MAE')

Analyze Model

In [None]:
# Inspect the model selected by compare_models.
evaluate_model(close_best)

In [None]:
# predict_model is called without a `data` argument; presumably this scores the
# hold-out/test split given to setup -- confirm against the PyCaret docs.
prediction_holdout = predict_model(close_best)

Prediction

In [None]:
# Generate predictions for every row of `df` (training and test rows alike)
# with the selected model.
predictions = predict_model(close_best, data=df)


In [None]:
# Display the prediction frame.
predictions

In [None]:
# Attach the row dates to the predictions (aligned on the index). They are read
# from `df` because the 'date' column has already been dropped from `df_copy`,
# so `df_copy['date']` raises a KeyError.
predictions['date'] = df['date']

In [None]:
# Display the prediction frame after the 'date' assignment.
predictions

In [None]:
# Display the prediction frame (same display as the previous cell).
predictions

In [None]:
# Line plot of the actual target (adjClose) against the model's prediction.
# The prediction column is referenced as "prediction_label", the same name the
# final plot of this notebook uses.
fig = px.line(
    predictions,
    x='date',
    y=["adjClose", "prediction_label"],
    template='plotly_dark',
)

# Shade the period treated as the test set. The end date was "2024-31-01",
# which is not a valid date (month 31); it is now 31 January 2024.
# NOTE(review): this window is hard-coded and is not derived from the 80/20
# positional split used for training -- confirm it marks the intended period.
fig.add_vrect(x0="2023-01-01", x1="2024-01-31", fillcolor="grey", opacity=0.25, line_width=0)

# Show the plot
fig.show()

In [None]:
# Finalize the selected model (presumably a refit that includes the hold-out
# data -- confirm against the PyCaret docs) and display it.
final_best = finalize_model(close_best)
final_best

In [None]:
# Future scoring frame: one row per business day in a fixed date window.
# NOTE(review): this frame holds only date/Series/Year/Month/Day, while the
# training frame also carries adj* and lag_* columns and has no 'Series'
# column -- confirm the model can score it.
FUTURE_START = '2024-02-17'
FUTURE_END = '2024-03-17'
SERIES_START = 7548  # first value of the running 'Series' counter

future_dates = pd.date_range(start=FUTURE_START, end=FUTURE_END, freq='B')

future_df = pd.DataFrame({'date': future_dates})
future_df['Series'] = np.arange(SERIES_START, SERIES_START + len(future_dates))

# Calendar parts of each date, added in the order Year, Month, Day.
calendar_parts = {'Year': 'year', 'Month': 'month', 'Day': 'day'}
for column_name, attribute in calendar_parts.items():
    future_df[column_name] = [getattr(day, attribute) for day in future_dates]

future_df.head()

In [None]:
# #  Get the last date in your original DataFrame
# last_date = df_copy['date'].max()

# # Generate future dates starting from the next day after the last date
# future_dates = pd.date_range(start=last_date + pd.DateOffset(days=1), periods=365, freq='D')

# # Create a DataFrame with the future dates
# future_df = pd.DataFrame({'date': future_dates})

# # Add other columns to the future DataFrame if needed
# future_df['Year'] = future_df['date'].dt.year
# future_df['Month'] = future_df['date'].dt.month
# future_df['Day'] = future_df['date'].dt.day
# future_df

In [None]:

# Score the future-dated frame with the finalized model and display the result.
# NOTE(review): future_df only carries date/Series/Year/Month/Day, whereas the
# training frame also has adj* and lag_* columns -- confirm the model can
# score this frame.
predictions_future = predict_model(final_best, data=future_df)
predictions_future

In [None]:
# Stack the historical and the future predictions and index the result by date
# (without in-place mutation, so the cell can be re-run).
concat_df = pd.concat([predictions, predictions_future], axis=0).set_index('date')

# Plot the actual adjusted close next to the prediction. The raw "close" column
# is dropped earlier in the notebook, so referencing it here raised an error;
# "adjClose" is the modelled target. Future rows have no actual value, so that
# line simply ends where the history ends.
fig = px.line(
    concat_df,
    x=concat_df.index,
    y=["adjClose", "prediction_label"],
    template='plotly_dark',
)
fig.show()