<a href="https://colab.research.google.com/github/kubohenrique/VT_Forecast_Project/blob/main/VT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VT ETF Monthly Forecast by Henrique Kubo

## Library

In [1]:
import pandas as pd
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error



## Data import and cleaning

In [2]:
# Leave the two lines below commented out to display the plots interactively in Jupyter / Colab.
# Uncomment and run them to embed the plots in the notebook's HTML output instead.

#import plotly.offline as pyo
#pyo.init_notebook_mode(connected=True)

In [3]:
# Load the VT price history from the CSV file that sits next to the notebook.
csv_path = 'VT.csv'
data = pd.read_csv(csv_path)

# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2008-07-01,49.130001,49.689999,46.34,48.209999,34.467892,743400
1,2008-08-01,48.189999,50.02,46.139999,47.32,33.831573,719900
2,2008-09-01,47.32,47.689999,35.849998,43.009998,30.750139,1610800
3,2008-10-01,41.59,42.830002,29.0,33.790001,24.158274,3581500
4,2008-11-01,34.650002,35.990002,26.6,31.49,22.513874,2968700


In [4]:
# List the column labels that were read from the CSV.
data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [5]:
# Keep only the columns the forecast uses: the date and the closing price.
# Selecting the wanted columns (instead of dropping the others in place)
# keeps this cell re-runnable; a second in-place drop would raise KeyError
# because the dropped columns are already gone.
data = data[['Date', 'Close']].copy()

data

Unnamed: 0,Date,Close
0,2008-07-01,48.209999
1,2008-08-01,47.320000
2,2008-09-01,43.009998
3,2008-10-01,33.790001
4,2008-11-01,31.490000
...,...,...
183,2023-10-01,90.459999
184,2023-11-01,98.610001
185,2023-12-01,102.879997
186,2024-01-01,102.879997


In [6]:
# Line chart of the closing price over the whole history.
fig = px.line(data, x='Date', y='Close', title='VT Monthly Price Over Time')
fig.show()

## Data Preparation for Training the Model (Rolling Window)

In [7]:
def create_rolling_window_df(data, window_size):
    """Generate a DataFrame with rolling windows of closing prices.

    Each output row holds ``window_size`` consecutive values from ``data``
    (columns ``N-<window_size>`` ... ``N-1``), the value that immediately
    follows them (column ``N``), and the index label of that following
    value (column ``Target Date``).

    Parameters
    ----------
    data : pandas.Series
        Values in the order they should be windowed. The index labels are
        copied into the ``Target Date`` column.
    window_size : int
        Number of preceding values stored in each row. Must not be negative.

    Returns
    -------
    pandas.DataFrame
        One row per position in ``data`` that has ``window_size``
        predecessors, indexed 0..n-1. The frame is empty (but still has all
        columns) when ``data`` has no more than ``window_size`` entries.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f'window_size must be non-negative, got {window_size}')

    columns = ['Target Date'] + [f'N-{window_size - i}' for i in range(window_size)] + ['N']

    # Collect all rows first and build the frame once; assigning df.loc[i]
    # inside the loop enlarges the frame on every iteration.
    # .iloc makes the positional slicing explicit, since the Series is
    # indexed by labels (dates), not by position.
    rows = [
        [data.index[i + window_size], *data.iloc[i:i + window_size + 1]]
        for i in range(len(data) - window_size)
    ]

    return pd.DataFrame(rows, columns=columns)

# Number of preceding closes stored alongside each target value.
window_size = 5

# Index the closing prices by date so each window row can carry the date of
# the value it targets.
data_with_dates = data.set_index('Date')['Close']
window_df = create_rolling_window_df(
    data=data_with_dates,
    window_size=window_size,
)

window_df

Unnamed: 0,Target Date,N-5,N-4,N-3,N-2,N-1,N
0,2008-12-01,48.209999,47.320000,43.009998,33.790001,31.490000,32.980000
1,2009-01-01,47.320000,43.009998,33.790001,31.490000,32.980000,29.400000
2,2009-02-01,43.009998,33.790001,31.490000,32.980000,29.400000,26.600000
3,2009-03-01,33.790001,31.490000,32.980000,29.400000,26.600000,28.950001
4,2009-04-01,31.490000,32.980000,29.400000,26.600000,28.950001,32.439999
...,...,...,...,...,...,...,...
178,2023-10-01,92.269997,96.980003,100.589996,97.730003,93.180000,90.459999
179,2023-11-01,96.980003,100.589996,97.730003,93.180000,90.459999,98.610001
180,2023-12-01,100.589996,97.730003,93.180000,90.459999,98.610001,102.879997
181,2024-01-01,97.730003,93.180000,90.459999,98.610001,102.879997,102.879997


In [8]:
# Features: every column between 'Target Date' (first) and 'N' (last).
feature_columns = window_df.columns[1:-1]
X = window_df[feature_columns].to_numpy()

# Target: the value in column 'N'.
y = window_df['N'].to_numpy()

X.shape, y.shape

((183, 5), (183,))

In [9]:
# shuffle=False keeps the rows in their original order, so the test set is
# the final 20% of the windows rather than a random sample.
split = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = split

tuple(part.shape for part in split)

((146, 5), (37, 5), (146,), (37,))

In [10]:
# Fit a linear regression of the target value on the lagged values.
model = LinearRegression().fit(X_train, y_train)

# Fitted intercept and one coefficient per feature column.
model.intercept_, model.coef_

(0.7904369702048655,
 array([ 0.01678739, -0.02397864,  0.12123529, -0.09112064,  0.97147133]))

## Visualization and Evaluation

In [11]:
# Model output for the rows it was fitted on.
y_train_pred = model.predict(X_train)

# The test rows are the tail of window_df, so the training dates are
# everything before them; the index is reset to line up with the arrays.
train_dates = window_df['Target Date'].iloc[:-len(y_test)].reset_index(drop=True)
train_results = pd.DataFrame({
    'Date': train_dates,
    'Actual': y_train,
    'Predicted': y_train_pred,
})

# Overlay actual and predicted values for the training period.
fig = px.line(
    train_results,
    x='Date',
    y=['Actual', 'Predicted'],
    title='Actual vs Predicted Stock Prices on Training Data (Monthly)',
)
fig.show()

In [12]:

# Predict on the held-out test set.
# mean_absolute_error is already imported in the notebook's import cell,
# so it is not re-imported here.
y_pred = model.predict(X_test)

# Mean absolute error between the actual and predicted test values.
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

Mean Absolute Error: 3.74


In [13]:

# Visualize the predictions vs actual values on the test set.
# Take the dates from window_df['Target Date'] (the same source the training
# plot uses) so each date is the one stored with its window row, and reset
# the index so the frame is indexed 0..n-1 like the prediction arrays.
test_dates = window_df['Target Date'].iloc[-len(y_test):].reset_index(drop=True)
results = pd.DataFrame({'Date': test_dates, 'Actual': y_test, 'Predicted': y_pred})

fig = px.line(results, x='Date', y=['Actual', 'Predicted'], title='Actual vs Predicted Stock Prices')
fig.show()


In [14]:
# Convert this notebook (VT.ipynb) to HTML with nbconvert.
!jupyter nbconvert --to html VT.ipynb

[NbConvertApp] Converting notebook VT.ipynb to html
[NbConvertApp] Writing 673129 bytes to VT.html
