In [2]:
import os
import sys

# Resolve the parent of the current working directory and register it on the
# import path (only once), so modules living one level above the notebook's
# working directory can be imported.
project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [None]:
!pip install pandas openpyxl
!pip install statsmodels
!pip install arch
!pip install tensorflow
!pip install boto3
!pip install requests
!pip install numpy
!pip install joblib matplotlib
!pip install prophet
!pip install tabulate

Collecting numpy>=1.22.3 (from statsmodels)
  Downloading numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.22.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mkl-fft 1.3.10 requires mkl, which is not installed.
hdijupyterutils 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pa

In [None]:
import pandas as pd
import numpy as np
import boto3
import joblib
import tarfile
from prophet import Prophet
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from scripts.preprocessing import load_data_csv, make_stationary, split_data, save_model_local, upload_model_s3, calculate_metrics
from prophet import Prophet
from scipy import stats
import seaborn as sns
from tabulate import tabulate

In [None]:
# Load data
# `bucket` and `file_key` identify the CSV object to read (presumably in S3 —
# the same bucket is later passed to upload_model_s3; confirm).
bucket = 'bk-price-prediction-data'
file_key = 'data/OIL_OLIVE/OIL_OLIVE_DATASET_1.csv'
separator = ','  # field delimiter handed to the loader
# load_data_csv is a project helper from scripts.preprocessing; later cells use
# its result as a pandas DataFrame with DATE and PRICE columns.
df = load_data_csv(bucket, file_key, separator)
df.info()  # summary of columns, dtypes and non-null counts

In [None]:
# Drop rows with missing values (rebinding instead of mutating in place).
df = df.dropna()

# Collapse to one row per DATE using the mean PRICE.
# NOTE(review): the previous comment said "median" while the code computes the
# mean — confirm that the mean is the intended aggregate.
df_merge = df.groupby('DATE', as_index=False)['PRICE'].mean()

# Display the result. info() prints its own summary and returns None, so it is
# not wrapped in print() (which would emit a stray "None").
df_merge.info()

In [None]:
# Turn DATE into a chronologically sorted DatetimeIndex.
# The guard makes the cell safe to re-run: once DATE is the index there is no
# DATE column left to convert, and the original would raise KeyError.
if df_merge.index.name != 'DATE':
    df_merge = (
        df_merge
        .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
        .set_index('DATE')
        # groupby ordered the rows by the raw DATE values; sort again after
        # parsing so the time series is in true chronological order.
        .sort_index()
    )

In [None]:
# Remove outliers: keep only rows whose PRICE z-score has absolute value below 3.
price_zscores = np.abs(stats.zscore(df_merge['PRICE']))
# .copy() yields an independent frame, so the PRICE column assignment in the
# normalization cell does not trigger pandas' SettingWithCopyWarning.
df_merge = df_merge[price_zscores < 3].copy()
df_merge.info()

In [None]:
# Normalize PRICE with a min-max scaler.
# NOTE(review): the scaler is fitted on the whole series before split_data is
# called, so values from the test period influence the scaling — consider
# fitting on the training portion only.
scaler = MinMaxScaler()
# fit_transform returns an (n, 1) array; ravel() flattens it to a single column.
scaled_price = scaler.fit_transform(df_merge[['PRICE']]).ravel()
# assign() builds a new frame instead of writing into a filtered one, which
# avoids pandas' SettingWithCopyWarning.
df_merge = df_merge.assign(PRICE=scaled_price)

In [None]:
# Additive seasonal decomposition of the PRICE series.
# period=52 — presumably weekly observations with a yearly cycle; TODO confirm.
price_series = df_merge['PRICE']
decomposition = seasonal_decompose(
    price_series,
    model='additive',
    period=52,
)

# Draw the decomposition panels
decomposition.plot()
plt.show()

In [None]:
# Make the series stationary via the project helper, then reshape it into the
# two-column layout Prophet expects: 'ds' (timestamp) and 'y' (value).
df_stationary = make_stationary(df_merge)
prophet_columns = {'DATE': 'ds', 'PRICE': 'y'}
df_prophet = (
    df_stationary
    .reset_index()
    .rename(columns=prophet_columns)
)

In [None]:
# Split the Prophet-formatted frame into train and test portions.
# NOTE(review): split_data is a project helper; the split ratio and whether it
# keeps chronological order are not visible here — confirm.
train, test = split_data(df_prophet)

In [None]:
# Configure and fit the Prophet model on the training portion.
# NOTE(review): the decomposition cell uses period=52, which suggests weekly
# data with a yearly cycle; if so, weekly seasonality cannot be estimated from
# it and yearly seasonality may be the relevant component — confirm.
prophet_params = {
    'weekly_seasonality': True,
    'yearly_seasonality': False,
    'changepoint_prior_scale': 0.1,    # tunable: flexibility of the trend
    'seasonality_prior_scale': 10.0,   # tunable: strength of the seasonal terms
}
model = Prophet(**prophet_params)
model.fit(train)

In [None]:
# Forecast for the timestamps contained in the held-out test frame
forecast = model.predict(df=test)

In [None]:
# Evaluate the model: build aligned arrays of predictions and observations.
# Prophet orders the frame given to predict() by 'ds', so the observed values
# are put into the same order; otherwise an unsorted test set would pair each
# prediction with the wrong observation. A stable sort keeps ties in input order.
test_sorted = test.sort_values('ds', kind='mergesort')
predictions = forecast['yhat'].iloc[-len(test):].to_numpy()
true_values = test_sorted['y'].to_numpy()

In [None]:
# Compute the evaluation metrics (project helper) for observed vs. predicted values
results = calculate_metrics(true_values, predictions)

In [None]:
# Render the metrics as a two-column grid table and print it
metrics_table = tabulate(
    results,
    headers=["Metric", "Value"],
    tablefmt="grid",
)
print(metrics_table)

In [None]:
# Persist the fitted model locally under this name (also used for the upload step)
name_model = 'model_aceite_oliva'
save_model_local(model, name_model)

In [None]:
# Upload the model to S3
# Passes the model name used for the local save and the bucket the dataset was
# loaded from.
# NOTE(review): the object key/prefix is decided inside upload_model_s3 and is
# not visible here — confirm the destination.
upload_model_s3(name_model, bucket)