In [0]:
pip install lightgbm

In [0]:
%pip install -i https://pypi.org/simple tensorflow

In [0]:
pip install keras

In [0]:
pip install auto_ts

In [0]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from numpy import log
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import statsmodels.api as sm
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
#from hyperopt import hp, fmin, tpe, Trials
from sklearn.metrics import mean_squared_error
from auto_ts import auto_timeseries 
#from autots import AutoTS

In [0]:
### Import dataset
# Acquire the session explicitly so this cell also runs on a fresh kernel where
# `spark` has not been pre-defined; getOrCreate() reuses an active session.
spark = SparkSession.builder.getOrCreate()
# Purchase-order table; later cells select msdyn_totalamount, createdon and
# cmhc_budgetallocationdate from it.
data = spark.table('crm_ah.msdyn_purchaseorder')

In [0]:
# Load the cmhc_fundingsource table (crm_shared_entity database) as a Spark DataFrame.
FS = spark.table(tableName='crm_shared_entity.cmhc_fundingsource')

In [0]:
cmhc_budgetallocationdate:timestamp

Entity relationships: program → (one-to-many) → client file (CRM file) → (one-to-one) → advance → (one-to-one) → disbursement

Task 1:
Predict the future number of purchase orders (POs) at daily frequency.
 

No2. task

In [0]:
# Get the Spark session (getOrCreate() returns the active one when it exists).
spark = SparkSession.builder.getOrCreate()

# Keep only the three columns used by the time-series analysis and collect
# them to the driver as a pandas DataFrame.
selected_data_spark = data.select(
    ['msdyn_totalamount', 'createdon', 'cmhc_budgetallocationdate']
)
ts_pandas = selected_data_spark.toPandas()

# Drop, in place, every row with a null in any of the selected columns.
ts_pandas.dropna(inplace=True)

In [0]:
# Show the dtypes as collected from Spark before any casting.
print(ts_pandas.dtypes)

# Cast the amount to float so it can be aggregated numerically.
ts_pandas['msdyn_totalamount'] = ts_pandas['msdyn_totalamount'].astype(float)

# NOTE: ts_pandas_d is an alias of ts_pandas (no copy is made), so the in-place
# set_index below also leaves ts_pandas itself indexed by createdon.
ts_pandas_d = ts_pandas
ts_pandas_d['createdon'] = pd.to_datetime(ts_pandas['createdon'])
ts_pandas_d.set_index('createdon', inplace=True)

# Sum only the numeric target per calendar day. Summing the whole frame would
# also try to sum the cmhc_budgetallocationdate timestamp column, which raises
# TypeError on pandas >= 2.0 (older versions silently dropped that column).
# Days without any purchase order get a total of 0.
df_daily = ts_pandas_d[['msdyn_totalamount']].resample('D').sum().reset_index()

In [0]:
# Widen matplotlib's default figure size, then draw ts_pandas with the pandas
# default plot.
from pylab import rcParams
rcParams.update({'figure.figsize': (15, 7)})
ts_pandas.plot()

In [0]:
# Decompose the daily totals into trend, seasonal and residual components
# using a 121-observation period.
decomposition = sm.tsa.seasonal_decompose(df_daily['msdyn_totalamount'], period=121)
trend = decomposition.trend
seasonal = decomposition.seasonal
residuals = decomposition.resid

# Draw the original series and the three components as four stacked panels.
plt.figure(figsize=(30,10))
panels = [
    (df_daily['msdyn_totalamount'], 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residuals, 'Residuals'),
]
for panel_number, (series, panel_label) in enumerate(panels, start=1):
    plt.subplot(4, 1, panel_number)
    plt.plot(series, label=panel_label)
    plt.legend(loc='upper left')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

pre program trends

In [0]:
# Augmented Dickey-Fuller test on the daily totals; the first element of the
# returned tuple is the test statistic.
result_d = adfuller(df_daily['msdyn_totalamount'])
print("Daily ADF Statistics : %f" % (result_d[0],))

In [0]:
# Autocorrelation and partial autocorrelation of the daily totals, up to lag 10.
plot_acf(df_daily['msdyn_totalamount'], lags=10)
plot_pacf(df_daily['msdyn_totalamount'], lags=10)

In [0]:
# Chronological split: the first 80% of the daily rows train the models and
# the remaining rows are held out for evaluation (adjust the ratio as needed).
train_size_d = int(0.8 * len(df_daily))
train_data_d = df_daily.iloc[:train_size_d]
test_data_d = df_daily.iloc[train_size_d:]

In [0]:
# Manual ARIMA baseline with a hand-picked order (p, d, q) = (6, 7, 6).
# NOTE(review): d=7 means the series is differenced seven times — confirm this
# order is what the diagnostics above were meant to suggest.
model_d = ARIMA(train_data_d['msdyn_totalamount'], order=(6, 7, 6)).fit()

# Forecast one value per held-out day and compare with the actual totals.
forecast_steps_d = len(test_data_d)
forecast_values_d = model_d.forecast(steps=forecast_steps_d)
actual_values_d = test_data_d['msdyn_totalamount']

# Root mean squared error over the test window.
rmse_d = np.sqrt(mean_squared_error(actual_values_d, forecast_values_d))
print("RMSE(D):", rmse_d)

In [0]:
# Automatic (non-seasonal) order selection with pmdarima.
import pmdarima as pm

# Candidate fits that error out are ignored and warnings are suppressed.
opt_model_d = pm.auto_arima(
    train_data_d['msdyn_totalamount'],
    seasonal=False,
    error_action='ignore',
    suppress_warnings=True,
)

# One forecast per held-out day.
opt_forecast_d = opt_model_d.predict(n_periods=len(test_data_d))

# RMSE on the test window (overwrites rmse_d from the manual ARIMA cell).
squared_errors_d = (test_data_d['msdyn_totalamount'] - opt_forecast_d) ** 2
rmse_d = np.sqrt(np.mean(squared_errors_d))

# Report the selected order and its error.
print("Best Daily Model: ARIMA", opt_model_d.order)
print("Best Daily RMSE:", rmse_d)

In [0]:
def objective(params):
    """Hyperopt objective: test-window MSE of an ARIMA(p, d, q) fit.

    Parameters
    ----------
    params : dict
        Sampled point with keys 'p', 'd' and 'q'. Values may be floats (as
        drawn by hp.quniform) and are cast to int before use.

    Returns
    -------
    float
        Mean squared error between test_data_d['msdyn_totalamount'] and the
        forecast, with negative forecast values floored at 0.
    """
    # quniform yields integer-valued floats; ARIMA orders must be ints.
    order = (int(params['p']), int(params['d']), int(params['q']))
    # Pass the sampled order to the model. Without it every trial would fit
    # the same default-order model and the search would be meaningless.
    model = ARIMA(train_data_d['msdyn_totalamount'], order=order)
    model_fit = model.fit()
    forecast = model_fit.forecast(steps=len(test_data_d))
    predictions = np.maximum(forecast, 0)
    mse = mean_squared_error(test_data_d['msdyn_totalamount'], predictions)
    return mse
# hp / fmin / tpe / Trials are hyperopt names; the import in the notebook's
# import cell is commented out, so bring them into scope here.
from hyperopt import hp, fmin, tpe, Trials

# Search space: p and q in 1..20, d in 1..5, each drawn in steps of 1.
space = {
    'p': hp.quniform('p', 1, 20, 1),
    'd': hp.quniform('d', 1, 5, 1),
    'q': hp.quniform('q', 1, 20, 1)
}

# Run 100 TPE-guided evaluations of `objective`, recording them in `trials`.
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# Cast the best orders to int before they are handed to ARIMA.
best_p = int(best['p'])
best_d = int(best['d'])
best_q = int(best['q'])
In [0]:
# Refit ARIMA on the training series with the orders chosen by the search.
best_order = (best_p, best_d, best_q)
optimal_model = ARIMA(train_data_d['msdyn_totalamount'], order=best_order)
optimal_model_fit = optimal_model.fit()

# Forecast across the test window; negative forecasts are floored at 0.
forecast = optimal_model_fit.forecast(steps=len(test_data_d))
predictions = np.maximum(forecast, 0)

# Score the forecast against the held-out totals.
mse = mean_squared_error(test_data_d["msdyn_totalamount"], predictions)
rmse = np.sqrt(mse)

# Report the fitted model and its test RMSE.
print("Optimal ARIMA Model:")
print(optimal_model_fit.summary())
print("RMSE:", rmse)

In [0]:
# Display the held-out rows of df_daily (everything after the first 80%) that
# the ARIMA forecasts are scored against.
test_data_d

In [0]:
# Build the 30 calendar days that follow the last observed day.
last_date_d = test_data_d['createdon'].iloc[-1]
future_dates = pd.date_range(start=last_date_d, periods=30, freq='D')
future_dates = future_dates.shift(1, freq='D')
n_forecast_steps = len(future_dates)

# optimal_model_fit was trained on train_data_d only, so its out-of-sample
# forecast starts right after the training window. Forecast through the whole
# test window plus the future horizon and keep only the final horizon steps.
# (predict() without start/end returns in-sample predictions, not a forecast.)
total_steps = len(test_data_d) + n_forecast_steps
n_forecast = np.asarray(optimal_model_fit.forecast(steps=total_steps))
# Amount forecasts are floored at 0, as in the evaluation cells.
n_forecast = np.maximum(n_forecast, 0)
n_forecast = n_forecast[-n_forecast_steps:]

# n_forecast is a plain array, so it pairs positionally with future_dates.
forecast_data = pd.DataFrame({'createdon': future_dates, 'total disbursement amount': n_forecast})
forecast_data.reset_index(drop=True, inplace=True)
print(forecast_data)

In [0]:
from statistics import NormalDist

# Stitch the test-window forecast and the future forecast into one series.
# NOTE(review): the test-window part comes from the auto_arima model
# (opt_forecast_d) while the future part comes from forecast_data — confirm
# that mixing the two models in one line is intended.
merged_dates_d = pd.concat([test_data_d['createdon'], forecast_data['createdon']], ignore_index=True)
merged_forecast_d = pd.concat([opt_forecast_d, forecast_data['total disbursement amount']], ignore_index=True)

# Approximate interval for the future forecast: treat the forecast error as
# normal with standard deviation equal to the test RMSE, and use the two-sided
# normal quantile (about 1.645 for 90%) rather than the confidence level itself.
confidence_level = 0.90
z_score = NormalDist().inv_cdf(0.5 + confidence_level / 2)
future_forecast_d = forecast_data['total disbursement amount']
# The forecasts themselves are floored at 0, so floor the lower bound as well.
lower_bound_d = np.maximum(future_forecast_d - z_score * rmse, 0)
upper_bound_d = future_forecast_d + z_score * rmse

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_data_d['createdon'], test_data_d['msdyn_totalamount'], label='Actual')
ax.plot(merged_dates_d, merged_forecast_d, label='Forecast')
# Actually draw the band announced by the title.
ax.fill_between(
    forecast_data['createdon'],
    lower_bound_d,
    upper_bound_d,
    alpha=0.2,
    label=f'{confidence_level:.0%} interval',
)
ax.set_xlabel('Time')
ax.set_ylabel('Total Disbursement Amount')
ax.set_title('Forecast with Confidence Interval')
ax.legend()
plt.show()

In [0]:
# Column roles used by the automated forecasting cells.
timestamp_column, target_column = 'createdon', 'msdyn_totalamount'

# The series is sampled daily.
frequency = 'D'

# Chronological 80/20 split of the daily totals (adjust the ratio as needed).
train_size = int(0.8 * len(df_daily))
train_data = df_daily.iloc[:train_size]
test_data = df_daily.iloc[train_size:]

# Test-window dates followed by the future forecast dates, on a fresh index.
merged_dates_d = pd.concat(
    [test_data['createdon'], forecast_data['createdon']],
    ignore_index=True,
)

In [0]:
# Specify the column names for timestamp and target variable
timestamp_column = 'createdon'
target_column = 'msdyn_totalamount'

# The series is sampled daily
frequency = 'D'

# Split the dataset into train and test sets
train_size = int(len(df_daily) * 0.8)  # 80% for training, adjust as needed
train_data = df_daily[:train_size]
test_data = df_daily[train_size:]

# Configure auto_ts (auto_timeseries) to search for the best forecasting model
model = auto_timeseries(
    score_type='rmse',  # Specify the evaluation metric for model selection (optional)
    forecast_period=len(test_data),  # Number of steps to forecast (same as test data length)
    time_interval=frequency,  # Daily frequency ('D')
    non_seasonal_pdq=None,  # Specify the non-seasonal order (p,d,q) of the ARIMA model (optional)
    seasonal_PDQ=None,  # Specify the seasonal order (P,D,Q,s) of the SARIMA model (optional)
    model_type='best',  # Specify the model type or use 'best' for automatic model selection
    verbose=2  # Set verbosity level
)

# Fit the model to the training data
model.fit(traindata=train_data, ts_column=timestamp_column, target=target_column)

# Get the summary of the optimal model
summary = model.get_leaderboard()

# Print the summary
print(summary)

# Generate forecasts for the test data. The held-out frame is passed here so
# the forecast matches forecast_period=len(test_data) above; forecast_data
# holds the 30 future dates of the ARIMA cell, not the test window.
forecast = model.predict(testdata=test_data)

In [0]:
# Print `forecast`, which the preceding cell assigns from model.predict(...).
print(forecast)

In [0]:
# NOTE(review): no active import of AutoTS is visible in this notebook (the
# `from autots import AutoTS` line in the import cell is commented out) —
# confirm where the name comes from before relying on this cell.
# Specify the column names for timestamp and target variable
# (assigned here but not passed to AutoTS(...) or fit(...) below)
timestamp_column = 'createdon'
target_column = 'msdyn_totalamount'

# Split the dataset into train and test sets
train_size = int(len(df_daily) * 0.8)  # 80% for training, adjust as needed
train_data = df_daily[:train_size]
test_data = df_daily[train_size:]
frequency = 'D'

# Run the AutoTS model to automatically generate time series forecasts
model = AutoTS(
    forecast_length=len(test_data),  # Number of steps to forecast (same as test data length)
    frequency='D',  # Set the frequency of the time series data
    prediction_interval=0.95,
    ensemble=None,
    models_mode='deep',
    model_list = 'univariate',# or ['ARIMA','ETS']
    max_generations=10,
    num_validations=3,
    no_negatives=True,
    n_jobs='auto'
)

# Fit the AutoTS model to the training data
model.fit(train_data)

# Get the summary of the optimal model
# NOTE(review): get_leaderboard() and predict(testdata=...) are the same calls
# used on the auto_timeseries model in the earlier cell — verify that the
# AutoTS object exposes them.
summary = model.get_leaderboard()

# Print the summary
print(summary)

# Generate forecasts for the test data
test_df = model.predict(testdata=test_data)

# Print the forecasted values
print(test_df)

In [0]:
import tensorflow as tensorflow
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Work on a copy so df_daily keeps the raw amounts for the other cells.
lstm_data = df_daily[['createdon', 'msdyn_totalamount']].copy()

# Split the dataset into train and test sets
train_size = int(len(lstm_data) * 0.8)  # 80% for training, adjust as needed
train_data = lstm_data.iloc[:train_size].copy()
test_data = lstm_data.iloc[train_size:].copy()

# Preprocess the data: fit the scaler on the training rows only (so the test
# window does not leak into the scaling) and store the result in the
# 'amount_scaled' column that the cells below read.
scaler = MinMaxScaler()
train_data['amount_scaled'] = scaler.fit_transform(
    train_data['msdyn_totalamount'].to_numpy().reshape(-1, 1)
).ravel()
test_data['amount_scaled'] = scaler.transform(
    test_data['msdyn_totalamount'].to_numpy().reshape(-1, 1)
).ravel()

# Keras cannot consume datetime64 values, so encode each date as the number of
# days since the first training day, divided by the training span in days.
# NOTE(review): a time index as the only input gives the LSTM no sequence
# context; lagged amounts would be the usual formulation — confirm the intent.
date_origin = train_data['createdon'].min()
date_span_days = max((train_data['createdon'].max() - date_origin).days, 1)


def _encode_dates(dates):
    """Return the dates as a float32 array shaped (n, 1, 1) for the LSTM input."""
    elapsed_days = (dates - date_origin).dt.days.to_numpy(dtype='float32')
    return (elapsed_days / date_span_days).reshape(-1, 1, 1)


# Define the LSTM model
model = Sequential()
model.add(LSTM(units=50, activation='relu', input_shape=(1, 1)))
model.add(Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error')

# Prepare the training data
X_train = _encode_dates(train_data['createdon'])
y_train = train_data['amount_scaled'].to_numpy()

# Train the LSTM model
model.fit(X_train, y_train, epochs=10, batch_size=1)

# Prepare the test data
X_test = _encode_dates(test_data['createdon'])
y_test = test_data['amount_scaled'].to_numpy()

# Generate predictions on the test data
predictions_scaled = model.predict(X_test)

# Inverse scale the predictions back to amounts
predictions = scaler.inverse_transform(predictions_scaled)

# Create a DataFrame for the predictions
forecast_df = pd.DataFrame({'createdon': test_data['createdon'], 'amount': predictions.flatten()})

# Print the forecasted values
print(forecast_df)