In [1]:
# Import the modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics


In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("Resources/gld_price_data.csv")
df_gld = pd.read_csv(file_path)

# Review the DataFrame
df_gld.head()

Unnamed: 0,Date,SPX,GLD,USO,SLV,EUR/USD
0,1/2/2008,1447.160034,84.860001,78.470001,15.18,1.471692
1,1/3/2008,1447.160034,85.57,78.370003,15.285,1.474491
2,1/4/2008,1411.630005,85.129997,77.309998,15.167,1.475492
3,1/7/2008,1416.180054,84.769997,75.5,15.053,1.468299
4,1/8/2008,1390.189941,86.779999,76.059998,15.59,1.557099


In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_gld["GLD"]

# Separate the X variable, the features
x = df_gld.drop(columns=["Date", "GLD"], axis=1)

In [4]:
# Review the y variable Series
print(y)

0        84.860001
1        85.570000
2        85.129997
3        84.769997
4        86.779999
           ...    
2285    124.589996
2286    124.330002
2287    125.180000
2288    124.489998
2289    122.543800
Name: GLD, Length: 2290, dtype: float64


In [5]:
# Review the X variable DataFrame
x.head()

Unnamed: 0,SPX,USO,SLV,EUR/USD
0,1447.160034,78.470001,15.18,1.471692
1,1447.160034,78.370003,15.285,1.474491
2,1411.630005,77.309998,15.167,1.475492
3,1416.180054,75.5,15.053,1.468299
4,1390.189941,76.059998,15.59,1.557099


In [6]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2290 entries, 0 to 2289
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SPX      2290 non-null   float64
 1   USO      2290 non-null   float64
 2   SLV      2290 non-null   float64
 3   EUR/USD  2290 non-null   float64
dtypes: float64(4)
memory usage: 71.7 KB


In [7]:
# Check the target values
y.value_counts()

115.940002    5
117.959999    4
122.209999    4
126.300003    4
124.589996    4
             ..
147.179993    1
146.240005    1
145.729996    1
148.339996    1
122.543800    1
Name: GLD, Length: 1930, dtype: int64

In [9]:
# Split the data using train_test_split
# Assign a random_state of 2 to the function
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    random_state=2, 
                                                    stratify=y)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
y

In [None]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
regressor = LogisticRegression(solver='lbfgs', random_state=1)

In [None]:
# Fit the model using training data
lr_model = regressor.fit(x_train,y_train)

In [None]:
# Make a prediction using the testing data
predictions = lr_model.predict(x_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
# Calculate the R-Squared error from the predicted value
error_score = metrics.r2_score(Y_test, test_data_prediction)
print("R squared error : ", error_score)

In [None]:
# Convert Y_test values into a list
y_test = list(y_test)

In [None]:
# Plotting values of actual prices versus the predicted prices 
plt.figure(figsize=(10, 6))
plt.plot(y_test, color='blue', label='Actual Value')
plt.plot(test_data_prediction, color='green', label='Predicted Value')
plt.title('Actual Price vs Predicted Price')
plt.xlabel('Number of values')
plt.ylabel('Gold Price')
plt.legend()
plt.show()

In [None]:
# Data Visualization - Interactive Time Series using plotly
fig = px.line(df_gld, x=df_gld.index, y='GLD', title='GLD Price Time Series')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='GLD Price')
fig.show()