# Gold Price Prediction

## Data Preprocessing

In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
df = pd.read_csv("gold_price.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SP_open,SP_high,SP_low,...,GDX_Low,GDX_Close,GDX_Adj Close,GDX_Volume,USO_Open,USO_High,USO_Low,USO_Close,USO_Adj Close,USO_Volume
0,2011-12-15,154.740005,154.949997,151.710007,152.330002,152.330002,21521900,123.029999,123.199997,121.989998,...,51.57,51.68,48.973877,20605600,36.900002,36.939999,36.049999,36.130001,36.130001,12616700
1,2011-12-16,154.309998,155.369995,153.899994,155.229996,155.229996,18124300,122.230003,122.949997,121.300003,...,52.040001,52.68,49.921513,16285400,36.18,36.5,35.73,36.27,36.27,12578800
2,2011-12-19,155.479996,155.860001,154.360001,154.869995,154.869995,12547200,122.059998,122.32,120.029999,...,51.029999,51.169998,48.490578,15120200,36.389999,36.450001,35.93,36.200001,36.200001,7418200
3,2011-12-20,156.820007,157.429993,156.580002,156.979996,156.979996,9136300,122.18,124.139999,120.370003,...,52.369999,52.990002,50.215282,11644900,37.299999,37.610001,37.220001,37.560001,37.560001,10041600
4,2011-12-21,156.979996,157.529999,156.130005,157.160004,157.160004,11996100,123.93,124.360001,122.75,...,52.419998,52.959999,50.186852,8724300,37.669998,38.240002,37.52,38.110001,38.110001,10728000


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1718 entries, 0 to 1717
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           1718 non-null   object 
 1   Open           1718 non-null   float64
 2   High           1718 non-null   float64
 3   Low            1718 non-null   float64
 4   Close          1718 non-null   float64
 5   Adj Close      1718 non-null   float64
 6   Volume         1718 non-null   int64  
 7   SP_open        1718 non-null   float64
 8   SP_high        1718 non-null   float64
 9   SP_low         1718 non-null   float64
 10  SP_close       1718 non-null   float64
 11  SP_Ajclose     1718 non-null   float64
 12  SP_volume      1718 non-null   int64  
 13  DJ_open        1718 non-null   float64
 14  DJ_high        1718 non-null   float64
 15  DJ_low         1718 non-null   float64
 16  DJ_close       1718 non-null   float64
 17  DJ_Ajclose     1718 non-null   float64
 18  DJ_volum

In [93]:
df.quantile([0, 0.25, 0.5, 0.75, 1], numeric_only=True).T

Unnamed: 0,0.00,0.25,0.50,0.75,1.00
Open,1.009200e+02,1.162200e+02,1.219150e+02,1.284275e+02,1.732000e+02
High,1.009900e+02,1.165400e+02,1.223250e+02,1.290875e+02,1.740700e+02
Low,1.002300e+02,1.157400e+02,1.213700e+02,1.278400e+02,1.729200e+02
Close,1.005000e+02,1.160525e+02,1.217950e+02,1.284700e+02,1.736100e+02
Adj Close,1.005000e+02,1.160525e+02,1.217950e+02,1.284700e+02,1.736100e+02
...,...,...,...,...,...
USO_High,8.030000e+00,1.150000e+01,1.663500e+01,3.466750e+01,4.230000e+01
USO_Low,7.670000e+00,1.130000e+01,1.604000e+01,3.411000e+01,4.130000e+01
USO_Close,7.960000e+00,1.139250e+01,1.634500e+01,3.441750e+01,4.201000e+01
USO_Adj Close,7.960000e+00,1.139250e+01,1.634500e+01,3.441750e+01,4.201000e+01


### Removing all outliers using IQR formula

In [94]:
def remove_outliers_iqr(df):
    """
    Removes outliers from all numeric columns of the DataFrame
    using the IQR (Interquartile Range) method.
    """
    df_clean = df.copy()
    numeric_cols = df_clean.select_dtypes(include='number').columns

    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    return df_clean


In [95]:
df1 = remove_outliers_iqr(df)
df1.shape

(715, 81)

In [96]:
df1.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SP_open,SP_high,SP_low,...,GDX_Low,GDX_Close,GDX_Adj Close,GDX_Volume,USO_Open,USO_High,USO_Low,USO_Close,USO_Adj Close,USO_Volume
359,2013-06-17,133.889999,134.059998,133.520004,133.770004,133.770004,4072300,164.289993,165.220001,163.220001,...,27.950001,28.190001,27.06736,8644600,34.860001,34.939999,34.599998,34.759998,34.759998,4296100
367,2013-07-02,121.339996,121.470001,119.779999,120.050003,120.050003,10550300,161.119995,162.300003,160.5,...,23.459999,23.75,22.804176,34482700,34.900002,35.360001,34.860001,35.209999,35.209999,8762200
368,2013-07-03,120.699997,121.769997,120.540001,120.739998,120.739998,6750300,160.479996,161.770004,160.220001,...,23.76,24.18,23.217056,21909200,35.93,36.18,35.599998,35.84,35.84,12203200
369,2013-07-05,118.080002,118.18,116.739998,118.089996,118.089996,11592500,162.470001,163.080002,161.300003,...,22.790001,23.42,22.48732,43649000,36.049999,36.59,35.939999,36.560001,36.560001,7227600
370,2013-07-08,119.089996,119.660004,118.93,119.510002,119.510002,8652200,163.860001,164.389999,163.080002,...,22.9,22.9,21.988029,21770700,36.290001,36.669998,36.220001,36.41,36.41,4212300


### Splitting Date object to Year, Month, Day columns

In [97]:
df1["Date"] = pd.to_datetime(df1["Date"])
df1.Date.describe()

count                              715
mean     2015-12-06 08:19:28.111888128
min                2013-06-17 00:00:00
25%                2014-07-08 12:00:00
50%                2016-02-29 00:00:00
75%                2017-05-05 12:00:00
max                2018-11-30 00:00:00
Name: Date, dtype: object

In [98]:
df1["year"] = df1["Date"].dt.year

In [99]:
df1["month"] = df1["Date"].dt.month
df1["day"] = df1["Date"].dt.day

## Preparing Model

### Splitting x and y

In [100]:
y = df1["USO_Adj Close"]
x = df1.drop(["USO_Adj Close", "Date", "USO_Close"], axis=1)

In [101]:
y

359     34.759998
367     35.209999
368     35.840000
369     36.560001
370     36.410000
          ...    
1614    14.280000
1636    14.710000
1655    15.520000
1677    13.790000
1698    10.730000
Name: USO_Adj Close, Length: 715, dtype: float64

### Scaling Data

In [102]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled= scaler.fit_transform(x)

In [103]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2)

### Finding the best Model

In [104]:
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
            
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        },
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [105]:
find_best_model_using_gridsearchcv(x_scaled, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.99996,{}
1,lasso,0.944105,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.99924,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


### Linear Regression Model

In [106]:
model = LinearRegression()
model.fit(x_train, y_train)

In [107]:
model.score(x_test, y_test)

0.9999624752529412

### Decision Tree Model

In [108]:
model2 = DecisionTreeRegressor()
model2.fit(x_train, y_train)

In [109]:
model2.score(x_test, y_test)

0.9987275623339258