# Section 1
### Import packages for data cleaning and download the price data from Yahoo Finance

In [22]:
import numpy as np
import pandas as pd
import yfinance as yf

In [24]:
# "^GSPC" is the Yahoo Finance symbol for the S&P 500 index.
ticker_symbol = "^GSPC"

# One-year window requested from Yahoo Finance. The info() output further down
# reports the last row as 2023-06-22, i.e. the end date itself is not returned.
start_date, end_date = "2022-06-23", "2023-06-23"

# Fetch the daily quotes for the window into a DataFrame.
data = yf.download(
    ticker_symbol,
    start=start_date,
    end=end_date,
)

# Preview the first rows of the download.
data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-23,3774.709961,3802.580078,3743.52002,3795.72998,3795.72998,5098640000
2022-06-24,3821.75,3913.649902,3821.75,3911.73999,3911.73999,8120260000
2022-06-27,3920.76001,3927.719971,3889.659912,3900.110107,3900.110107,4325310000
2022-06-28,3913.0,3945.860107,3820.139893,3821.550049,3821.550049,4270120000
2022-06-29,3825.090088,3836.5,3799.02002,3818.830078,3818.830078,4211240000


# Section 2
### Check for missing values or other inconsistencies in the dataset

In [25]:
# Summarise the index range, column dtypes and non-null counts so that missing
# values or unexpected dtypes are visible at a glance.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2022-06-23 to 2023-06-22
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       251 non-null    float64
 1   High       251 non-null    float64
 2   Low        251 non-null    float64
 3   Close      251 non-null    float64
 4   Adj Close  251 non-null    float64
 5   Volume     251 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 13.7 KB


#### Drop any duplicates, if present

In [26]:
# DataFrame.drop_duplicates() compares column values only and ignores the index,
# so with the dates held in the index it can neither detect a date that appears
# twice with different values nor tell apart two different days that happen to
# share identical values. For a date-indexed series a duplicate is a repeated
# date, so de-duplicate on the index and keep the first occurrence.
# .copy() gives an independent frame so the later in-place operations on `data`
# do not act on a slice of the original.
data = data.loc[~data.index.duplicated(keep="first")].copy()

#### Check for missing values and drop them if present

In [28]:
# Report when the frame is complete; otherwise remove every row that contains
# at least one missing value.
if not data.isna().to_numpy().any():
    print("No null values found in the data")
else:
    data.dropna(inplace=True)

No null values found in the data


# Section 3
### Ensure the index is in datetime format and sort the dataframe by date (newest first)

In [30]:
# Make sure the frame is indexed by dates. If the index is not already a
# DatetimeIndex, try to promote the 'Date' column to the index and parse it.
if not isinstance(data.index, pd.DatetimeIndex):
    try:
        data.set_index('Date', inplace=True)
        data.index = pd.to_datetime(data.index)
        print('Datetime index has been set.')
    except KeyError:
        # Raised by set_index when there is no 'Date' column to promote.
        print('Unable to set datetime index.')
else:
    print("Datetime index already set.")


Datetime index already set.


In [31]:
# Order the rows from the newest to the oldest trading day. The dates live in
# the index, so sort the index directly instead of going through
# sort_values(by='Date'), which only works while the index is named 'Date'.
data = data.sort_index(ascending=False)

In [32]:
# Preview the first rows after the descending sort; the most recent trading day
# should now be at the top.
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-06-22,4355.399902,4382.25,4351.819824,4381.890137,4381.890137,3511000000
2023-06-21,4380.009766,4386.220215,4360.140137,4365.689941,4365.689941,3709330000
2023-06-20,4396.109863,4400.149902,4367.189941,4388.709961,4388.709961,4055790000
2023-06-16,4440.950195,4448.470215,4407.439941,4409.589844,4409.589844,6848600000
2023-06-15,4365.330078,4439.200195,4362.600098,4425.839844,4425.839844,4176690000


# Export csv file after cleaning
#### Save cleaned dataset csv file as "^GSPC.csv" to "data" folder

In [34]:
# Write the cleaned frame to disk with the dates as the first column.
# NOTE(review): the destination is an absolute, machine-specific path; consider
# building it relative to the repository's data folder.
try:
    data.to_csv(
        '/Users/jotech/Milestone Repo/milestone-project-market-prediction-model/data/^GSPC.csv',
        # `index` is a boolean switch; the column header for the index is set
        # through `index_label`.
        index=True,
        index_label='Date',
    )
except OSError as exc:
    # Only file-system failures (missing folder, permissions, ...) are handled
    # here, and the reason is reported; any other error is left to surface.
    print(f'Unable to save data to csv file: {exc}')
else:
    print('Data saved to csv file.')

Data saved to csv file.
