# Finding data: S&P 500

## Required libraries

In [1]:
import yfinance as yf
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

## Read the data

In [2]:
nq = yf.Ticker("NQ=F")
nq_10y = nq.history(period="10y") #Return a pandas dataframe of daily info
display(nq_10y)

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-02-18 00:00:00-05:00,3660.00,3684.75,3654.00,3675.00,220169,0.0,0.0
2014-02-19 00:00:00-05:00,3676.00,3679.50,3645.25,3654.50,277354,0.0,0.0
2014-02-20 00:00:00-05:00,3654.50,3677.00,3638.25,3671.50,269725,0.0,0.0
2014-02-21 00:00:00-05:00,3671.50,3686.75,3660.75,3664.50,186798,0.0,0.0
2014-02-24 00:00:00-05:00,3665.75,3701.50,3655.00,3688.00,206372,0.0,0.0
...,...,...,...,...,...,...,...
2024-02-09 00:00:00-05:00,17854.25,18071.00,17852.25,18039.25,499282,0.0,0.0
2024-02-12 00:00:00-05:00,18040.00,18121.50,17911.25,17965.00,538132,0.0,0.0
2024-02-13 00:00:00-05:00,17928.50,17963.25,17542.00,17676.75,858230,0.0,0.0
2024-02-14 00:00:00-05:00,17703.00,17887.00,17669.25,17881.00,858230,0.0,0.0


In [3]:
df_nq = nq_10y.copy()
df_nq.reset_index(inplace=True)

### Describing and renaming columns

In [4]:
df_nq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype                           
---  ------        --------------  -----                           
 0   Date          2517 non-null   datetime64[ns, America/New_York]
 1   Open          2517 non-null   float64                         
 2   High          2517 non-null   float64                         
 3   Low           2517 non-null   float64                         
 4   Close         2517 non-null   float64                         
 5   Volume        2517 non-null   int64                           
 6   Dividends     2517 non-null   float64                         
 7   Stock Splits  2517 non-null   float64                         
dtypes: datetime64[ns, America/New_York](1), float64(6), int64(1)
memory usage: 157.4 KB


***Description of columns***
- **Date:** Date
- **Open:** Opening price of the Nasdaq index on a given day.
- **High:** Highest price of the Nasdaq index during the trading day.
- **Low:** Lowest price of the Nasdaq index during the trading day.
- **Close:** Closing price of the Nasdaq index on a given day.
- **Volume:** Trading volume, representing the total number of shares traded on a given day.
- **Dividends:** Dividends paid on the Nasdaq index on a given day.
- **Stock Splits:** Number of stock splits that occurred on the Nasdaq index on a given day.

In [5]:
df_nq.rename(columns={'Date':'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'vol', 'Dividends':'divs', 'Stock Splits':'stock_splits'}, inplace=True)
df_nq['date'] = pd.to_datetime(df_nq['date'])
df_nq['date'] = df_nq['date'].dt.strftime('%Y-%m-%d')
df_nq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          2517 non-null   object 
 1   open          2517 non-null   float64
 2   high          2517 non-null   float64
 3   low           2517 non-null   float64
 4   close         2517 non-null   float64
 5   vol           2517 non-null   int64  
 6   divs          2517 non-null   float64
 7   stock_splits  2517 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 157.4+ KB


Columns have been renamed for easing the use

## Understanding and examining data

In [6]:
df_nq.head()

Unnamed: 0,date,open,high,low,close,vol,divs,stock_splits
0,2014-02-18,3660.0,3684.75,3654.0,3675.0,220169,0.0,0.0
1,2014-02-19,3676.0,3679.5,3645.25,3654.5,277354,0.0,0.0
2,2014-02-20,3654.5,3677.0,3638.25,3671.5,269725,0.0,0.0
3,2014-02-21,3671.5,3686.75,3660.75,3664.5,186798,0.0,0.0
4,2014-02-24,3665.75,3701.5,3655.0,3688.0,206372,0.0,0.0


In [7]:
df_nq.shape

(2517, 8)

In [8]:
df_nq.dtypes

date             object
open            float64
high            float64
low             float64
close           float64
vol               int64
divs            float64
stock_splits    float64
dtype: object

In [9]:
df_nq.describe()

Unnamed: 0,open,high,low,close,vol,divs,stock_splits
count,2517.0,2517.0,2517.0,2517.0,2517.0,2517.0,2517.0
mean,8678.984605,8759.525336,8595.622169,8684.632411,424343.9,0.0,0.0
std,4121.057998,4163.632121,4076.25022,4122.478972,217775.5,0.0,0.0
min,3435.75,3485.0,3404.75,3444.0,0.0,0.0,0.0
25%,4737.5,4779.25,4707.25,4743.75,239635.0,0.0,0.0
50%,7400.0,7465.0,7350.75,7405.25,395579.0,0.0,0.0
75%,12487.0,12637.25,12312.5,12492.75,584743.0,0.0,0.0
max,18040.0,18121.5,17911.25,18039.25,1580041.0,0.0,0.0


In [10]:
for i in df_nq.columns:
  print(i, len(df_nq[i].unique()))
#print(df.nunique())

date 2517
open 2424
high 2414
low 2436
close 2440
vol 2507
divs 1
stock_splits 1


'divs' and 'stock_splits' columns only have one value each, so they are being removed.

In [11]:
df_nq.drop(columns=['divs','stock_splits'])
df_nq.head()

Unnamed: 0,date,open,high,low,close,vol,divs,stock_splits
0,2014-02-18,3660.0,3684.75,3654.0,3675.0,220169,0.0,0.0
1,2014-02-19,3676.0,3679.5,3645.25,3654.5,277354,0.0,0.0
2,2014-02-20,3654.5,3677.0,3638.25,3671.5,269725,0.0,0.0
3,2014-02-21,3671.5,3686.75,3660.75,3664.5,186798,0.0,0.0
4,2014-02-24,3665.75,3701.5,3655.0,3688.0,206372,0.0,0.0


In [12]:
df_nq.to_csv('data/nasdaq.csv', index=False)