# Stock Market Data from the yfinance API
* stocks from 1970 through 2018
* The S&P 500 (^GSPC), or just the S&P, is a stock market index that measures the stock performance of 500 large companies listed on stock exchanges in the United States.
* Remove incomplete rows
* Deal with error-prone columns
* Drop un-needed columns
* Change casing
* save to csv

In [1]:
from pandas_datareader import data as pdr
import pandas as pd
import yfinance as yf

#### Pulling S&P 500 stock market index

In [2]:
# Let yfinance take over pandas_datareader's Yahoo download function.
yf.pdr_override()

# Ticker and date window for the S&P 500 index history we want.
SYMBOL = "^GSPC"
START_DATE = "1970-01-01"
END_DATE = "2019-01-01"

# Fetch the price history into a DataFrame.
df = pdr.get_data_yahoo(SYMBOL, start=START_DATE, end=END_DATE)

[*********************100%***********************]  1 of 1 completed


#### length of dataset

In [3]:
# Number of non-null values in each column of the downloaded data.
df.count()

Open         12361
High         12361
Low          12361
Close        12361
Adj Close    12361
Volume       12361
dtype: int64

#### Dropping any rows with missing values

In [4]:
# Discard every row that has at least one missing value, then re-count
# the non-null entries per column to confirm the result.
df = df.dropna(axis=0, how='any')
df.count()

Open         12361
High         12361
Low          12361
Close        12361
Adj Close    12361
Volume       12361
dtype: int64

#### Looking at columns

In [5]:
# List the column labels currently present in the frame.
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

#### Resetting the index

In [6]:
# Turn the Date index into an ordinary column and fall back to a
# default integer index.
df = df.reset_index(drop=False)
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1970-01-02,92.059998,93.540001,91.790001,93.0,93.0,8050000
1,1970-01-05,93.0,94.25,92.529999,93.459999,93.459999,11490000
2,1970-01-06,93.459999,93.809998,92.129997,92.82,92.82,11460000
3,1970-01-07,92.82,93.379997,91.93,92.629997,92.629997,10010000
4,1970-01-08,92.629997,93.470001,91.989998,92.68,92.68,10670000


#### Checking data types

In [7]:
# Inspect each column's dtype; 'Date' must be datetime-like for the .dt accessor used next.
df.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object

#### Using .dt to extract year only

In [8]:
# Replace each full timestamp in 'Date' with just its calendar year.
df = df.assign(Date=df['Date'].dt.year)
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1970,92.059998,93.540001,91.790001,93.0,93.0,8050000
1,1970,93.0,94.25,92.529999,93.459999,93.459999,11490000
2,1970,93.459999,93.809998,92.129997,92.82,92.82,11460000
3,1970,92.82,93.379997,91.93,92.629997,92.629997,10010000
4,1970,92.629997,93.470001,91.989998,92.68,92.68,10670000


#### Group by year and compute the mean

In [9]:
# Average every column within each year.
# Grouping by the Series itself (not the column label) keeps 'Date' as a
# regular column too, so its per-year mean appears alongside the prices.
group_df = df.groupby(df['Date']).mean()
group_df.head(5)

Unnamed: 0_level_0,Date,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1970,1970.0,83.152047,84.011457,82.292126,83.154134,83.154134,11601300.0
1971,1971.0,98.306877,99.071897,97.565138,98.31502,98.31502,15394230.0
1972,1972.0,109.071036,109.952789,108.27996,109.134821,109.134821,16483550.0
1973,1973.0,107.514167,108.554087,106.354048,107.438413,107.438413,16102300.0
1974,1974.0,82.886324,83.879367,81.795534,82.780909,82.780909,13903560.0


#### Checking row counts after grouping

In [10]:
# Non-null count per column of the grouped frame (one row per year).
group_df.count()

Date         49
Open         49
High         49
Low          49
Close        49
Adj Close    49
Volume       49
dtype: int64

#### Drop unwanted column

In [11]:
# The averaged 'Date' column duplicates the year already held in the
# index, so remove it.
group_df = group_df.drop(labels='Date', axis='columns')
group_df.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1970,83.152047,84.011457,82.292126,83.154134,83.154134,11601300.0
1971,98.306877,99.071897,97.565138,98.31502,98.31502,15394230.0
1972,109.071036,109.952789,108.27996,109.134821,109.134821,16483550.0
1973,107.514167,108.554087,106.354048,107.438413,107.438413,16102300.0
1974,82.886324,83.879367,81.795534,82.780909,82.780909,13903560.0


#### Move the Date index back into a column

In [12]:
# Move the 'Date' index level back into a regular column.
group_df = group_df.reset_index(level='Date')
group_df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1970,83.152047,84.011457,82.292126,83.154134,83.154134,11601300.0
1,1971,98.306877,99.071897,97.565138,98.31502,98.31502,15394230.0
2,1972,109.071036,109.952789,108.27996,109.134821,109.134821,16483550.0
3,1973,107.514167,108.554087,106.354048,107.438413,107.438413,16102300.0
4,1974,82.886324,83.879367,81.795534,82.780909,82.780909,13903560.0


#### lowercase/rename columns

In [13]:
# Map the original column labels to lowercase snake_case names;
# 'Date' now holds a year, so it is renamed accordingly.
renamed_columns = {
    'Date': 'year',
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Adj Close': 'adj_close',
    'Volume': 'volume',
}
group_df = group_df.rename(columns=renamed_columns)
group_df.head(5)

Unnamed: 0,year,open,high,low,close,adj_close,volume
0,1970,83.152047,84.011457,82.292126,83.154134,83.154134,11601300.0
1,1971,98.306877,99.071897,97.565138,98.31502,98.31502,15394230.0
2,1972,109.071036,109.952789,108.27996,109.134821,109.134821,16483550.0
3,1973,107.514167,108.554087,106.354048,107.438413,107.438413,16102300.0
4,1974,82.886324,83.879367,81.795534,82.780909,82.780909,13903560.0


#### Create new df that only has year

In [14]:
# Build a one-column DataFrame holding only the years.
# Selecting with a list of labels returns a DataFrame directly, so no
# intermediate Series or pd.DataFrame() conversion is needed.
year_df = group_df[['year']]
year_df.head()

Unnamed: 0,year
0,1970
1,1971
2,1972
3,1973
4,1974


#### Exporting to a csv file

In [15]:
from pathlib import Path

# Create the output folder first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_dir = Path('../data_transformed')
output_dir.mkdir(parents=True, exist_ok=True)

# The index is written as the first (unnamed) CSV column, as before.
year_df.to_csv(output_dir / 'year.csv')
group_df.to_csv(output_dir / 'stock.csv')