In [4]:
import yfinance as yf
import pandas as pd

tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")
tslaData = tslaData[['Open', 'Close', 'Volume']]
tslaData.columns = ['TSLA_Open', 'TSLA_Close', 'TSLA_Volume']

aapl = yf.Ticker("AAPL")
aaplData = aapl.history(period="max")
aaplData = aaplData[['Open', 'Close', 'Volume']]
aaplData.columns = ['AAPL_Open', 'AAPL_Close', 'AAPL_Volume']

allData = tslaData.join(aaplData, how='outer')
allData.head()

Unnamed: 0_level_0,TSLA_Open,TSLA_Close,TSLA_Volume,AAPL_Open,AAPL_Close,AAPL_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-12-12,,,,0.101261,0.101261,469033600
1980-12-15,,,,0.096418,0.095978,175884800
1980-12-16,,,,0.089374,0.088934,105728000
1980-12-17,,,,0.091135,0.091135,86441600
1980-12-18,,,,0.093777,0.093777,73449600


In [5]:
allData = allData.drop(['TSLA_Open', 'AAPL_Open'], axis=1) # inplace, axis = 1 is column, axis = 0 is row

In [9]:
allData.dropna(inplace=True) # how=any, all, axis, thresh, inplace
allData

Unnamed: 0_level_0,TSLA_Close,TSLA_Volume,AAPL_Close,AAPL_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-29,4.778000,93831500.0,7.905947,1133344800
2010-06-30,4.766000,85935500.0,7.762749,739452000
2010-07-01,4.392000,41094000.0,7.668618,1022896000
2010-07-02,3.840000,25699000.0,7.621089,693842800
2010-07-06,3.222000,34334500.0,7.673245,615235600
...,...,...,...,...
2020-10-28,406.019989,25451400.0,111.199997,143937800
2020-10-29,410.829987,22655300.0,115.320000,146129200
2020-10-30,388.040009,42511300.0,108.860001,190272600
2020-11-02,400.510010,29021100.0,108.769997,122866900


In [10]:
allData["TSLA_Volume"].describe()  # automatic stat on teh df

count    2.607000e+03
mean     3.091239e+07
std      2.850968e+07
min      5.925000e+05
25%      1.084375e+07
50%      2.440200e+07
75%      3.998410e+07
max      3.046940e+08
Name: TSLA_Volume, dtype: float64

In [11]:
print(f'Count: {allData["TSLA_Volume"].count():,}')
print(f'Average: {allData["TSLA_Volume"].mean():,}')
print(f'Min: {allData["TSLA_Volume"].min():,}')
print(f'Max: {allData["TSLA_Volume"].max():,}')
print(f'5%: {allData["TSLA_Volume"].quantile(0.05):,}')
print(f'95%: {allData["TSLA_Volume"].quantile(0.95):,}')

Count: 2,607
Average: 30,912,392.40506329
Min: 592,500.0
Max: 304,694,000.0
5%: 3,238,200.0
95%: 88,921,200.0


In [12]:
# Remove volume outliers

volMin = allData["TSLA_Volume"].quantile(0.05)
volMax = allData["TSLA_Volume"].quantile(0.95)
allDataNoOutliers = allData[allData.TSLA_Volume.between(volMin, volMax)]

In [13]:
print(f'Count: {allDataNoOutliers["TSLA_Volume"].count():,}')
print(f'Average: {allDataNoOutliers["TSLA_Volume"].mean():,}')
print(f'Min: {allDataNoOutliers["TSLA_Volume"].min():,}')
print(f'Max: {allDataNoOutliers["TSLA_Volume"].max():,}')
print(f'5%: {allDataNoOutliers["TSLA_Volume"].quantile(0.05):,}')
print(f'95%: {allDataNoOutliers["TSLA_Volume"].quantile(0.95):,}')

Count: 2,345
Average: 27,648,836.119402986
Min: 3,248,000.0
Max: 88,892,500.0
5%: 4,257,900.0
95%: 67,591,999.99999999


# Winsorization

In [16]:
# Clip

volMin = allData["TSLA_Volume"].quantile(0.05)
volMax = allData["TSLA_Volume"].quantile(0.95)

allDataClipped = allData.copy()
allDataClipped['TSLA_Volume'] = allDataClipped['TSLA_Volume'].clip(volMin, volMax)  # Take anything below min and floor at min, anything above max, celling at max
allDataClipped

Unnamed: 0_level_0,TSLA_Close,TSLA_Volume,AAPL_Close,AAPL_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-29,4.778000,88921200.0,7.905947,1133344800
2010-06-30,4.766000,85935500.0,7.762749,739452000
2010-07-01,4.392000,41094000.0,7.668618,1022896000
2010-07-02,3.840000,25699000.0,7.621089,693842800
2010-07-06,3.222000,34334500.0,7.673245,615235600
...,...,...,...,...
2020-10-28,406.019989,25451400.0,111.199997,143937800
2020-10-29,410.829987,22655300.0,115.320000,146129200
2020-10-30,388.040009,42511300.0,108.860001,190272600
2020-11-02,400.510010,29021100.0,108.769997,122866900


In [17]:
print(f'Count: {allDataClipped["TSLA_Volume"].count():,}')
print(f'Average: {allDataClipped["TSLA_Volume"].mean():,}')
print(f'Min: {allDataClipped["TSLA_Volume"].min():,}')
print(f'Max: {allDataClipped["TSLA_Volume"].max():,}')
print(f'5%: {allDataClipped["TSLA_Volume"].quantile(0.05):,}')
print(f'95%: {allDataClipped["TSLA_Volume"].quantile(0.95):,}')

Count: 2,607
Average: 29,501,113.195243575
Min: 3,238,200.0
Max: 88,921,200.0
5%: 3,241,140.0
95%: 88,912,590.0


In [18]:
# Winsorize

import scipy.stats.mstats

allDataWinsorized = allData.copy()
allDataWinsorized["TSLA_Volume"] = scipy.stats.mstats.winsorize(allDataWinsorized["TSLA_Volume"], [.05, .05])

In [19]:
print(f'Count: {allDataWinsorized["TSLA_Volume"].count():,}')
print(f'Average: {allDataWinsorized["TSLA_Volume"].mean():,}')
print(f'Min: {allDataWinsorized["TSLA_Volume"].min():,}')
print(f'Max: {allDataWinsorized["TSLA_Volume"].max():,}')
print(f'5%: {allDataWinsorized["TSLA_Volume"].quantile(0.0):,}')
print(f'95%: {allDataWinsorized["TSLA_Volume"].quantile(0.95):,}')

Count: 2,607
Average: 29,501,520.214806292
Min: 3,234,000.0
Max: 88,933,500.0
5%: 3,234,000.0
95%: 88,921,200.0


#### Imputing

In [20]:
import yfinance as yf
import pandas as pd

tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")
tslaData['Ticker'] = 'TSLA'

aapl = yf.Ticker("AAPL")
aaplData = aapl.history(period="max")
aaplData['Ticker'] = 'AAPL'

allData = pd.concat([tslaData, aaplData], axis=0)  #stack
allData.iloc[-2,-1] = 'aapl'
allData.iloc[-3,-1] = ' AAPL '
allData = allData.append(allData.iloc[-1,:])
allData = allData.append(allData.iloc[-5,:])
allData.iloc[-1,-1] = 'MSFT'

In [21]:
allData.iloc[-100, -4] = None
allData.iloc[50, -4] = None

In [22]:
allData.iloc[-101:-98]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-06-16,87.70686,88.141082,86.024895,87.86158,165428800.0,0.0,0.0,AAPL
2020-06-17,88.627701,88.690089,87.614529,87.739304,,0.0,0.0,AAPL
2020-06-18,87.694388,88.203473,87.147873,87.774246,96820400.0,0.0,0.0,AAPL


In [23]:
allData.iloc[48:52]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-09-07,4.122,4.2,4.1,4.108,1217000.0,0.0,0.0,TSLA
2010-09-08,4.132,4.19,4.12,4.18,1442000.0,0.0,0.0,TSLA
2010-09-09,4.2,4.21,4.138,4.142,,0.0,0.0,TSLA
2010-09-10,4.15,4.186,3.952,4.034,1933000.0,0.0,0.0,TSLA


In [266]:
#allData['Volume'].fillna(allData['Volume'].mean(), inplace=True) # Will include mean from both tickers!
#allData['Volume'].fillna(allData.groupby('Ticker')['Volume'].transform('mean'), inplace=True) # Better
#allData['Volume'].fillna(allData.groupby('Ticker')['Volume'].rolling(5,min_periods=1).mean().reset_index()['Volume'], inplace=True) # Rolling

In [26]:
allData['Volume'].interpolate(inplace=True)  # linear interpolate
allData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500.0,0.0,0.0,TSLA
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500.0,0.0,0.0,TSLA
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000.0,0.0,0.0,TSLA
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000.0,0.0,0.0,TSLA
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500.0,0.0,0.0,TSLA
...,...,...,...,...,...,...,...,...
2020-10-30,111.059998,111.989998,107.720001,108.860001,190272600.0,0.0,0.0,AAPL
2020-11-02,109.110001,110.680000,107.320000,108.769997,122866900.0,0.0,0.0,aapl
2020-11-03,109.660004,111.489998,108.730003,110.440002,107020000.0,0.0,0.0,AAPL
2020-11-03,109.660004,111.489998,108.730003,110.440002,107020000.0,0.0,0.0,AAPL


#### String cleaning

In [27]:
import yfinance as yf
import pandas as pd

tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")
tslaData['Ticker'] = 'TSLA'

aapl = yf.Ticker("AAPL")
aaplData = aapl.history(period="max")
aaplData['Ticker'] = 'AAPL'

allData = pd.concat([tslaData, aaplData], axis=0)
allData.iloc[-2,-1] = 'aapl'
allData.iloc[-3,-1] = ' AAPL '
allData = allData.append(allData.iloc[-1,:])
allData = allData.append(allData.iloc[-5,:])
allData.iloc[-1,-1] = 'MSFT'

In [28]:
allData.Ticker.unique()

array(['TSLA', 'AAPL', ' AAPL ', 'aapl', 'MSFT'], dtype=object)

In [31]:
allData['Ticker'] = allData['Ticker'].apply(lambda t:t.upper().strip())  # make everything upper case and remove space
allData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0.0,0.0,TSLA
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0.0,0.0,TSLA
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0.0,0.0,TSLA
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0.0,0.0,TSLA
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0.0,0.0,TSLA
...,...,...,...,...,...,...,...,...
2020-10-30,111.059998,111.989998,107.720001,108.860001,190272600,0.0,0.0,AAPL
2020-11-02,109.110001,110.680000,107.320000,108.769997,122866900,0.0,0.0,AAPL
2020-11-03,109.660004,111.489998,108.730003,110.440002,107020000,0.0,0.0,AAPL
2020-11-03,109.660004,111.489998,108.730003,110.440002,107020000,0.0,0.0,AAPL


In [32]:
allData.Ticker.unique()

array(['TSLA', 'AAPL', 'MSFT'], dtype=object)

In [33]:
allData = allData[allData.Ticker.isin(['AAPL', 'TSLA'])]

In [34]:
allData.Ticker.unique()

array(['TSLA', 'AAPL'], dtype=object)