# Tabular Data

### Creating a DF

In [1]:
import pandas as pd
df = pd.DataFrame({'Col1':[1,2,3,4], 'Col2':[5, 6, 7, 8]})  # Pass in a dict
df

Unnamed: 0,Col1,Col2
0,1,5
1,2,6
2,3,7
3,4,8


In [2]:
import pandas as pd
import numpy as np
arr = np.array([[1,2,3], [4,5,6]])  # ndArray
df = pd.DataFrame(arr)
df

df.columns = ['a', 'b', 'c']  # Name the column
df
df = pd.DataFrame(arr, columns=['a', 'b', 'c'])  
df
df = pd.DataFrame(arr, index=['z', 'x'], columns=['a', 'b', 'c'])  # Name indices
df

Unnamed: 0,a,b,c
z,1,2,3
x,4,5,6


In [3]:
# Renaming
df.columns = ['q', 'r', 's']  # Pass in the list, has to be the correct size
df

Unnamed: 0,q,r,s
z,1,2,3
x,4,5,6


### Importing a DF

In [4]:
import yfinance as yf
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")
tslaData.head()  # Top 5 rows

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.8,5.0,3.508,4.778,93831500,0,0.0
2010-06-30,5.158,6.084,4.66,4.766,85935500,0,0.0
2010-07-01,5.0,5.184,4.054,4.392,41094000,0,0.0
2010-07-02,4.6,4.62,3.742,3.84,25699000,0,0.0
2010-07-06,4.0,4.0,3.166,3.222,34334500,0,0.0


In [5]:
tslaData.shape  # number of rows and columns

(2593, 7)

In [6]:
tslaData.index

DatetimeIndex(['2010-06-29', '2010-06-30', '2010-07-01', '2010-07-02',
               '2010-07-06', '2010-07-07', '2010-07-08', '2010-07-09',
               '2010-07-12', '2010-07-13',
               ...
               '2020-10-01', '2020-10-02', '2020-10-05', '2020-10-06',
               '2020-10-07', '2020-10-08', '2020-10-09', '2020-10-12',
               '2020-10-13', '2020-10-14'],
              dtype='datetime64[ns]', name='Date', length=2593, freq=None)

In [8]:
tslaData.reset_index()  # Date becomes a column and index has row number

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...,...
2588,2020-10-08,438.440002,439.000000,425.299988,425.920013,40421100,0,0.0
2589,2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


In [10]:
tslaData.axes

[DatetimeIndex(['2010-06-29', '2010-06-30', '2010-07-01', '2010-07-02',
                '2010-07-06', '2010-07-07', '2010-07-08', '2010-07-09',
                '2010-07-12', '2010-07-13',
                ...
                '2020-10-01', '2020-10-02', '2020-10-05', '2020-10-06',
                '2020-10-07', '2020-10-08', '2020-10-09', '2020-10-12',
                '2020-10-13', '2020-10-14'],
               dtype='datetime64[ns]', name='Date', length=2593, freq=None),
 Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')]

In [11]:
display(tslaData)
print('\n\n')
display(tslaData.to_numpy())  # convert it to an ndarray

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-08,438.440002,439.000000,425.299988,425.920013,40421100,0,0.0
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0







array([[3.79999995e+00, 5.00000000e+00, 3.50799990e+00, ...,
        9.38315000e+07, 0.00000000e+00, 0.00000000e+00],
       [5.15799999e+00, 6.08400011e+00, 4.65999985e+00, ...,
        8.59355000e+07, 0.00000000e+00, 0.00000000e+00],
       [5.00000000e+00, 5.18400002e+00, 4.05399990e+00, ...,
        4.10940000e+07, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.42000000e+02, 4.48739990e+02, 4.38579987e+02, ...,
        3.87911000e+07, 0.00000000e+00, 0.00000000e+00],
       [4.43350006e+02, 4.48890015e+02, 4.36600006e+02, ...,
        3.44637000e+07, 0.00000000e+00, 0.00000000e+00],
       [4.49779999e+02, 4.65899994e+02, 4.47350006e+02, ...,
        4.78797000e+07, 0.00000000e+00, 0.00000000e+00]])

In [12]:
tslaData.describe()  # Aggregate columns with default statistics

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,2593.0,2593.0,2593.0,2593.0,2593.0,2593.0,2593.0
mean,51.100132,52.233443,49.940083,51.15181,31279780.0,0.0,0.001928
std,63.754763,65.749044,61.716913,63.953478,28974760.0,0.0,0.09819
min,3.228,3.326,2.996,3.16,592500.0,0.0,0.0
25%,7.19,7.318,7.008,7.2,10605500.0,0.0,0.0
50%,44.108002,44.759998,43.377998,43.998001,24467000.0,0.0,0.0
75%,59.543999,60.647999,58.380001,59.492001,40545000.0,0.0,0.0
max,502.140015,502.48999,470.51001,498.320007,304694000.0,0.0,5.0


In [14]:
#tslaData.sort_values(by='Volume', ascending=False, kind='mergesort', axis=0)
tslaData.sort_values(by='2010-06-30', ascending=False, kind='mergesort', axis=1)  # sort the dataframe, axis=0 -> operate on row

Unnamed: 0_level_0,Volume,High,Open,Close,Low,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,93831500,5.000000,3.800000,4.778000,3.508000,0,0.0
2010-06-30,85935500,6.084000,5.158000,4.766000,4.660000,0,0.0
2010-07-01,41094000,5.184000,5.000000,4.392000,4.054000,0,0.0
2010-07-02,25699000,4.620000,4.600000,3.840000,3.742000,0,0.0
2010-07-06,34334500,4.000000,4.000000,3.222000,3.166000,0,0.0
...,...,...,...,...,...,...,...
2020-10-08,40421100,439.000000,438.440002,425.920013,425.299988,0,0.0
2020-10-09,28925700,434.589996,430.130005,434.000000,426.459991,0,0.0
2020-10-12,38791100,448.739990,442.000000,442.299988,438.579987,0,0.0
2020-10-13,34463700,448.890015,443.350006,446.649994,436.600006,0,0.0


In [16]:
tslaData.isna()  # whether or not there's a null value
tslaData.dropna()  # drop rows that has null value, useful for data cleaning

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-08,438.440002,439.000000,425.299988,425.920013,40421100,0,0.0
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


In [20]:
#dct = tslaData.to_dict()
dct = tslaData.to_dict(orient='list')  # convert dataframe to dictionary
#display(dct)
pd.DataFrame.from_dict(dct)  # reload the dict originally from the df and recreate the df

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2588,438.440002,439.000000,425.299988,425.920013,40421100,0,0.0
2589,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


### Slicing

In [21]:
tslaData = tslaData.reset_index()

In [22]:
# Slicing
tslaData[2:5]



Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
2,2010-07-01,5.0,5.184,4.054,4.392,41094000,0,0.0
3,2010-07-02,4.6,4.62,3.742,3.84,25699000,0,0.0
4,2010-07-06,4.0,4.0,3.166,3.222,34334500,0,0.0


In [23]:
tslaData[-2:]  # data from the last 2 row

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
2592,2020-10-14,449.779999,465.899994,447.350006,461.299988,47879700,0,0.0


In [24]:
tslaData['Open'][2]  # give 2nd value from the 'Open' column

5.0

In [25]:
tslaData['Open'][0] 

3.799999952316284

In [26]:
tslaData[['Open', 'Close']]  #see multiple columns

Unnamed: 0,Open,Close
0,3.800000,4.778000
1,5.158000,4.766000
2,5.000000,4.392000
3,4.600000,3.840000
4,4.000000,3.222000
...,...,...
2588,438.440002,425.920013
2589,430.130005,434.000000
2590,442.000000,442.299988
2591,443.350006,446.649994


In [27]:
tslaData.loc[2:5,'Close']  # see row 2:5 from 'Close' column

2    4.392
3    3.840
4    3.222
5    3.160
Name: Close, dtype: float64

In [29]:
tslaData.iloc[:,3]  # similar to loc but require a numeric index, i.e. 3 instead of 'Close'

0         3.508000
1         4.660000
2         4.054000
3         3.742000
4         3.166000
           ...    
2588    425.299988
2589    426.459991
2590    438.579987
2591    436.600006
2592    447.350006
Name: Low, Length: 2593, dtype: float64

### Filtering

In [30]:
len(tslaData)  # how many rows

2593

In [31]:
tslaData[tslaData.Volume > 1e7]  # only see row where the volume exceed 10^7

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...,...
2588,2020-10-08,438.440002,439.000000,425.299988,425.920013,40421100,0,0.0
2589,2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


In [32]:
tslaData[(tslaData.Volume > 1e7)&(tslaData.Open < tslaData.Close)]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
6,2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0
9,2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0
10,2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0
13,2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0
...,...,...,...,...,...,...,...,...
2587,2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0
2589,2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


In [34]:
tslaData.loc[lambda x: x.index % 2 == 0]  # slice using a callable (lambda)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
6,2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0
8,2010-07-12,3.590000,3.614000,3.400000,3.410000,11012500,0,0.0
...,...,...,...,...,...,...,...,...
2584,2020-10-02,421.390015,439.130005,415.000000,415.089996,71430000,0,0.0
2586,2020-10-06,423.790009,428.779999,406.049988,413.980011,49146300,0,0.0
2588,2020-10-08,438.440002,439.000000,425.299988,425.920013,40421100,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0


In [None]:
import pandas as pd
import yfinance as yf

## Data Manipulation

In [None]:
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [None]:
display(tslaData)

display(msftData)

In [None]:
melted = tslaData.reset_index().melt(id_vars=['Date'], var_name="Attribute", value_name="Value")
melted

In [None]:
pivoted = melted.pivot(index='Date', columns='Attribute')
pivoted = pivoted['Value'].reset_index()
pivoted.columns.name = None
pivoted

In [None]:
tslaData['Ticker'] = 'TSLA'
msftData['Ticker'] = 'MSFT'
allData = pd.concat([tslaData, msftData], axis=0)
allData

In [None]:
import numpy as np
allData.pivot_table(index = ['Ticker'], columns=[], values=['Open', 'Close'], aggfunc=np.std) # also np.mean and others

In [None]:
allData.groupby(by=['Date', 'Ticker']).sum() # Also see as_index=False

In [None]:
# Custom aggregation
from scipy import stats
# We want to use stats.sem (standard error). Can also create your own function
allData.groupby(by=['Ticker']).agg({'Open': stats.sem, 'Close':np.sum}) # Can customize by column as well

### Joins


In [None]:
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [None]:
tslaData = tslaData[tslaData.Open<tslaData.Close]
msftData = msftData[msftData.Open<msftData.Close]

In [None]:
msftData.join(tslaData, how='outer', rsuffix='_tsla', lsuffix='_msft') # left, right, outer, inner

In [None]:
msftData.reset_index().join(tslaData, on='Date', how='right', lsuffix='_tsla', rsuffix='_msft') # left, right, outer, inner

In [None]:
tslaData.merge(msftData, how='left', left_on='Date', right_on='Date', suffixes=['_tsla', '_msft'])

In [None]:
pd.merge_asof(msftData, tslaData, left_on='Date', right_on='Date', suffixes=['_msft', '_tsla'], direction='nearest')

### MultiIndex

In [None]:
tickers = yf.Tickers('msft aapl goog tsla')
hist = tickers.history(group_by="ticker")

In [None]:
hist.head()

In [None]:
hist.columns

In [None]:
hist.shape

In [None]:
hist

In [None]:
hist.stack(level=0).reset_index().rename(columns = {'level_1':'Ticker'})

# Rename column level_1

In [None]:
hist.stack(level=0).unstack()

## Transformations

In [1]:
import pandas as pd
import yfinance as yf
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [3]:
# Calculated columns
msftData['Daily Gain %'] = 100*(msftData['Close']-msftData['Open'])/msftData['Open'] # Approach 1 (preferred generally)
msftData = msftData.assign(Daily_Gain=lambda x:x['Close']-x['Open']) # Approach 2

msftData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1986-03-13,0.06,0.06,0.06,0.06,1031788800,0.0,0.0,0.000000,0.00
1986-03-14,0.06,0.07,0.06,0.06,308160000,0.0,0.0,0.000000,0.00
1986-03-17,0.06,0.07,0.06,0.07,133171200,0.0,0.0,16.666667,0.01
1986-03-18,0.07,0.07,0.06,0.06,67766400,0.0,0.0,-14.285714,-0.01
1986-03-19,0.06,0.06,0.06,0.06,47894400,0.0,0.0,0.000000,0.00
...,...,...,...,...,...,...,...,...,...
2020-07-09,216.33,216.38,211.47,214.32,33121700,0.0,0.0,-0.929136,-2.01
2020-07-10,213.62,214.08,211.08,213.67,26177600,0.0,0.0,0.023406,0.05
2020-07-13,214.48,215.80,206.50,207.07,38135600,0.0,0.0,-3.454868,-7.41
2020-07-14,206.13,208.85,202.03,208.35,37591800,0.0,0.0,1.076990,2.22


In [4]:
tslaData.apply(lambda x:x*1e-6)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,0.000019,0.000025,0.000018,0.000024,18.7663,0.0,0.0
2010-06-30,0.000026,0.000030,0.000023,0.000024,17.1871,0.0,0.0
2010-07-01,0.000025,0.000026,0.000020,0.000022,8.2188,0.0,0.0
2010-07-02,0.000023,0.000023,0.000019,0.000019,5.1398,0.0,0.0
2010-07-06,0.000020,0.000020,0.000016,0.000016,6.8669,0.0,0.0
...,...,...,...,...,...,...,...
2020-07-09,0.001397,0.001409,0.001351,0.001394,11.7176,0.0,0.0
2020-07-10,0.001396,0.001549,0.001376,0.001545,23.3376,0.0,0.0
2020-07-13,0.001659,0.001795,0.001471,0.001497,38.9854,0.0,0.0
2020-07-14,0.001556,0.001590,0.001431,0.001517,23.4181,0.0,0.0


In [5]:
tslaData['Volume'] = tslaData['Volume'].apply(lambda x:x*1e-6)
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,19.00,25.00,17.54,23.89,18.7663,0,0
2010-06-30,25.79,30.42,23.30,23.83,17.1871,0,0
2010-07-01,25.00,25.92,20.27,21.96,8.2188,0,0
2010-07-02,23.00,23.10,18.71,19.20,5.1398,0,0
2010-07-06,20.00,20.00,15.83,16.11,6.8669,0,0
...,...,...,...,...,...,...,...
2020-07-09,1396.99,1408.56,1351.28,1394.28,11.7176,0,0
2020-07-10,1396.00,1548.92,1376.01,1544.65,23.3376,0,0
2020-07-13,1659.00,1794.99,1471.11,1497.06,38.9854,0,0
2020-07-14,1556.00,1590.00,1431.00,1516.80,23.4181,0,0


In [6]:
msftData['Volume'] = msftData['Volume'].transform(lambda x:x*1e-6)
msftData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1986-03-13,0.06,0.06,0.06,0.06,1031.7888,0.0,0.0,0.000000,0.00
1986-03-14,0.06,0.07,0.06,0.06,308.1600,0.0,0.0,0.000000,0.00
1986-03-17,0.06,0.07,0.06,0.07,133.1712,0.0,0.0,16.666667,0.01
1986-03-18,0.07,0.07,0.06,0.06,67.7664,0.0,0.0,-14.285714,-0.01
1986-03-19,0.06,0.06,0.06,0.06,47.8944,0.0,0.0,0.000000,0.00
...,...,...,...,...,...,...,...,...,...
2020-07-09,216.33,216.38,211.47,214.32,33.1217,0.0,0.0,-0.929136,-2.01
2020-07-10,213.62,214.08,211.08,213.67,26.1776,0.0,0.0,0.023406,0.05
2020-07-13,214.48,215.80,206.50,207.07,38.1356,0.0,0.0,-3.454868,-7.41
2020-07-14,206.13,208.85,202.03,208.35,37.5918,0.0,0.0,1.076990,2.22


In [8]:
tslaData['Ticker'] = 'TSLA'
msftData['Ticker'] = 'MSFT'
allData = pd.concat([tslaData, msftData], axis=0)


In [13]:
grouped = allData.groupby(['Ticker']).agg({'Volume': np.sum})
grouped = allData.groupby(['Ticker']).transform(np.sum)
grouped

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-06-29,541838.91,552727.33,530569.19,542099.36,15423.1615,0.00,0.0,0.000000,0.00
2010-06-30,541838.91,552727.33,530569.19,542099.36,15423.1615,0.00,0.0,0.000000,0.00
2010-07-01,541838.91,552727.33,530569.19,542099.36,15423.1615,0.00,0.0,0.000000,0.00
2010-07-02,541838.91,552727.33,530569.19,542099.36,15423.1615,0.00,0.0,0.000000,0.00
2010-07-06,541838.91,552727.33,530569.19,542099.36,15423.1615,0.00,0.0,0.000000,0.00
...,...,...,...,...,...,...,...,...,...
2020-07-09,221517.93,223850.39,219131.64,221573.97,521137.3700,18.64,17.0,636.946054,56.04
2020-07-10,221517.93,223850.39,219131.64,221573.97,521137.3700,18.64,17.0,636.946054,56.04
2020-07-13,221517.93,223850.39,219131.64,221573.97,521137.3700,18.64,17.0,636.946054,56.04
2020-07-14,221517.93,223850.39,219131.64,221573.97,521137.3700,18.64,17.0,636.946054,56.04


In [None]:
#allData['TickerAvgVolume'] = allData.groupby(['Ticker']).agg({'Volume': 'sum'}) # doesnt work
allData.join(allData.groupby(['Ticker']).agg({'Volume': 'mean'}).rename(columns={'Volume':'TickerAvgVolume'}), on='Ticker')

In [16]:
allData.join(allData.groupby(['Ticker']).agg({'Volume': 'mean'}).rename(columns={'Volume':'TickerAvgVolume'}), on='Ticker')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,19.00,25.00,17.54,23.89,18.7663,0.0,0.0,TSLA,,,6.098522
2010-06-30,25.79,30.42,23.30,23.83,17.1871,0.0,0.0,TSLA,,,6.098522
2010-07-01,25.00,25.92,20.27,21.96,8.2188,0.0,0.0,TSLA,,,6.098522
2010-07-02,23.00,23.10,18.71,19.20,5.1398,0.0,0.0,TSLA,,,6.098522
2010-07-06,20.00,20.00,15.83,16.11,6.8669,0.0,0.0,TSLA,,,6.098522
...,...,...,...,...,...,...,...,...,...,...,...
2020-07-09,216.33,216.38,211.47,214.32,33.1217,0.0,0.0,MSFT,-0.929136,-2.01,60.205334
2020-07-10,213.62,214.08,211.08,213.67,26.1776,0.0,0.0,MSFT,0.023406,0.05,60.205334
2020-07-13,214.48,215.80,206.50,207.07,38.1356,0.0,0.0,MSFT,-3.454868,-7.41,60.205334
2020-07-14,206.13,208.85,202.03,208.35,37.5918,0.0,0.0,MSFT,1.076990,2.22,60.205334


In [18]:
allData.groupby(['Ticker']).transform('mean')[['Volume']] # vs agg

Unnamed: 0_level_0,Volume
Date,Unnamed: 1_level_1
2010-06-29,6.098522
2010-06-30,6.098522
2010-07-01,6.098522
2010-07-02,6.098522
2010-07-06,6.098522
...,...
2020-07-09,60.205334
2020-07-10,60.205334
2020-07-13,60.205334
2020-07-14,60.205334


In [None]:
# Cleaner!
allData['TickerAvgVolume'] = allData.groupby(['Ticker']).transform('mean')[['Volume']].rename(columns={'Volume':'TickerAvgVolume'})
allData

### Time Series

#### Timestamps

In [19]:
# Using datetime
import datetime
import pytz

dateToConvert = datetime.datetime(2020,7,1,15,31,24) # This is a 'naive' time without timezone
dateToConvert

datetime.datetime(2020, 7, 1, 15, 31, 24)

In [20]:
dateToConvert.astimezone(pytz.UTC) # Set to UTC, Time stays the same
#dateToConvert.astimezone(pytz.timezone('US/Eastern')) # Set to Eastern, Time stays the same

datetime.datetime(2020, 7, 1, 19, 31, 24, tzinfo=<UTC>)

In [23]:
pytz.timezone('US/Eastern').localize(dateToConvert)

datetime.datetime(2020, 7, 1, 15, 31, 24, tzinfo=<UTC>)

In [25]:
#dateToConvert = dateToConvert.astimezone(pytz.UTC)
pytz.timezone('America/Chicago').localize(dateToConvert).astimezone(pytz.UTC) # America/Chicago

datetime.datetime(2020, 7, 1, 20, 31, 24, tzinfo=<UTC>)

In [26]:
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2018-01-01')

def random_dates(start, end, n=10):

    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

dates = random_dates(start, end, 1000000)
pyDates = list(dates.to_pydatetime())


In [None]:
pyDates

In [None]:
from timer import Timer

with Timer('ConvertDatesPy') as t:
    res = [pytz.timezone('US/Eastern').localize(r).astimezone(pytz.UTC) for r in pyDates]

with Timer('ConvertDatesNP') as t:
    res = dates.tz_localize('US/Eastern', nonexistent='shift_forward', ambiguous='NaT').tz_convert('UTC')

In [None]:
dates

In [None]:
res

In [31]:
# Add/Subtract business day
pd.to_datetime('2020.01.03') + pd.offsets.BDay()

Timestamp('2020-01-06 00:00:00')

In [33]:
# Date ranges
pd.date_range('1/1/2010', periods=10, freq=2 * pd.offsets.BDay())

DatetimeIndex(['2010-01-01', '2010-01-05', '2010-01-07', '2010-01-11',
               '2010-01-13', '2010-01-15', '2010-01-19', '2010-01-21',
               '2010-01-25', '2010-01-27'],
              dtype='datetime64[ns]', freq='2B')

In [None]:
#asfreq

In [34]:
dayMinuteBars = tsla.history(period='2d', interval='1m')

In [35]:
dayMinuteBars

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-14 09:30:00-04:00,1542.91,1556.97,1529.00,1529.00,1488159,0,0
2020-07-14 09:31:00-04:00,1534.86,1534.99,1526.25,1527.95,80603,0,0
2020-07-14 09:32:00-04:00,1530.00,1548.95,1526.54,1546.00,314374,0,0
2020-07-14 09:34:00-04:00,1558.10,1563.94,1555.18,1562.32,389478,0,0
2020-07-14 09:35:00-04:00,1562.25,1577.00,1560.71,1574.06,162266,0,0
...,...,...,...,...,...,...,...
2020-07-15 15:55:00-04:00,1542.45,1546.66,1542.45,1546.25,71227,0,0
2020-07-15 15:56:00-04:00,1546.38,1546.57,1543.29,1543.29,67381,0,0
2020-07-15 15:57:00-04:00,1543.46,1544.64,1542.08,1544.00,60059,0,0
2020-07-15 15:58:00-04:00,1543.75,1545.58,1543.20,1545.24,79103,0,0


In [36]:
def vwap(bars): 
    return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)

vwap(dayMinuteBars[dayMinuteBars.index.date==pd.to_datetime('2020-07-15')])

1503.87

In [41]:
dayMinuteBars.eval(
        'wgtd = Close * Volume', inplace=False
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / Volume')

Datetime
2020-07-14 09:30:00-04:00    1529.000000
2020-07-14 09:31:00-04:00    1528.946051
2020-07-14 09:32:00-04:00    1531.793067
2020-07-14 09:34:00-04:00    1537.024738
2020-07-14 09:35:00-04:00    1539.492853
                                ...     
2020-07-15 15:55:00-04:00    1502.951659
2020-07-15 15:56:00-04:00    1503.126145
2020-07-15 15:57:00-04:00    1503.283130
2020-07-15 15:58:00-04:00    1503.494303
2020-07-15 15:59:00-04:00    1503.866655
Length: 775, dtype: float64

In [43]:
vwap = dayMinuteBars.assign(
    vwap=dayMinuteBars.eval(
        'wgtd = Close * Volume', inplace=False
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / Volume')
).groupby(dayMinuteBars.index.date).agg({'vwap':lambda x: x.iloc[-1]})
vwap

Unnamed: 0,vwap
2020-07-14,1516.923739
2020-07-15,1503.866655


In [44]:
dayMinuteBars

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-14 09:30:00-04:00,1542.91,1556.97,1529.00,1529.00,1488159,0,0
2020-07-14 09:31:00-04:00,1534.86,1534.99,1526.25,1527.95,80603,0,0
2020-07-14 09:32:00-04:00,1530.00,1548.95,1526.54,1546.00,314374,0,0
2020-07-14 09:34:00-04:00,1558.10,1563.94,1555.18,1562.32,389478,0,0
2020-07-14 09:35:00-04:00,1562.25,1577.00,1560.71,1574.06,162266,0,0
...,...,...,...,...,...,...,...
2020-07-15 15:55:00-04:00,1542.45,1546.66,1542.45,1546.25,71227,0,0
2020-07-15 15:56:00-04:00,1546.38,1546.57,1543.29,1543.29,67381,0,0
2020-07-15 15:57:00-04:00,1543.46,1544.64,1542.08,1544.00,60059,0,0
2020-07-15 15:58:00-04:00,1543.75,1545.58,1543.20,1545.24,79103,0,0


In [48]:
dayMinuteBars.index.to_series().diff().shift(-1)/ np.timedelta64(1, 'm')

Datetime
2020-07-14 09:30:00-04:00    1.0
2020-07-14 09:31:00-04:00    1.0
2020-07-14 09:32:00-04:00    2.0
2020-07-14 09:34:00-04:00    1.0
2020-07-14 09:35:00-04:00    1.0
                            ... 
2020-07-15 15:55:00-04:00    1.0
2020-07-15 15:56:00-04:00    1.0
2020-07-15 15:57:00-04:00    1.0
2020-07-15 15:58:00-04:00    1.0
2020-07-15 15:59:00-04:00    NaN
Name: Datetime, Length: 775, dtype: float64

In [49]:
dayMinuteBars['TimeDiff'] = dayMinuteBars.index.to_series().diff().shift(-1)/ np.timedelta64(1, 's')
dayMinuteBars

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,TimeDiff
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-07-14 09:30:00-04:00,1542.91,1556.97,1529.00,1529.00,1488159,0,0,60.0
2020-07-14 09:31:00-04:00,1534.86,1534.99,1526.25,1527.95,80603,0,0,60.0
2020-07-14 09:32:00-04:00,1530.00,1548.95,1526.54,1546.00,314374,0,0,120.0
2020-07-14 09:34:00-04:00,1558.10,1563.94,1555.18,1562.32,389478,0,0,60.0
2020-07-14 09:35:00-04:00,1562.25,1577.00,1560.71,1574.06,162266,0,0,60.0
...,...,...,...,...,...,...,...,...
2020-07-15 15:55:00-04:00,1542.45,1546.66,1542.45,1546.25,71227,0,0,60.0
2020-07-15 15:56:00-04:00,1546.38,1546.57,1543.29,1543.29,67381,0,0,60.0
2020-07-15 15:57:00-04:00,1543.46,1544.64,1542.08,1544.00,60059,0,0,60.0
2020-07-15 15:58:00-04:00,1543.75,1545.58,1543.20,1545.24,79103,0,0,60.0


In [50]:
twap = dayMinuteBars.assign(
    twap=dayMinuteBars.eval(
        'wgtd = Close * TimeDiff', inplace=False
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / TimeDiff')
).groupby(dayMinuteBars.index.date).agg({'twap':lambda x: x.iloc[-1]})
twap

Unnamed: 0,twap
2020-07-14,1514.839396
2020-07-15,


#### Functionality

In [51]:
tslaData.cumsum()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,19.00,25.00,17.54,23.89,18.7663,0,0,TSLA
2010-06-30,44.79,55.42,40.84,47.72,35.9534,0,0,TSLATSLA
2010-07-01,69.79,81.34,61.11,69.68,44.1722,0,0,TSLATSLATSLA
2010-07-02,92.79,104.44,79.82,88.88,49.3120,0,0,TSLATSLATSLATSLA
2010-07-06,112.79,124.44,95.65,104.99,56.1789,0,0,TSLATSLATSLATSLATSLA
...,...,...,...,...,...,...,...,...
2020-07-09,535684.91,546243.42,524834.07,535994.84,15321.1562,0,0,TSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATS...
2020-07-10,537080.91,547792.34,526210.08,537539.49,15344.4938,0,0,TSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATS...
2020-07-13,538739.91,549587.33,527681.19,539036.55,15383.4792,0,0,TSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATS...
2020-07-14,540295.91,551177.33,529112.19,540553.35,15406.8973,0,0,TSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATSLATS...


In [52]:
tslaData['Volume'].cumsum()

Date
2010-06-29       18.7663
2010-06-30       35.9534
2010-07-01       44.1722
2010-07-02       49.3120
2010-07-06       56.1789
                 ...    
2020-07-09    15321.1562
2020-07-10    15344.4938
2020-07-13    15383.4792
2020-07-14    15406.8973
2020-07-15    15423.1615
Name: Volume, Length: 2529, dtype: float64

In [53]:
tslaData['CumVolume'] = tslaData['Volume'].cumsum()
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,CumVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-06-29,19.00,25.00,17.54,23.89,18.7663,0,0,TSLA,18.7663
2010-06-30,25.79,30.42,23.30,23.83,17.1871,0,0,TSLA,35.9534
2010-07-01,25.00,25.92,20.27,21.96,8.2188,0,0,TSLA,44.1722
2010-07-02,23.00,23.10,18.71,19.20,5.1398,0,0,TSLA,49.3120
2010-07-06,20.00,20.00,15.83,16.11,6.8669,0,0,TSLA,56.1789
...,...,...,...,...,...,...,...,...,...
2020-07-09,1396.99,1408.56,1351.28,1394.28,11.7176,0,0,TSLA,15321.1562
2020-07-10,1396.00,1548.92,1376.01,1544.65,23.3376,0,0,TSLA,15344.4938
2020-07-13,1659.00,1794.99,1471.11,1497.06,38.9854,0,0,TSLA,15383.4792
2020-07-14,1556.00,1590.00,1431.00,1516.80,23.4181,0,0,TSLA,15406.8973


In [54]:
tslaData['Volume'].cummax()

Date
2010-06-29    18.7663
2010-06-30    18.7663
2010-07-01    18.7663
2010-07-02    18.7663
2010-07-06    18.7663
               ...   
2020-07-09    60.9388
2020-07-10    60.9388
2020-07-13    60.9388
2020-07-14    60.9388
2020-07-15    60.9388
Name: Volume, Length: 2529, dtype: float64

In [55]:
tslaData['Volume'].cummin()

Date
2010-06-29    18.7663
2010-06-30    17.1871
2010-07-01     8.2188
2010-07-02     5.1398
2010-07-06     5.1398
               ...   
2020-07-09     0.1185
2020-07-10     0.1185
2020-07-13     0.1185
2020-07-14     0.1185
2020-07-15     0.1185
Name: Volume, Length: 2529, dtype: float64

In [56]:
tslaData['Volume'].cumprod()

Date
2010-06-29    1.876630e+01
2010-06-30    3.225383e+02
2010-07-01    2.650878e+03
2010-07-02    1.362498e+04
2010-07-06    9.356138e+04
                  ...     
2020-07-09             inf
2020-07-10             inf
2020-07-13             inf
2020-07-14             inf
2020-07-15             inf
Name: Volume, Length: 2529, dtype: float64

In [58]:
tslaData['Volume'].rolling(10).sum() # sum, mean, median, var, std, min, max, corr, cov, skew, kurt, quantile

Date
2010-06-29         NaN
2010-06-30         NaN
2010-07-01         NaN
2010-07-02         NaN
2010-07-06         NaN
                ...   
2020-07-09    144.7198
2020-07-10    158.8029
2020-07-13    188.9334
2020-07-14    203.3251
2020-07-15    202.6708
Name: Volume, Length: 2529, dtype: float64

In [59]:
zscore = lambda x: (x[-1] - x.mean()) / x.std(ddof=1)
tslaData['Volume'].rolling(10).apply(zscore)

Date
2010-06-29         NaN
2010-06-30         NaN
2010-07-01         NaN
2010-07-02         NaN
2010-07-06         NaN
                ...   
2020-07-09   -0.583061
2020-07-10    1.467559
2020-07-13    2.408939
2020-07-14    0.402659
2020-07-15   -0.519980
Name: Volume, Length: 2529, dtype: float64

In [62]:
# Expanding is very similar to cumulative, but 'waits' (NaN) until the minimum number specified

tslaData['Volume'].expanding(10).sum() # sum, mean, median, var, std, min, max, corr, cov, skew, kurt, quantile

Date
2010-06-29       18.7663
2010-06-30       35.9534
2010-07-01       44.1722
2010-07-02       49.3120
2010-07-06       56.1789
                 ...    
2020-07-09    15321.1562
2020-07-10    15344.4938
2020-07-13    15383.4792
2020-07-14    15406.8973
2020-07-15    15423.1615
Name: Volume, Length: 2529, dtype: float64

In [63]:
tslaData['Volume'].ewm(2).mean() # std, var, corr, cob

Date
2010-06-29    18.766300
2010-06-30    17.818780
2010-07-01    13.271421
2010-07-02     9.893671
2010-07-06     8.731735
                ...    
2020-07-09    15.493710
2020-07-10    18.108340
2020-07-13    25.067360
2020-07-14    24.517607
2020-07-15    21.766471
Name: Volume, Length: 2529, dtype: float64

In [65]:
# Deltas
tslaData['OpenDelta'] = tslaData.Open.diff().shift(-1)
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,CumVolume,OpenDelta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-06-29,19.00,25.00,17.54,23.89,18.7663,0,0,TSLA,18.7663,6.79
2010-06-30,25.79,30.42,23.30,23.83,17.1871,0,0,TSLA,35.9534,-0.79
2010-07-01,25.00,25.92,20.27,21.96,8.2188,0,0,TSLA,44.1722,-2.00
2010-07-02,23.00,23.10,18.71,19.20,5.1398,0,0,TSLA,49.3120,-3.00
2010-07-06,20.00,20.00,15.83,16.11,6.8669,0,0,TSLA,56.1789,-3.60
...,...,...,...,...,...,...,...,...,...,...
2020-07-09,1396.99,1408.56,1351.28,1394.28,11.7176,0,0,TSLA,15321.1562,-0.99
2020-07-10,1396.00,1548.92,1376.01,1544.65,23.3376,0,0,TSLA,15344.4938,263.00
2020-07-13,1659.00,1794.99,1471.11,1497.06,38.9854,0,0,TSLA,15383.4792,-103.00
2020-07-14,1556.00,1590.00,1431.00,1516.80,23.4181,0,0,TSLA,15406.8973,-13.00


In [None]:
# Deltas
tslaData['OpenDelta'] = tslaData.Open.diff() # .shift(-1)
tslaData