### Creating a DF

In [1]:
import pandas as pd
df = pd.DataFrame({'Col1':[1,2,3,4], 'Col2':[5, 6, 7, 8]})  # Pass in a dict
df

Unnamed: 0,Col1,Col2
0,1,5
1,2,6
2,3,7
3,4,8


In [2]:
import pandas as pd
import numpy as np
arr = np.array([[1,2,3], [4,5,6]])  # ndArray
df = pd.DataFrame(arr)
df

df.columns = ['a', 'b', 'c']  # Name the column
df
df = pd.DataFrame(arr, columns=['a', 'b', 'c'])  
df
df = pd.DataFrame(arr, index=['z', 'x'], columns=['a', 'b', 'c'])  # Name indices
df

Unnamed: 0,a,b,c
z,1,2,3
x,4,5,6


In [3]:
# Renaming
df.columns = ['q', 'r', 's']  # Pass in the list, has to be the correct size
df

Unnamed: 0,q,r,s
z,1,2,3
x,4,5,6


### Importing a DF

In [4]:
import yfinance as yf
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")
tslaData.head()  # Top 5 rows

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.8,5.0,3.508,4.778,93831500,0,0.0
2010-06-30,5.158,6.084,4.66,4.766,85935500,0,0.0
2010-07-01,5.0,5.184,4.054,4.392,41094000,0,0.0
2010-07-02,4.6,4.62,3.742,3.84,25699000,0,0.0
2010-07-06,4.0,4.0,3.166,3.222,34334500,0,0.0


In [5]:
tslaData.shape  # number of rows and columns

(2594, 7)

In [6]:
tslaData.index

DatetimeIndex(['2010-06-29', '2010-06-30', '2010-07-01', '2010-07-02',
               '2010-07-06', '2010-07-07', '2010-07-08', '2010-07-09',
               '2010-07-12', '2010-07-13',
               ...
               '2020-10-02', '2020-10-05', '2020-10-06', '2020-10-07',
               '2020-10-08', '2020-10-09', '2020-10-12', '2020-10-13',
               '2020-10-14', '2020-10-15'],
              dtype='datetime64[ns]', name='Date', length=2594, freq=None)

In [7]:
tslaData.reset_index()  # Date becomes a column and index has row number

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...,...
2589,2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
2592,2020-10-14,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0


In [8]:
tslaData.axes

[DatetimeIndex(['2010-06-29', '2010-06-30', '2010-07-01', '2010-07-02',
                '2010-07-06', '2010-07-07', '2010-07-08', '2010-07-09',
                '2010-07-12', '2010-07-13',
                ...
                '2020-10-02', '2020-10-05', '2020-10-06', '2020-10-07',
                '2020-10-08', '2020-10-09', '2020-10-12', '2020-10-13',
                '2020-10-14', '2020-10-15'],
               dtype='datetime64[ns]', name='Date', length=2594, freq=None),
 Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')]

In [9]:
display(tslaData)
print('\n\n')
display(tslaData.to_numpy())  # convert it to an ndarray

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
2020-10-14,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0







array([[3.79999995e+00, 5.00000000e+00, 3.50799990e+00, ...,
        9.38315000e+07, 0.00000000e+00, 0.00000000e+00],
       [5.15799999e+00, 6.08400011e+00, 4.65999985e+00, ...,
        8.59355000e+07, 0.00000000e+00, 0.00000000e+00],
       [5.00000000e+00, 5.18400002e+00, 4.05399990e+00, ...,
        4.10940000e+07, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.43350006e+02, 4.48890015e+02, 4.36600006e+02, ...,
        3.44637000e+07, 0.00000000e+00, 0.00000000e+00],
       [4.49779999e+02, 4.65899994e+02, 4.47350006e+02, ...,
        4.80454000e+07, 0.00000000e+00, 0.00000000e+00],
       [4.50309998e+02, 4.56570007e+02, 4.42500000e+02, ...,
        3.55542000e+07, 0.00000000e+00, 0.00000000e+00]])

In [10]:
tslaData.describe()  # Aggregate columns with default statistics

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,2594.0,2594.0,2594.0,2594.0,2594.0,2594.0,2594.0
mean,51.254029,52.389317,50.091417,51.305136,31249510.0,0.0,0.001928
std,64.222579,66.21401,62.184532,64.416241,28935580.0,0.0,0.098171
min,3.228,3.326,2.996,3.16,592500.0,0.0,0.0
25%,7.1925,7.321,7.014,7.2005,10622620.0,0.0,0.0
50%,44.115002,44.759998,43.390999,44.0,24463250.0,0.0,0.0
75%,59.5725,60.7545,58.409999,59.551999,40492000.0,0.0,0.0
max,502.140015,502.48999,470.51001,498.320007,304694000.0,0.0,5.0


In [11]:
#tslaData.sort_values(by='Volume', ascending=False, kind='mergesort', axis=0)
tslaData.sort_values(by='2010-06-30', ascending=False, kind='mergesort', axis=1)  # sort the dataframe, axis=0 -> operate on row

Unnamed: 0_level_0,Volume,High,Open,Close,Low,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,93831500,5.000000,3.800000,4.778000,3.508000,0,0.0
2010-06-30,85935500,6.084000,5.158000,4.766000,4.660000,0,0.0
2010-07-01,41094000,5.184000,5.000000,4.392000,4.054000,0,0.0
2010-07-02,25699000,4.620000,4.600000,3.840000,3.742000,0,0.0
2010-07-06,34334500,4.000000,4.000000,3.222000,3.166000,0,0.0
...,...,...,...,...,...,...,...
2020-10-09,28925700,434.589996,430.130005,434.000000,426.459991,0,0.0
2020-10-12,38791100,448.739990,442.000000,442.299988,438.579987,0,0.0
2020-10-13,34463700,448.890015,443.350006,446.649994,436.600006,0,0.0
2020-10-14,48045400,465.899994,449.779999,461.299988,447.350006,0,0.0


In [12]:
tslaData.isna()  # whether or not there's a null value
tslaData.dropna()  # drop rows that has null value, useful for data cleaning

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
2020-10-14,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0


In [13]:
#dct = tslaData.to_dict()
dct = tslaData.to_dict(orient='list')  # convert dataframe to dictionary
#display(dct)
pd.DataFrame.from_dict(dct)  # reload the dict originally from the df and recreate the df

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2589,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
2592,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0


### Slicing

In [14]:
tslaData = tslaData.reset_index()

In [15]:
# Slicing
tslaData[2:5]



Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
2,2010-07-01,5.0,5.184,4.054,4.392,41094000,0,0.0
3,2010-07-02,4.6,4.62,3.742,3.84,25699000,0,0.0
4,2010-07-06,4.0,4.0,3.166,3.222,34334500,0,0.0


In [16]:
tslaData[-2:]  # data from the last 2 row

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
2592,2020-10-14,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0
2593,2020-10-15,450.309998,456.570007,442.5,448.880005,35554200,0,0.0


In [17]:
tslaData['Open'][2]  # give 2nd value from the 'Open' column

5.0

In [18]:
tslaData['Open'][0] 

3.799999952316284

In [19]:
tslaData[['Open', 'Close']]  #see multiple columns

Unnamed: 0,Open,Close
0,3.800000,4.778000
1,5.158000,4.766000
2,5.000000,4.392000
3,4.600000,3.840000
4,4.000000,3.222000
...,...,...
2589,430.130005,434.000000
2590,442.000000,442.299988
2591,443.350006,446.649994
2592,449.779999,461.299988


In [20]:
tslaData.loc[2:5,'Close']  # see row 2:5 from 'Close' column

2    4.392
3    3.840
4    3.222
5    3.160
Name: Close, dtype: float64

In [21]:
tslaData.iloc[:,3]  # similar to loc but require a numeric index, i.e. 3 instead of 'Close'

0         3.508000
1         4.660000
2         4.054000
3         3.742000
4         3.166000
           ...    
2589    426.459991
2590    438.579987
2591    436.600006
2592    447.350006
2593    442.500000
Name: Low, Length: 2594, dtype: float64

### Filtering

In [22]:
len(tslaData)  # how many rows

2594

In [23]:
tslaData[tslaData.Volume > 1e7]  # only see row where the volume exceed 10^7

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...,...
2589,2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
2592,2020-10-14,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0


In [24]:
tslaData[(tslaData.Volume > 1e7)&(tslaData.Open < tslaData.Close)]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
6,2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0
9,2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0
10,2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0
13,2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0
...,...,...,...,...,...,...,...,...
2587,2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0
2589,2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


In [25]:
tslaData.loc[lambda x: x.index % 2 == 0]  # slice using a callable (lambda)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
6,2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0
8,2010-07-12,3.590000,3.614000,3.400000,3.410000,11012500,0,0.0
...,...,...,...,...,...,...,...,...
2584,2020-10-02,421.390015,439.130005,415.000000,415.089996,71430000,0,0.0
2586,2020-10-06,423.790009,428.779999,406.049988,413.980011,49146300,0,0.0
2588,2020-10-08,438.440002,439.000000,425.299988,425.920013,40421100,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0


In [26]:
import pandas as pd
import yfinance as yf

## Data Manipulation

In [9]:
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [28]:
display(tslaData)

display(msftData)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
2020-10-14,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1031788800,0.0,0.0
1986-03-14,0.062055,0.065379,0.062055,0.064271,308160000,0.0,0.0
1986-03-17,0.064271,0.065933,0.064271,0.065379,133171200,0.0,0.0
1986-03-18,0.065379,0.065933,0.063163,0.063717,67766400,0.0,0.0
1986-03-19,0.063717,0.064271,0.062055,0.062609,47894400,0.0,0.0
...,...,...,...,...,...,...,...
2020-10-09,211.229996,215.860001,211.229996,215.809998,26458000,0.0,0.0
2020-10-12,218.789993,223.860001,216.809998,221.399994,40461400,0.0,0.0
2020-10-13,222.720001,225.210007,220.429993,222.860001,28950800,0.0,0.0
2020-10-14,223.000000,224.220001,219.130005,220.860001,23451700,0.0,0.0


In [10]:
# reverse pivot, id_vars = what you want to hold constant. Take the column value and turn into a row
# optional: var_name and value_name to rename
melted = tslaData.reset_index().melt(id_vars=['Date'], var_name="Attribute", value_name="Value")
melted

Unnamed: 0,Date,Attribute,Value
0,2010-06-29,Open,3.800
1,2010-06-30,Open,5.158
2,2010-07-01,Open,5.000
3,2010-07-02,Open,4.600
4,2010-07-06,Open,4.000
...,...,...,...
18167,2020-10-13,Stock Splits,0.000
18168,2020-10-14,Stock Splits,0.000
18169,2020-10-15,Stock Splits,0.000
18170,2020-10-16,Stock Splits,0.000


In [15]:
pivoted = melted.pivot(index='Date', columns='Attribute')  # Reverse melt state to pivot state
pivoted = pivoted['Value'].reset_index()
pivoted.columns.name = None
pivoted

Unnamed: 0,Date,Close,Dividends,High,Low,Open,Stock Splits,Volume
0,2010-06-29,4.778000,0.0,5.000000,3.508000,3.800000,0.0,93831500.0
1,2010-06-30,4.766000,0.0,6.084000,4.660000,5.158000,0.0,85935500.0
2,2010-07-01,4.392000,0.0,5.184000,4.054000,5.000000,0.0,41094000.0
3,2010-07-02,3.840000,0.0,4.620000,3.742000,4.600000,0.0,25699000.0
4,2010-07-06,3.222000,0.0,4.000000,3.166000,4.000000,0.0,34334500.0
...,...,...,...,...,...,...,...,...
2591,2020-10-13,446.649994,0.0,448.890015,436.600006,443.350006,0.0,34463700.0
2592,2020-10-14,461.299988,0.0,465.899994,447.350006,449.779999,0.0,48045400.0
2593,2020-10-15,448.880005,0.0,456.570007,442.500000,450.309998,0.0,35672400.0
2594,2020-10-16,439.670013,0.0,455.950012,438.850006,454.440002,0.0,32775900.0


In [31]:
tslaData['Ticker'] = 'TSLA'  # can also pass in a list to assign different values for each rows.
msftData['Ticker'] = 'MSFT'
allData = pd.concat([tslaData, msftData], axis=0)  # Stack them in a single table, concat take a list of data frame. Axis = 1, stack them side by side (horizontally)
allData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0.0,0.0,TSLA
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0.0,0.0,TSLA
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0.0,0.0,TSLA
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0.0,0.0,TSLA
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0.0,0.0,TSLA
...,...,...,...,...,...,...,...,...
2020-10-09,211.229996,215.860001,211.229996,215.809998,26458000,0.0,0.0,MSFT
2020-10-12,218.789993,223.860001,216.809998,221.399994,40461400,0.0,0.0,MSFT
2020-10-13,222.720001,225.210007,220.429993,222.860001,28950800,0.0,0.0,MSFT
2020-10-14,223.000000,224.220001,219.130005,220.860001,23451700,0.0,0.0,MSFT


In [32]:
import numpy as np
allData.pivot_table(index = ['Ticker'], columns=[], values=['Open', 'Close'], aggfunc=np.std) # also np.mean and others

Unnamed: 0_level_0,Close,Open
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
MSFT,36.960719,36.945707
TSLA,64.416241,64.222579


In [66]:
# Pass in the column you want to groupby and do an aggregation
allData.groupby(by=['Date', 'Ticker']).sum() # Also see as_index=False

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1986-03-13,MSFT,0.056514,0.064825,0.056514,0.062055,1031.7888,0.0,0.0,9.803927,0.005541,60.009023
1986-03-14,MSFT,0.062055,0.065379,0.062055,0.064271,308.1600,0.0,0.0,3.571430,0.002216,60.009023
1986-03-17,MSFT,0.064271,0.065933,0.064271,0.065379,133.1712,0.0,0.0,1.724131,0.001108,60.009023
1986-03-18,MSFT,0.065379,0.065933,0.063163,0.063717,67.7664,0.0,0.0,-2.542371,-0.001662,60.009023
1986-03-19,MSFT,0.063717,0.064271,0.062055,0.062609,47.8944,0.0,0.0,-1.739131,-0.001108,60.009023
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-13,TSLA,443.350006,448.890015,436.600006,446.649994,34.4637,0.0,0.0,0.000000,0.000000,31.249511
2020-10-14,MSFT,223.000000,224.220001,219.130005,220.860001,23.4517,0.0,0.0,-0.959641,-2.139999,60.009023
2020-10-14,TSLA,449.779999,465.899994,447.350006,461.299988,48.0454,0.0,0.0,0.000000,0.000000,31.249511
2020-10-15,MSFT,217.100006,220.360001,216.009995,219.660004,22.7184,0.0,0.0,1.179179,2.559998,60.009023


In [34]:
# Custom aggregation
from scipy import stats
# We want to use stats.sem (standard error). Can also create your own function
allData.groupby(by=['Ticker']).agg({'Open': stats.sem, 'Close':np.sum}) # Can customize by column as well

Unnamed: 0_level_0,Open,Close
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
MSFT,0.395622,234732.822972
TSLA,1.260964,133085.523901


### Joins


In [67]:
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [68]:
# taking subset of the data
tslaData = tslaData[tslaData.Open<tslaData.Close]
msftData = msftData[msftData.Open<msftData.Close]

In [72]:
# Join on the indices from the left hand table (msft in this case, right hand is tsla)
x = msftData.join(tslaData, how='left', rsuffix='_tsla', lsuffix='_msft') # left, right, outer, inner
x.dropna()  # Drop the NaN values

Unnamed: 0_level_0,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2010-07-19,19.670458,19.938405,19.631055,19.883240,38181800,0.0,0.0,4.274000,4.450000,4.184000,4.382000,12432500.0,0.0,0.0
2010-07-22,20.103902,20.482179,20.072378,20.363968,73016400,0.0,0.0,4.100000,4.250000,4.074000,4.200000,4789000.0,0.0,0.0
2010-08-02,20.482181,20.789531,20.293042,20.750128,55044600,0.0,0.0,4.100000,4.194000,4.066000,4.184000,3590500.0,0.0,0.0
2010-08-13,19.189732,19.441917,19.103043,19.229136,45263500,0.0,0.0,3.636000,3.690000,3.532000,3.664000,3170000.0,0.0,0.0
2010-08-16,19.197607,19.394627,19.150321,19.307938,40909700,0.0,0.0,3.690000,3.760000,3.652000,3.756000,2429000.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-05,207.220001,210.410004,206.979996,210.380005,21331600,0.0,0.0,423.350006,433.640015,419.329987,425.679993,44722800.0,0.0,0.0
2020-10-07,207.059998,210.110001,206.720001,209.830002,25681100,0.0,0.0,419.869995,429.899994,413.850006,425.299988,43127700.0,0.0,0.0
2020-10-09,211.229996,215.860001,211.229996,215.809998,26458000,0.0,0.0,430.130005,434.589996,426.459991,434.000000,28925700.0,0.0,0.0
2020-10-12,218.789993,223.860001,216.809998,221.399994,40461400,0.0,0.0,442.000000,448.739990,438.579987,442.299988,38791100.0,0.0,0.0


In [73]:
# Inner: intesect of data. Outer = union of data
msftData.join(tslaData, how='outer', rsuffix='_tsla', lsuffix='_msft') # left, right, outer, inner

Unnamed: 0_level_0,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1.031789e+09,0.0,0.0,,,,,,,
1986-03-14,0.062055,0.065379,0.062055,0.064271,3.081600e+08,0.0,0.0,,,,,,,
1986-03-17,0.064271,0.065933,0.064271,0.065379,1.331712e+08,0.0,0.0,,,,,,,
1986-03-25,0.057623,0.058731,0.057068,0.058731,3.208320e+07,0.0,0.0,,,,,,,
1986-03-26,0.058731,0.060947,0.058177,0.060393,2.275200e+07,0.0,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-09,211.229996,215.860001,211.229996,215.809998,2.645800e+07,0.0,0.0,430.130005,434.589996,426.459991,434.000000,28925700.0,0.0,0.0
2020-10-12,218.789993,223.860001,216.809998,221.399994,4.046140e+07,0.0,0.0,442.000000,448.739990,438.579987,442.299988,38791100.0,0.0,0.0
2020-10-13,222.720001,225.210007,220.429993,222.860001,2.895080e+07,0.0,0.0,443.350006,448.890015,436.600006,446.649994,34463700.0,0.0,0.0
2020-10-14,,,,,,,,449.779999,465.899994,447.350006,461.299988,48045400.0,0.0,0.0


In [38]:
msftData.reset_index().join(tslaData, on='Date', how='right', lsuffix='_tsla', rsuffix='_msft') # left, right, outer, inner

Unnamed: 0,Date,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft
2978.0,2010-07-19,19.670458,19.938405,19.631055,19.883240,38181800.0,0.0,0.0,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0
2980.0,2010-07-22,20.103902,20.482179,20.072378,20.363968,73016400.0,0.0,0.0,4.100000,4.250000,4.074000,4.200000,4789000,0,0.0
2984.0,2010-08-02,20.482181,20.789531,20.293042,20.750128,55044600.0,0.0,0.0,4.100000,4.194000,4.066000,4.184000,3590500,0,0.0
2989.0,2010-08-13,19.189732,19.441917,19.103043,19.229136,45263500.0,0.0,0.0,3.636000,3.690000,3.532000,3.664000,3170000,0,0.0
2990.0,2010-08-16,19.197607,19.394627,19.150321,19.307938,40909700.0,0.0,0.0,3.690000,3.760000,3.652000,3.756000,2429000,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,2020-09-04,,,,,,,,402.809998,428.000000,372.019989,418.320007,110321900,0,0.0
,2020-09-16,,,,,,,,439.869995,457.790009,435.309998,441.760010,72279300,0,0.0
,2020-09-29,,,,,,,,416.000000,428.500000,411.600006,419.070007,50219300,0,0.0
,2020-10-01,,,,,,,,440.760010,448.880005,434.420013,448.160004,50741500,0,0.0


In [39]:
tslaData.merge(msftData, how='left', left_on='Date', right_on='Date', suffixes=['_tsla', '_msft'])  # more flexible

Unnamed: 0_level_0,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0,,,,,,,
2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0,,,,,,,
2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0,,,,,,,
2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0,,,,,,,
2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0,19.670458,19.938405,19.631055,19.883240,38181800.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0,207.059998,210.110001,206.720001,209.830002,25681100.0,0.0,0.0
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0,211.229996,215.860001,211.229996,215.809998,26458000.0,0.0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0,218.789993,223.860001,216.809998,221.399994,40461400.0,0.0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0,222.720001,225.210007,220.429993,222.860001,28950800.0,0.0,0.0


In [40]:
pd.merge_asof(msftData, tslaData, left_on='Date', right_on='Date', suffixes=['_msft', '_tsla'], direction='nearest')  # useful for timeseries, try to join on on the closest data

Unnamed: 0,Date,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla
0,1986-03-13,0.056514,0.064825,0.056514,0.062055,1031788800,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,1986-03-14,0.062055,0.065379,0.062055,0.064271,308160000,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2,1986-03-17,0.064271,0.065933,0.064271,0.065379,133171200,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
3,1986-03-25,0.057623,0.058731,0.057068,0.058731,32083200,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
4,1986-03-26,0.058731,0.060947,0.058177,0.060393,22752000,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4303,2020-10-08,210.509995,211.190002,208.320007,210.580002,19925800,0.0,0.0,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0
4304,2020-10-09,211.229996,215.860001,211.229996,215.809998,26458000,0.0,0.0,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
4305,2020-10-12,218.789993,223.860001,216.809998,221.399994,40461400,0.0,0.0,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
4306,2020-10-13,222.720001,225.210007,220.429993,222.860001,28950800,0.0,0.0,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


### MultiIndex

In [2]:
import yfinance as yf
tickers = yf.Tickers('msft aapl goog tsla')
hist = tickers.history(group_by="ticker")

[*********************100%***********************]  4 of 4 completed


In [3]:
hist.head()

Unnamed: 0_level_0,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,...,MSFT,MSFT,MSFT,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Dividends,Stock Splits,Open,High,Low,...,Volume,Dividends,Stock Splits,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-09-21,453.130005,455.679993,407.070007,449.390015,109476800,0,0,104.540001,110.190002,103.099998,...,39839700,0,0,1440.060059,1448.359985,1406.550049,1431.160034,2888800,0,0
2020-09-22,429.600006,437.76001,417.600006,424.230011,79580800,0,0,112.68,112.860001,109.160004,...,33517100,0,0,1450.089966,1469.52002,1434.530029,1465.459961,1583200,0,0
2020-09-23,405.160004,412.149994,375.880005,380.359985,95074200,0,0,111.620003,112.110001,106.769997,...,30803800,0,0,1458.780029,1460.959961,1407.699951,1415.209961,1657400,0,0
2020-09-24,363.799988,399.5,351.299988,387.790009,96561100,0,0,105.169998,110.25,105.0,...,31202500,0,0,1411.030029,1443.708984,1409.849976,1428.290039,1450200,0,0
2020-09-25,393.470001,408.730011,391.299988,407.339996,67208500,0,0,108.43,112.440002,107.669998,...,29437300,0,0,1432.630005,1450.0,1413.339966,1444.959961,1323000,0,0


In [4]:
hist.columns

MultiIndex([('TSLA',         'Open'),
            ('TSLA',         'High'),
            ('TSLA',          'Low'),
            ('TSLA',        'Close'),
            ('TSLA',       'Volume'),
            ('TSLA',    'Dividends'),
            ('TSLA', 'Stock Splits'),
            ('AAPL',         'Open'),
            ('AAPL',         'High'),
            ('AAPL',          'Low'),
            ('AAPL',        'Close'),
            ('AAPL',       'Volume'),
            ('AAPL',    'Dividends'),
            ('AAPL', 'Stock Splits'),
            ('MSFT',         'Open'),
            ('MSFT',         'High'),
            ('MSFT',          'Low'),
            ('MSFT',        'Close'),
            ('MSFT',       'Volume'),
            ('MSFT',    'Dividends'),
            ('MSFT', 'Stock Splits'),
            ('GOOG',         'Open'),
            ('GOOG',         'High'),
            ('GOOG',          'Low'),
            ('GOOG',        'Close'),
            ('GOOG',       'Volume'),
            

In [5]:
hist.shape

(21, 28)

In [45]:
hist

Unnamed: 0_level_0,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,TSLA,TSLA,TSLA,...,MSFT,MSFT,MSFT,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Dividends,Stock Splits,Open,High,Low,...,Volume,Dividends,Stock Splits,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-09-16,1555.540039,1562.0,1519.819946,1520.900024,1311700,0,0,439.869995,457.790009,435.309998,...,26328100,0,0,115.230003,116.0,112.040001,112.129997,154679000,0,0
2020-09-17,1496.0,1508.297974,1470.0,1495.530029,1879800,0,0,415.600006,437.790009,408.0,...,34011300,0,0,109.720001,112.199997,108.709999,110.339996,178011000,0,0
2020-09-18,1498.01001,1503.003052,1437.130005,1459.98999,3103900,0,0,447.940002,451.0,428.799988,...,55225300,0,0,110.400002,110.879997,106.089996,106.839996,287104900,0,0
2020-09-21,1440.060059,1448.359985,1406.550049,1431.160034,2888800,0,0,453.130005,455.679993,407.070007,...,39839700,0,0,104.540001,110.190002,103.099998,110.080002,195713800,0,0
2020-09-22,1450.089966,1469.52002,1434.530029,1465.459961,1583200,0,0,429.600006,437.76001,417.600006,...,33517100,0,0,112.68,112.860001,109.160004,111.809998,183055400,0,0
2020-09-23,1458.780029,1460.959961,1407.699951,1415.209961,1657400,0,0,405.160004,412.149994,375.880005,...,30803800,0,0,111.620003,112.110001,106.769997,107.120003,150718700,0,0
2020-09-24,1411.030029,1443.708984,1409.849976,1428.290039,1450200,0,0,363.799988,399.5,351.299988,...,31202500,0,0,105.169998,110.25,105.0,108.220001,167743300,0,0
2020-09-25,1432.630005,1450.0,1413.339966,1444.959961,1323000,0,0,393.470001,408.730011,391.299988,...,29437300,0,0,108.43,112.440002,107.669998,112.279999,149981400,0,0
2020-09-28,1474.209961,1476.800049,1449.301025,1464.52002,2007900,0,0,424.619995,428.079987,415.549988,...,32004900,0,0,115.010002,115.32,112.779999,114.959999,137672400,0,0
2020-09-29,1470.390015,1476.662964,1458.805054,1469.329956,978200,0,0,416.0,428.5,411.600006,...,24221900,0,0,114.550003,115.309998,113.57,114.089996,99382200,0,0


In [6]:
# Stack the table
hist.stack(level=0).reset_index().rename(columns = {'level_1':'Ticker'})

# Rename column level_1

Unnamed: 0,Date,Ticker,Close,Dividends,High,Low,Open,Stock Splits,Volume
0,2020-09-21,AAPL,110.080002,0,110.190002,103.099998,104.540001,0,195713800
1,2020-09-21,GOOG,1431.160034,0,1448.359985,1406.550049,1440.060059,0,2888800
2,2020-09-21,MSFT,202.539993,0,202.710007,196.380005,197.190002,0,39839700
3,2020-09-21,TSLA,449.390015,0,455.679993,407.070007,453.130005,0,109476800
4,2020-09-22,AAPL,111.809998,0,112.860001,109.160004,112.680000,0,183055400
...,...,...,...,...,...,...,...,...,...
79,2020-10-16,TSLA,439.670013,0,455.950012,438.850006,454.440002,0,32775900
80,2020-10-19,AAPL,115.980003,0,120.419998,115.660004,119.959999,0,120221600
81,2020-10-19,GOOG,1534.609985,0,1588.150024,1528.000000,1580.459961,0,1606500
82,2020-10-19,MSFT,214.220001,0,222.300003,213.720001,220.419998,0,27591900


In [7]:
hist.stack(level=0).unstack()

Unnamed: 0_level_0,Close,Close,Close,Close,Dividends,Dividends,Dividends,Dividends,High,High,...,Open,Open,Stock Splits,Stock Splits,Stock Splits,Stock Splits,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AAPL,GOOG,MSFT,TSLA,AAPL,GOOG,MSFT,TSLA,AAPL,GOOG,...,MSFT,TSLA,AAPL,GOOG,MSFT,TSLA,AAPL,GOOG,MSFT,TSLA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-09-21,110.080002,1431.160034,202.539993,449.390015,0,0,0,0,110.190002,1448.359985,...,197.190002,453.130005,0,0,0,0,195713800,2888800,39839700,109476800
2020-09-22,111.809998,1465.459961,207.419998,424.230011,0,0,0,0,112.860001,1469.52002,...,205.059998,429.600006,0,0,0,0,183055400,1583200,33517100,79580800
2020-09-23,107.120003,1415.209961,200.589996,380.359985,0,0,0,0,112.110001,1460.959961,...,207.899994,405.160004,0,0,0,0,150718700,1657400,30803800,95074200
2020-09-24,108.220001,1428.290039,203.190002,387.790009,0,0,0,0,110.25,1443.708984,...,199.850006,363.799988,0,0,0,0,167743300,1450200,31202500,96561100
2020-09-25,112.279999,1444.959961,207.820007,407.339996,0,0,0,0,112.440002,1450.0,...,203.550003,393.470001,0,0,0,0,149981400,1323000,29437300,67208500
2020-09-28,114.959999,1464.52002,209.440002,421.200012,0,0,0,0,115.32,1476.800049,...,210.880005,424.619995,0,0,0,0,137672400,2007900,32004900,49719600
2020-09-29,114.089996,1469.329956,207.259995,419.070007,0,0,0,0,115.309998,1476.662964,...,209.350006,416.0,0,0,0,0,99382200,978200,24221900,50219300
2020-09-30,115.809998,1469.599976,210.330002,429.01001,0,0,0,0,117.260002,1489.75,...,207.729996,421.320007,0,0,0,0,142675200,1700600,33780700,48145600
2020-10-01,116.790001,1490.089966,212.460007,448.160004,0,0,0,0,117.720001,1499.040039,...,213.490005,440.76001,0,0,0,0,116120400,1779500,27158400,50741500
2020-10-02,113.019997,1458.420044,206.190002,415.089996,0,0,0,0,115.370003,1483.199951,...,208.0,421.390015,0,0,0,0,144712000,1284100,33154800,71430000


## Transformations

In [48]:
import pandas as pd
import yfinance as yf
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [49]:
# Calculated columns
msftData['Daily Gain %'] = 100*(msftData['Close']-msftData['Open'])/msftData['Open'] # Approach 1 (preferred generally)
msftData = msftData.assign(Daily_Gain=lambda x:x['Close']-x['Open']) # Approach 2

msftData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1031788800,0.0,0.0,9.803927,0.005541
1986-03-14,0.062055,0.065379,0.062055,0.064271,308160000,0.0,0.0,3.571430,0.002216
1986-03-17,0.064271,0.065933,0.064271,0.065379,133171200,0.0,0.0,1.724131,0.001108
1986-03-18,0.065379,0.065933,0.063163,0.063717,67766400,0.0,0.0,-2.542371,-0.001662
1986-03-19,0.063717,0.064271,0.062055,0.062609,47894400,0.0,0.0,-1.739131,-0.001108
...,...,...,...,...,...,...,...,...,...
2020-10-09,211.229996,215.860001,211.229996,215.809998,26458000,0.0,0.0,2.168254,4.580002
2020-10-12,218.789993,223.860001,216.809998,221.399994,40461400,0.0,0.0,1.192925,2.610001
2020-10-13,222.720001,225.210007,220.429993,222.860001,28950800,0.0,0.0,0.062859,0.139999
2020-10-14,223.000000,224.220001,219.130005,220.860001,23451700,0.0,0.0,-0.959641,-2.139999


In [50]:
tslaData.apply(lambda x:x*1e-6)  # scale back every column by 1e-6

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,0.000004,0.000005,0.000004,0.000005,93.8315,0.0,0.0
2010-06-30,0.000005,0.000006,0.000005,0.000005,85.9355,0.0,0.0
2010-07-01,0.000005,0.000005,0.000004,0.000004,41.0940,0.0,0.0
2010-07-02,0.000005,0.000005,0.000004,0.000004,25.6990,0.0,0.0
2010-07-06,0.000004,0.000004,0.000003,0.000003,34.3345,0.0,0.0
...,...,...,...,...,...,...,...
2020-10-09,0.000430,0.000435,0.000426,0.000434,28.9257,0.0,0.0
2020-10-12,0.000442,0.000449,0.000439,0.000442,38.7911,0.0,0.0
2020-10-13,0.000443,0.000449,0.000437,0.000447,34.4637,0.0,0.0
2020-10-14,0.000450,0.000466,0.000447,0.000461,48.0454,0.0,0.0


In [51]:
tslaData['Volume'] = tslaData['Volume'].apply(lambda x:x*1e-6)  # scale back the volume column bby 1e-6
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0,0.0
...,...,...,...,...,...,...,...
2020-10-09,430.130005,434.589996,426.459991,434.000000,28.9257,0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38.7911,0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34.4637,0,0.0
2020-10-14,449.779999,465.899994,447.350006,461.299988,48.0454,0,0.0


In [74]:
msftData['Volume'] = msftData['Volume'].transform(lambda x:x*1e-6)  # simple case, not much different from apply
msftData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1031.7888,0.0,0.0
1986-03-14,0.062055,0.065379,0.062055,0.064271,308.1600,0.0,0.0
1986-03-17,0.064271,0.065933,0.064271,0.065379,133.1712,0.0,0.0
1986-03-25,0.057623,0.058731,0.057068,0.058731,32.0832,0.0,0.0
1986-03-26,0.058731,0.060947,0.058177,0.060393,22.7520,0.0,0.0
...,...,...,...,...,...,...,...
2020-10-08,210.509995,211.190002,208.320007,210.580002,19.9258,0.0,0.0
2020-10-09,211.229996,215.860001,211.229996,215.809998,26.4580,0.0,0.0
2020-10-12,218.789993,223.860001,216.809998,221.399994,40.4614,0.0,0.0
2020-10-13,222.720001,225.210007,220.429993,222.860001,28.9508,0.0,0.0


In [53]:
tslaData['Ticker'] = 'TSLA'
msftData['Ticker'] = 'MSFT'
allData = pd.concat([tslaData, msftData], axis=0)  # combine tsla and msft data to a single dataframe


In [77]:
grouped = allData.groupby(['Ticker']).agg({'Volume': np.sum})  # sum total volume by the ticker
grouped

Unnamed: 0_level_0,Volume
Ticker,Unnamed: 1_level_1
MSFT,523338.6871
TSLA,81061.2308


In [78]:
grouped = allData.groupby(['Ticker']).transform(np.sum)  # place it back to the original table, maintain table structure
grouped

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-06-29,132952.952120,135897.888107,129937.136043,133085.523901,81061.2308,0.00,5.0,0.000000,0.000000,81061.2308
2010-06-30,132952.952120,135897.888107,129937.136043,133085.523901,81061.2308,0.00,5.0,0.000000,0.000000,81061.2308
2010-07-01,132952.952120,135897.888107,129937.136043,133085.523901,81061.2308,0.00,5.0,0.000000,0.000000,81061.2308
2010-07-02,132952.952120,135897.888107,129937.136043,133085.523901,81061.2308,0.00,5.0,0.000000,0.000000,81061.2308
2010-07-06,132952.952120,135897.888107,129937.136043,133085.523901,81061.2308,0.00,5.0,0.000000,0.000000,81061.2308
...,...,...,...,...,...,...,...,...,...,...
2020-10-09,234668.687067,237166.955768,232112.644185,234732.822972,523338.6871,19.15,17.0,600.046874,64.135906,523338.6871
2020-10-12,234668.687067,237166.955768,232112.644185,234732.822972,523338.6871,19.15,17.0,600.046874,64.135906,523338.6871
2020-10-13,234668.687067,237166.955768,232112.644185,234732.822972,523338.6871,19.15,17.0,600.046874,64.135906,523338.6871
2020-10-14,234668.687067,237166.955768,232112.644185,234732.822972,523338.6871,19.15,17.0,600.046874,64.135906,523338.6871


In [55]:
#allData['TickerAvgVolume'] = allData.groupby(['Ticker']).agg({'Volume': 'sum'}) # doesnt work
# For each ticker, what's the average volume is and put that volume back to the table
allData.join(allData.groupby(['Ticker']).agg({'Volume': 'mean'}).rename(columns={'Volume':'TickerAvgVolume'}), on='Ticker')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0.0,0.0,TSLA,,,31.249511
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0.0,0.0,TSLA,,,31.249511
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0.0,0.0,TSLA,,,31.249511
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0.0,0.0,TSLA,,,31.249511
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0.0,0.0,TSLA,,,31.249511
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-09,211.229996,215.860001,211.229996,215.809998,26.4580,0.0,0.0,MSFT,2.168254,4.580002,60.009023
2020-10-12,218.789993,223.860001,216.809998,221.399994,40.4614,0.0,0.0,MSFT,1.192925,2.610001,60.009023
2020-10-13,222.720001,225.210007,220.429993,222.860001,28.9508,0.0,0.0,MSFT,0.062859,0.139999,60.009023
2020-10-14,223.000000,224.220001,219.130005,220.860001,23.4517,0.0,0.0,MSFT,-0.959641,-2.139999,60.009023


In [56]:
allData.join(allData.groupby(['Ticker']).agg({'Volume': 'mean'}).rename(columns={'Volume':'TickerAvgVolume'}), on='Ticker')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0.0,0.0,TSLA,,,31.249511
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0.0,0.0,TSLA,,,31.249511
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0.0,0.0,TSLA,,,31.249511
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0.0,0.0,TSLA,,,31.249511
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0.0,0.0,TSLA,,,31.249511
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-09,211.229996,215.860001,211.229996,215.809998,26.4580,0.0,0.0,MSFT,2.168254,4.580002,60.009023
2020-10-12,218.789993,223.860001,216.809998,221.399994,40.4614,0.0,0.0,MSFT,1.192925,2.610001,60.009023
2020-10-13,222.720001,225.210007,220.429993,222.860001,28.9508,0.0,0.0,MSFT,0.062859,0.139999,60.009023
2020-10-14,223.000000,224.220001,219.130005,220.860001,23.4517,0.0,0.0,MSFT,-0.959641,-2.139999,60.009023


In [57]:
allData.groupby(['Ticker']).transform('mean')[['Volume']] # vs agg

Unnamed: 0_level_0,Volume
Date,Unnamed: 1_level_1
2010-06-29,31.249511
2010-06-30,31.249511
2010-07-01,31.249511
2010-07-02,31.249511
2010-07-06,31.249511
...,...
2020-10-09,60.009023
2020-10-12,60.009023
2020-10-13,60.009023
2020-10-14,60.009023


In [58]:
# Cleaner!
allData['TickerAvgVolume'] = allData.groupby(['Ticker']).transform('mean')[['Volume']].rename(columns={'Volume':'TickerAvgVolume'})
allData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0.0,0.0,TSLA,,,31.249511
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0.0,0.0,TSLA,,,31.249511
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0.0,0.0,TSLA,,,31.249511
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0.0,0.0,TSLA,,,31.249511
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0.0,0.0,TSLA,,,31.249511
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-09,211.229996,215.860001,211.229996,215.809998,26.4580,0.0,0.0,MSFT,2.168254,4.580002,60.009023
2020-10-12,218.789993,223.860001,216.809998,221.399994,40.4614,0.0,0.0,MSFT,1.192925,2.610001,60.009023
2020-10-13,222.720001,225.210007,220.429993,222.860001,28.9508,0.0,0.0,MSFT,0.062859,0.139999,60.009023
2020-10-14,223.000000,224.220001,219.130005,220.860001,23.4517,0.0,0.0,MSFT,-0.959641,-2.139999,60.009023


### Time Series

#### Timestamps

In [59]:
# Using datetime
import datetime
import pytz

dateToConvert = datetime.datetime(2020,7,1,15,31,24) # This is a 'naive' time without timezone
dateToConvert

datetime.datetime(2020, 7, 1, 15, 31, 24)

In [60]:
dateToConvert.astimezone(pytz.UTC) # Set to UTC, Time stays the same
#dateToConvert.astimezone(pytz.timezone('US/Eastern')) # Set to Eastern, Time stays the same

datetime.datetime(2020, 7, 1, 22, 31, 24, tzinfo=<UTC>)

In [61]:
pytz.timezone('US/Eastern').localize(dateToConvert)

datetime.datetime(2020, 7, 1, 15, 31, 24, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>)

In [62]:
#dateToConvert = dateToConvert.astimezone(pytz.UTC)
# localize take the original datetime and localize to the specified timezone
pytz.timezone('America/Chicago').localize(dateToConvert).astimezone(pytz.UTC) # America/Chicago

datetime.datetime(2020, 7, 1, 20, 31, 24, tzinfo=<UTC>)

In [63]:
start = pd.to_datetime('2015-01-01')  # convert string to an np.datetime64
end = pd.to_datetime('2018-01-01')

def random_dates(start, end, n=10):

    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

dates = random_dates(start, end, 1000000)
pyDates = list(dates.to_pydatetime())


In [64]:
pyDates

[datetime.datetime(2016, 8, 23, 20, 24, 2),
 datetime.datetime(2015, 3, 2, 9, 44, 12),
 datetime.datetime(2016, 10, 26, 23, 46, 1),
 datetime.datetime(2017, 10, 20, 21, 24, 25),
 datetime.datetime(2015, 11, 21, 11, 40, 7),
 datetime.datetime(2016, 10, 17, 22, 12, 8),
 datetime.datetime(2015, 9, 5, 7, 44, 11),
 datetime.datetime(2016, 6, 30, 11, 11, 5),
 datetime.datetime(2015, 10, 9, 12, 32, 3),
 datetime.datetime(2017, 12, 10, 19, 28, 2),
 datetime.datetime(2015, 12, 4, 13, 53, 30),
 datetime.datetime(2015, 11, 20, 23, 49, 53),
 datetime.datetime(2015, 6, 13, 22, 13, 31),
 datetime.datetime(2017, 10, 25, 7, 48, 40),
 datetime.datetime(2016, 2, 6, 20, 42, 15),
 datetime.datetime(2016, 9, 17, 18, 12, 51),
 datetime.datetime(2015, 7, 6, 7, 5, 40),
 datetime.datetime(2017, 4, 22, 9, 40, 39),
 datetime.datetime(2017, 9, 12, 22, 51, 54),
 datetime.datetime(2015, 2, 16, 12, 52, 4),
 datetime.datetime(2015, 3, 16, 16, 48, 30),
 datetime.datetime(2017, 2, 9, 12, 23, 28),
 datetime.datetime(201

In [65]:
from timer import Timer

with Timer('ConvertDatesPy') as t:
    res = [pytz.timezone('US/Eastern').localize(r).astimezone(pytz.UTC) for r in pyDates]

with Timer('ConvertDatesNP') as t:
    res = dates.tz_localize('US/Eastern', nonexistent='shift_forward', ambiguous='NaT').tz_convert('UTC')  # if unclear, shift forward, if not a datetime, return NaT i.e. Not a Time

ImportError: cannot import name 'Timer' from 'timer' (D:\anaconda3\lib\site-packages\win32\timer.pyd)

In [79]:
dates

DatetimeIndex(['2016-08-23 20:24:02', '2015-03-02 09:44:12',
               '2016-10-26 23:46:01', '2017-10-20 21:24:25',
               '2015-11-21 11:40:07', '2016-10-17 22:12:08',
               '2015-09-05 07:44:11', '2016-06-30 11:11:05',
               '2015-10-09 12:32:03', '2017-12-10 19:28:02',
               ...
               '2016-02-14 03:58:47', '2015-08-02 18:31:07',
               '2017-11-28 03:10:36', '2015-08-16 04:09:01',
               '2017-08-02 14:19:29', '2015-09-24 14:12:12',
               '2017-06-08 23:02:35', '2015-12-17 02:42:05',
               '2016-03-30 23:53:31', '2017-09-16 16:57:02'],
              dtype='datetime64[ns]', length=1000000, freq=None)

In [80]:
res

NameError: name 'res' is not defined

In [81]:
# Add/Subtract business day
pd.to_datetime('2020.01.03') + pd.offsets.BDay()

Timestamp('2020-01-06 00:00:00')

In [82]:
# Date ranges
pd.date_range('1/1/2010', periods=10, freq=2 * pd.offsets.BDay())

DatetimeIndex(['2010-01-01', '2010-01-05', '2010-01-07', '2010-01-11',
               '2010-01-13', '2010-01-15', '2010-01-19', '2010-01-21',
               '2010-01-25', '2010-01-27'],
              dtype='datetime64[ns]', freq='2B')

In [83]:
#asfreq

In [84]:
dayMinuteBars = tsla.history(period='2d', interval='1m')

In [85]:
dayMinuteBars

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-14 09:30:00-04:00,449.779999,450.010010,448.010010,448.890015,1536522,0,0
2020-10-14 09:31:00-04:00,448.899994,449.459991,447.350006,448.899994,271293,0,0
2020-10-14 09:32:00-04:00,449.079987,453.000000,448.399994,452.917908,630737,0,0
2020-10-14 09:33:00-04:00,452.850006,454.608490,452.609985,454.380890,474245,0,0
2020-10-14 09:34:00-04:00,454.339996,454.799988,453.324005,453.540009,402649,0,0
...,...,...,...,...,...,...,...
2020-10-15 15:55:00-04:00,448.630005,449.589996,448.510193,449.305603,87386,0,0
2020-10-15 15:56:00-04:00,449.290009,449.649994,449.170013,449.322388,77293,0,0
2020-10-15 15:57:00-04:00,449.309998,449.450012,448.760010,448.822998,61143,0,0
2020-10-15 15:58:00-04:00,448.850006,449.366302,448.790009,449.200012,97709,0,0


In [86]:
# volume weighted average price
def vwap(bars): 
    return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)

vwap(dayMinuteBars[dayMinuteBars.index.date==pd.to_datetime('2020-07-15')])

  return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)


nan

In [87]:
dayMinuteBars.eval(
        'wgtd = Close * Volume', inplace=False
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / Volume')

Datetime
2020-10-14 09:30:00-04:00    448.890015
2020-10-14 09:31:00-04:00    448.891512
2020-10-14 09:32:00-04:00    449.932949
2020-10-14 09:33:00-04:00    450.657137
2020-10-14 09:34:00-04:00    451.007251
                                ...    
2020-10-15 15:55:00-04:00    448.895178
2020-10-15 15:56:00-04:00    448.896138
2020-10-15 15:57:00-04:00    448.896008
2020-10-15 15:58:00-04:00    448.896867
2020-10-15 15:59:00-04:00    448.896771
Length: 780, dtype: float64

In [89]:
vwap = dayMinuteBars.assign(  # create a calculator column
    vwap=dayMinuteBars.eval(
        'wgtd = Close * Volume', inplace=False  # interim column
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / Volume')
).groupby(dayMinuteBars.index.date).agg({'vwap':lambda x: x.iloc[-1]})
vwap

Unnamed: 0,vwap
2020-10-14,459.511531
2020-10-15,448.896771


In [90]:
dayMinuteBars

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-14 09:30:00-04:00,449.779999,450.010010,448.010010,448.890015,1536522,0,0
2020-10-14 09:31:00-04:00,448.899994,449.459991,447.350006,448.899994,271293,0,0
2020-10-14 09:32:00-04:00,449.079987,453.000000,448.399994,452.917908,630737,0,0
2020-10-14 09:33:00-04:00,452.850006,454.608490,452.609985,454.380890,474245,0,0
2020-10-14 09:34:00-04:00,454.339996,454.799988,453.324005,453.540009,402649,0,0
...,...,...,...,...,...,...,...
2020-10-15 15:55:00-04:00,448.630005,449.589996,448.510193,449.305603,87386,0,0
2020-10-15 15:56:00-04:00,449.290009,449.649994,449.170013,449.322388,77293,0,0
2020-10-15 15:57:00-04:00,449.309998,449.450012,448.760010,448.822998,61143,0,0
2020-10-15 15:58:00-04:00,448.850006,449.366302,448.790009,449.200012,97709,0,0


In [91]:
dayMinuteBars.index.to_series().diff().shift(-1)/ np.timedelta64(1, 'm')

Datetime
2020-10-14 09:30:00-04:00    1.0
2020-10-14 09:31:00-04:00    1.0
2020-10-14 09:32:00-04:00    1.0
2020-10-14 09:33:00-04:00    1.0
2020-10-14 09:34:00-04:00    1.0
                            ... 
2020-10-15 15:55:00-04:00    1.0
2020-10-15 15:56:00-04:00    1.0
2020-10-15 15:57:00-04:00    1.0
2020-10-15 15:58:00-04:00    1.0
2020-10-15 15:59:00-04:00    NaN
Name: Datetime, Length: 780, dtype: float64

In [92]:
dayMinuteBars['TimeDiff'] = dayMinuteBars.index.to_series().diff().shift(-1)/ np.timedelta64(1, 's')
dayMinuteBars

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,TimeDiff
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-10-14 09:30:00-04:00,449.779999,450.010010,448.010010,448.890015,1536522,0,0,60.0
2020-10-14 09:31:00-04:00,448.899994,449.459991,447.350006,448.899994,271293,0,0,60.0
2020-10-14 09:32:00-04:00,449.079987,453.000000,448.399994,452.917908,630737,0,0,60.0
2020-10-14 09:33:00-04:00,452.850006,454.608490,452.609985,454.380890,474245,0,0,60.0
2020-10-14 09:34:00-04:00,454.339996,454.799988,453.324005,453.540009,402649,0,0,60.0
...,...,...,...,...,...,...,...,...
2020-10-15 15:55:00-04:00,448.630005,449.589996,448.510193,449.305603,87386,0,0,60.0
2020-10-15 15:56:00-04:00,449.290009,449.649994,449.170013,449.322388,77293,0,0,60.0
2020-10-15 15:57:00-04:00,449.309998,449.450012,448.760010,448.822998,61143,0,0,60.0
2020-10-15 15:58:00-04:00,448.850006,449.366302,448.790009,449.200012,97709,0,0,60.0


In [93]:
twap = dayMinuteBars.assign(
    twap=dayMinuteBars.eval(
        'wgtd = Close * TimeDiff', inplace=False
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / TimeDiff')
).groupby(dayMinuteBars.index.date).agg({'twap':lambda x: x.iloc[-1]})
twap

Unnamed: 0,twap
2020-10-14,461.090597
2020-10-15,


#### Functionality

In [94]:
tslaData.cumsum()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-07-08,7.028000,8.504000,6.622000,8.270000,132388500,0,0.0
2010-07-13,10.506000,12.232000,10.002000,11.898000,145789000,0,0.0
2010-07-14,14.094000,16.262000,13.554000,15.866000,166765000,0,0.0
2010-07-19,18.368000,20.712000,17.738000,20.248000,179197500,0,0.0
...,...,...,...,...,...,...,...
2020-10-07,64274.180036,66470.498147,63584.836070,65790.236095,39896349600,0,5.0
2020-10-09,64704.310041,66905.088143,64011.296062,66224.236095,39925275300,0,5.0
2020-10-12,65146.310041,67353.828133,64449.876048,66666.536083,39964066400,0,5.0
2020-10-13,65589.660047,67802.718148,64886.476054,67113.186077,39998530100,0,5.0


In [95]:
tslaData['Volume'].cumsum()

Date
2010-06-29       93831500
2010-07-08      132388500
2010-07-13      145789000
2010-07-14      166765000
2010-07-19      179197500
                 ...     
2020-10-07    39896349600
2020-10-09    39925275300
2020-10-12    39964066400
2020-10-13    39998530100
2020-10-14    40046575500
Name: Volume, Length: 1285, dtype: int64

In [96]:
tslaData['CumVolume'] = tslaData['Volume'].cumsum()
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,CumVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0,93831500
2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0,132388500
2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0,145789000
2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0,166765000
2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0,179197500
...,...,...,...,...,...,...,...,...
2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0,39896349600
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0,39925275300
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0,39964066400
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0,39998530100


In [97]:
tslaData['Volume'].cummax()  # running max over the entire table, useful to calculate drawdown

Date
2010-06-29     93831500
2010-07-08     93831500
2010-07-13     93831500
2010-07-14     93831500
2010-07-19     93831500
                ...    
2020-10-07    304694000
2020-10-09    304694000
2020-10-12    304694000
2020-10-13    304694000
2020-10-14    304694000
Name: Volume, Length: 1285, dtype: int64

In [98]:
tslaData['Volume'].cummin()

Date
2010-06-29    93831500
2010-07-08    38557000
2010-07-13    13400500
2010-07-14    13400500
2010-07-19    12432500
                ...   
2020-10-07      805500
2020-10-09      805500
2020-10-12      805500
2020-10-13      805500
2020-10-14      805500
Name: Volume, Length: 1285, dtype: int64

In [99]:
tslaData['Volume'].cumprod()  # cumulative product

Date
2010-06-29               93831500
2010-07-08       3617861145500000
2010-07-13    3104854564048353152
2010-07-14     344881892615208960
2010-07-19    1384189610927128576
                     ...         
2020-10-07                      0
2020-10-09                      0
2020-10-12                      0
2020-10-13                      0
2020-10-14                      0
Name: Volume, Length: 1285, dtype: int64

In [100]:
# useful to calculate moving avg
tslaData['Volume'].rolling(10).sum() # sum, mean, median, var, std, min, max, corr, cov, skew, kurt, quantile

Date
2010-06-29            NaN
2010-07-08            NaN
2010-07-13            NaN
2010-07-14            NaN
2010-07-19            NaN
                 ...     
2020-10-07    647083200.0
2020-10-09    578710700.0
2020-10-12    545222500.0
2020-10-13    502907000.0
2020-10-14    454391300.0
Name: Volume, Length: 1285, dtype: float64

In [101]:
zscore = lambda x: (x[-1] - x.mean()) / x.std(ddof=1)
tslaData['Volume'].rolling(10).apply(zscore)

Date
2010-06-29         NaN
2010-07-08         NaN
2010-07-13         NaN
2010-07-14         NaN
2010-07-19         NaN
                ...   
2020-10-07   -1.046069
2020-10-09   -1.451049
2020-10-12   -0.783779
2020-10-13   -0.819921
2020-10-14    0.249484
Name: Volume, Length: 1285, dtype: float64

In [102]:
# Expanding is very similar to cumulative, but 'waits' (NaN) until the minimum number specified

tslaData['Volume'].expanding(10).sum() # sum, mean, median, var, std, min, max, corr, cov, skew, kurt, quantile

Date
2010-06-29             NaN
2010-07-08             NaN
2010-07-13             NaN
2010-07-14             NaN
2010-07-19             NaN
                  ...     
2020-10-07    3.989635e+10
2020-10-09    3.992528e+10
2020-10-12    3.996407e+10
2020-10-13    3.999853e+10
2020-10-14    4.004658e+10
Name: Volume, Length: 1285, dtype: float64

In [103]:
# Exponential weighted moving average
tslaData['Volume'].ewm(2).mean() # std, var, corr, cob

Date
2010-06-29    9.383150e+07
2010-07-08    6.066680e+07
2010-07-13    3.827750e+07
2010-07-14    3.109072e+07
2010-07-19    2.392809e+07
                  ...     
2020-10-07    5.053666e+07
2020-10-09    4.333300e+07
2020-10-12    4.181904e+07
2020-10-13    3.936726e+07
2020-10-14    4.225997e+07
Name: Volume, Length: 1285, dtype: float64

In [104]:
# Deltas
tslaData['OpenDelta'] = tslaData.Open.diff().shift(-1)
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,CumVolume,OpenDelta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0,93831500,-0.572000
2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0,132388500,0.250000
2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0,145789000,0.110000
2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0,166765000,0.686000
2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0,179197500,-0.174000
...,...,...,...,...,...,...,...,...,...
2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0,39896349600,10.260010
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0,39925275300,11.869995
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0,39964066400,1.350006
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0,39998530100,6.429993


In [105]:
# Deltas
tslaData['OpenDelta'] = tslaData.Open.diff() # .shift(-1)
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,CumVolume,OpenDelta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0,93831500,
2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0,132388500,-0.572000
2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0,145789000,0.250000
2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0,166765000,0.110000
2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0,179197500,0.686000
...,...,...,...,...,...,...,...,...,...
2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0,39896349600,-3.480011
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0,39925275300,10.260010
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0,39964066400,11.869995
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0,39998530100,1.350006
