### Creating a DF

In [1]:
import pandas as pd
df = pd.DataFrame({'Col1':[1,2,3,4], 'Col2':[5, 6, 7, 8]})  # Pass in a dict
df

Unnamed: 0,Col1,Col2
0,1,5
1,2,6
2,3,7
3,4,8


In [2]:
import pandas as pd
import numpy as np
arr = np.array([[1,2,3], [4,5,6]])  # ndArray
df = pd.DataFrame(arr)
df

df.columns = ['a', 'b', 'c']  # Name the column
df
df = pd.DataFrame(arr, columns=['a', 'b', 'c'])  
df
df = pd.DataFrame(arr, index=['z', 'x'], columns=['a', 'b', 'c'])  # Name indices
df

Unnamed: 0,a,b,c
z,1,2,3
x,4,5,6


In [3]:
# Renaming
df.columns = ['q', 'r', 's']  # Pass in the list, has to be the correct size
df

Unnamed: 0,q,r,s
z,1,2,3
x,4,5,6


### Importing a DF

In [4]:
import yfinance as yf
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")
tslaData.head()  # Top 5 rows

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.8,5.0,3.508,4.778,93831500,0,0.0
2010-06-30,5.158,6.084,4.66,4.766,85935500,0,0.0
2010-07-01,5.0,5.184,4.054,4.392,41094000,0,0.0
2010-07-02,4.6,4.62,3.742,3.84,25699000,0,0.0
2010-07-06,4.0,4.0,3.166,3.222,34334500,0,0.0


In [5]:
tslaData.shape  # number of rows and columns

(2599, 7)

In [6]:
tslaData.index

DatetimeIndex(['2010-06-29', '2010-06-30', '2010-07-01', '2010-07-02',
               '2010-07-06', '2010-07-07', '2010-07-08', '2010-07-09',
               '2010-07-12', '2010-07-13',
               ...
               '2020-10-09', '2020-10-12', '2020-10-13', '2020-10-14',
               '2020-10-15', '2020-10-16', '2020-10-19', '2020-10-20',
               '2020-10-21', '2020-10-22'],
              dtype='datetime64[ns]', name='Date', length=2599, freq=None)

In [7]:
tslaData.reset_index()  # Date becomes a column and index has row number

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...,...
2594,2020-10-16,454.440002,455.950012,438.850006,439.670013,32775900,0,0.0
2595,2020-10-19,446.239990,447.000000,428.869995,430.829987,36287800,0,0.0
2596,2020-10-20,431.750000,431.750000,419.049988,421.940002,31656300,0,0.0
2597,2020-10-21,422.700012,432.950012,421.250000,422.640015,32370500,0,0.0


In [8]:
tslaData.axes

[DatetimeIndex(['2010-06-29', '2010-06-30', '2010-07-01', '2010-07-02',
                '2010-07-06', '2010-07-07', '2010-07-08', '2010-07-09',
                '2010-07-12', '2010-07-13',
                ...
                '2020-10-09', '2020-10-12', '2020-10-13', '2020-10-14',
                '2020-10-15', '2020-10-16', '2020-10-19', '2020-10-20',
                '2020-10-21', '2020-10-22'],
               dtype='datetime64[ns]', name='Date', length=2599, freq=None),
 Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')]

In [9]:
display(tslaData)
print('\n\n')
display(tslaData.to_numpy())  # convert it to an ndarray

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-16,454.440002,455.950012,438.850006,439.670013,32775900,0,0.0
2020-10-19,446.239990,447.000000,428.869995,430.829987,36287800,0,0.0
2020-10-20,431.750000,431.750000,419.049988,421.940002,31656300,0,0.0
2020-10-21,422.700012,432.950012,421.250000,422.640015,32370500,0,0.0







array([[3.79999995e+00, 5.00000000e+00, 3.50799990e+00, ...,
        9.38315000e+07, 0.00000000e+00, 0.00000000e+00],
       [5.15799999e+00, 6.08400011e+00, 4.65999985e+00, ...,
        8.59355000e+07, 0.00000000e+00, 0.00000000e+00],
       [5.00000000e+00, 5.18400002e+00, 4.05399990e+00, ...,
        4.10940000e+07, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.31750000e+02, 4.31750000e+02, 4.19049988e+02, ...,
        3.16563000e+07, 0.00000000e+00, 0.00000000e+00],
       [4.22700012e+02, 4.32950012e+02, 4.21250000e+02, ...,
        3.23705000e+07, 0.00000000e+00, 0.00000000e+00],
       [4.41920013e+02, 4.45230011e+02, 4.24510010e+02, ...,
        3.98252000e+07, 0.00000000e+00, 0.00000000e+00]])

In [10]:
tslaData.describe()  # Aggregate columns with default statistics

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,2599.0,2599.0,2599.0,2599.0,2599.0,2599.0,2599.0
mean,52.00077,53.139965,50.81557,52.030163,31139840.0,0.0,0.001924
std,66.379551,68.326122,64.278536,66.440711,28845960.0,0.0,0.098077
min,3.228,3.326,2.996,3.16,592500.0,0.0,0.0
25%,7.2,7.34,7.037,7.202,10708750.0,0.0,0.0
50%,44.136002,44.844002,43.43,44.006001,24435500.0,0.0,0.0
75%,59.649,60.871,58.549,59.722,40167500.0,0.0,0.0
max,502.140015,502.48999,470.51001,498.320007,304694000.0,0.0,5.0


In [11]:
#tslaData.sort_values(by='Volume', ascending=False, kind='mergesort', axis=0)
tslaData.sort_values(by='2010-06-30', ascending=False, kind='mergesort', axis=1)  # sort the dataframe, axis=0 -> operate on row

Unnamed: 0_level_0,Volume,High,Open,Close,Low,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,93831500,5.000000,3.800000,4.778000,3.508000,0,0.0
2010-06-30,85935500,6.084000,5.158000,4.766000,4.660000,0,0.0
2010-07-01,41094000,5.184000,5.000000,4.392000,4.054000,0,0.0
2010-07-02,25699000,4.620000,4.600000,3.840000,3.742000,0,0.0
2010-07-06,34334500,4.000000,4.000000,3.222000,3.166000,0,0.0
...,...,...,...,...,...,...,...
2020-10-16,32775900,455.950012,454.440002,439.670013,438.850006,0,0.0
2020-10-19,36287800,447.000000,446.239990,430.829987,428.869995,0,0.0
2020-10-20,31656300,431.750000,431.750000,421.940002,419.049988,0,0.0
2020-10-21,32370500,432.950012,422.700012,422.640015,421.250000,0,0.0


In [12]:
tslaData.isna()  # whether or not there's a null value
tslaData.dropna()  # drop rows that has null value, useful for data cleaning

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-16,454.440002,455.950012,438.850006,439.670013,32775900,0,0.0
2020-10-19,446.239990,447.000000,428.869995,430.829987,36287800,0,0.0
2020-10-20,431.750000,431.750000,419.049988,421.940002,31656300,0,0.0
2020-10-21,422.700012,432.950012,421.250000,422.640015,32370500,0,0.0


In [13]:
#dct = tslaData.to_dict()
dct = tslaData.to_dict(orient='list')  # convert dataframe to dictionary
#display(dct)
pd.DataFrame.from_dict(dct)  # reload the dict originally from the df and recreate the df

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2594,454.440002,455.950012,438.850006,439.670013,32775900,0,0.0
2595,446.239990,447.000000,428.869995,430.829987,36287800,0,0.0
2596,431.750000,431.750000,419.049988,421.940002,31656300,0,0.0
2597,422.700012,432.950012,421.250000,422.640015,32370500,0,0.0


### Slicing

In [14]:
tslaData = tslaData.reset_index()

In [15]:
# Slicing
tslaData[2:5]



Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
2,2010-07-01,5.0,5.184,4.054,4.392,41094000,0,0.0
3,2010-07-02,4.6,4.62,3.742,3.84,25699000,0,0.0
4,2010-07-06,4.0,4.0,3.166,3.222,34334500,0,0.0


In [16]:
tslaData[-2:]  # data from the last 2 row

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
2597,2020-10-21,422.700012,432.950012,421.25,422.640015,32370500,0,0.0
2598,2020-10-22,441.920013,445.230011,424.51001,425.790009,39825200,0,0.0


In [17]:
tslaData['Open'][2]  # give 2nd value from the 'Open' column

5.0

In [18]:
tslaData['Open'][0] 

3.799999952316284

In [19]:
tslaData[['Open', 'Close']]  #see multiple columns

Unnamed: 0,Open,Close
0,3.800000,4.778000
1,5.158000,4.766000
2,5.000000,4.392000
3,4.600000,3.840000
4,4.000000,3.222000
...,...,...
2594,454.440002,439.670013
2595,446.239990,430.829987
2596,431.750000,421.940002
2597,422.700012,422.640015


In [20]:
tslaData.loc[2:5,'Close']  # see row 2:5 from 'Close' column

2    4.392
3    3.840
4    3.222
5    3.160
Name: Close, dtype: float64

In [21]:
tslaData.iloc[:,3]  # similar to loc but require a numeric index, i.e. 3 instead of 'Close'

0         3.508000
1         4.660000
2         4.054000
3         3.742000
4         3.166000
           ...    
2594    438.850006
2595    428.869995
2596    419.049988
2597    421.250000
2598    424.510010
Name: Low, Length: 2599, dtype: float64

### Filtering

In [22]:
len(tslaData)  # how many rows

2599

In [23]:
tslaData[tslaData.Volume > 1e7]  # only see row where the volume exceed 10^7

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...,...
2594,2020-10-16,454.440002,455.950012,438.850006,439.670013,32775900,0,0.0
2595,2020-10-19,446.239990,447.000000,428.869995,430.829987,36287800,0,0.0
2596,2020-10-20,431.750000,431.750000,419.049988,421.940002,31656300,0,0.0
2597,2020-10-21,422.700012,432.950012,421.250000,422.640015,32370500,0,0.0


In [24]:
tslaData[(tslaData.Volume > 1e7)&(tslaData.Open < tslaData.Close)]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
6,2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0
9,2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0
10,2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0
13,2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0
...,...,...,...,...,...,...,...,...
2587,2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0
2589,2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2591,2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0


In [25]:
tslaData.loc[lambda x: x.index % 2 == 0]  # slice using a callable (lambda)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
6,2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0
8,2010-07-12,3.590000,3.614000,3.400000,3.410000,11012500,0,0.0
...,...,...,...,...,...,...,...,...
2590,2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
2592,2020-10-14,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0
2594,2020-10-16,454.440002,455.950012,438.850006,439.670013,32775900,0,0.0
2596,2020-10-20,431.750000,431.750000,419.049988,421.940002,31656300,0,0.0


In [26]:
import pandas as pd
import yfinance as yf

## Data Manipulation

In [27]:
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [28]:
display(tslaData)

display(msftData)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2020-10-16,454.440002,455.950012,438.850006,439.670013,32775900,0,0.0
2020-10-19,446.239990,447.000000,428.869995,430.829987,36287800,0,0.0
2020-10-20,431.750000,431.750000,419.049988,421.940002,31656300,0,0.0
2020-10-21,422.700012,432.950012,421.250000,422.640015,32370500,0,0.0


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1031788800,0.0,0.0
1986-03-14,0.062055,0.065379,0.062055,0.064271,308160000,0.0,0.0
1986-03-17,0.064271,0.065933,0.064271,0.065379,133171200,0.0,0.0
1986-03-18,0.065379,0.065933,0.063163,0.063717,67766400,0.0,0.0
1986-03-19,0.063717,0.064271,0.062055,0.062609,47894400,0.0,0.0
...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26057900,0.0,0.0
2020-10-19,220.419998,222.300003,213.720001,214.220001,27625800,0.0,0.0
2020-10-20,215.800003,217.369995,213.089996,214.649994,22753500,0.0,0.0
2020-10-21,213.119995,216.919998,213.119995,214.800003,22724900,0.0,0.0


In [29]:
# reverse pivot, id_vars = what you want to hold constant. Take the column value and turn into a row
# optional: var_name and value_name to rename
melted = tslaData.reset_index().melt(id_vars=['Date'], var_name="Attribute", value_name="Value")
melted

Unnamed: 0,Date,Attribute,Value
0,2010-06-29,Open,3.800
1,2010-06-30,Open,5.158
2,2010-07-01,Open,5.000
3,2010-07-02,Open,4.600
4,2010-07-06,Open,4.000
...,...,...,...
18188,2020-10-16,Stock Splits,0.000
18189,2020-10-19,Stock Splits,0.000
18190,2020-10-20,Stock Splits,0.000
18191,2020-10-21,Stock Splits,0.000


In [30]:
pivoted = melted.pivot(index='Date', columns='Attribute')  # Reverse melt state to pivot state
pivoted = pivoted['Value'].reset_index()
pivoted.columns.name = None
pivoted

Unnamed: 0,Date,Close,Dividends,High,Low,Open,Stock Splits,Volume
0,2010-06-29,4.778000,0.0,5.000000,3.508000,3.800000,0.0,93831500.0
1,2010-06-30,4.766000,0.0,6.084000,4.660000,5.158000,0.0,85935500.0
2,2010-07-01,4.392000,0.0,5.184000,4.054000,5.000000,0.0,41094000.0
3,2010-07-02,3.840000,0.0,4.620000,3.742000,4.600000,0.0,25699000.0
4,2010-07-06,3.222000,0.0,4.000000,3.166000,4.000000,0.0,34334500.0
...,...,...,...,...,...,...,...,...
2594,2020-10-16,439.670013,0.0,455.950012,438.850006,454.440002,0.0,32775900.0
2595,2020-10-19,430.829987,0.0,447.000000,428.869995,446.239990,0.0,36287800.0
2596,2020-10-20,421.940002,0.0,431.750000,419.049988,431.750000,0.0,31656300.0
2597,2020-10-21,422.640015,0.0,432.950012,421.250000,422.700012,0.0,32370500.0


In [31]:
tslaData['Ticker'] = 'TSLA'  # can also pass in a list to assign different values for each rows.
msftData['Ticker'] = 'MSFT'
allData = pd.concat([tslaData, msftData], axis=0)  # Stack them in a single table, concat take a list of data frame. Axis = 1, stack them side by side (horizontally)
allData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0.0,0.0,TSLA
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0.0,0.0,TSLA
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0.0,0.0,TSLA
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0.0,0.0,TSLA
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0.0,0.0,TSLA
...,...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26057900,0.0,0.0,MSFT
2020-10-19,220.419998,222.300003,213.720001,214.220001,27625800,0.0,0.0,MSFT
2020-10-20,215.800003,217.369995,213.089996,214.649994,22753500,0.0,0.0,MSFT
2020-10-21,213.119995,216.919998,213.119995,214.800003,22724900,0.0,0.0,MSFT


In [70]:
import numpy as np

allData.pivot_table(index = ['Ticker'], columns=[], values=['Open', 'Close'], aggfunc=np.std) # also np.mean and others

Unnamed: 0_level_0,Close,Open
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
MSFT,37.225182,37.213377
TSLA,66.440711,66.379551


In [71]:
# Pass in the column you want to groupby and do an aggregation
allData.groupby(by=['Date', 'Ticker']).sum() # Also see as_index=False

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1986-03-13,MSFT,0.056514,0.064825,0.056514,0.062055,1031.7888,0.0,0.0,9.803927,0.005541,59.988593
1986-03-14,MSFT,0.062055,0.065379,0.062055,0.064271,308.1600,0.0,0.0,3.571430,0.002216,59.988593
1986-03-17,MSFT,0.064271,0.065933,0.064271,0.065379,133.1712,0.0,0.0,1.724131,0.001108,59.988593
1986-03-18,MSFT,0.065379,0.065933,0.063163,0.063717,67.7664,0.0,0.0,-2.542371,-0.001662,59.988593
1986-03-19,MSFT,0.063717,0.064271,0.062055,0.062609,47.8944,0.0,0.0,-1.739131,-0.001108,59.988593
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-20,TSLA,431.750000,431.750000,419.049988,421.940002,31.6563,0.0,0.0,0.000000,0.000000,31.139844
2020-10-21,MSFT,213.119995,216.919998,213.119995,214.800003,22.7249,0.0,0.0,0.788292,1.680008,59.988593
2020-10-21,TSLA,422.700012,432.950012,421.250000,422.640015,32.3705,0.0,0.0,0.000000,0.000000,31.139844
2020-10-22,MSFT,213.929993,216.059998,211.699997,214.889999,22.3341,0.0,0.0,0.448748,0.960007,59.988593


In [72]:
allData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0.0,0.0,TSLA,,,31.139844
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0.0,0.0,TSLA,,,31.139844
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0.0,0.0,TSLA,,,31.139844
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0.0,0.0,TSLA,,,31.139844
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0.0,0.0,TSLA,,,31.139844
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26.0579,0.0,0.0,MSFT,-0.222571,-0.489990,59.988593
2020-10-19,220.419998,222.300003,213.720001,214.220001,27.6258,0.0,0.0,MSFT,-2.812811,-6.199997,59.988593
2020-10-20,215.800003,217.369995,213.089996,214.649994,22.7535,0.0,0.0,MSFT,-0.532905,-1.150009,59.988593
2020-10-21,213.119995,216.919998,213.119995,214.800003,22.7249,0.0,0.0,MSFT,0.788292,1.680008,59.988593


In [74]:
# Custom aggregation
from scipy import stats
# We want to use stats.sem (standard error). Can also create your own function
#allData.groupby(by=['Ticker']).agg({'Open': stats.sem, 'Close':np.sum}) # Can customize by column as well
allData.groupby(by=['Ticker']).agg({'Close':np.sum}) # Can customize by column as well

Unnamed: 0_level_0,Close
Ticker,Unnamed: 1_level_1
MSFT,235811.042973
TSLA,135226.393926


### Joins


In [35]:
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [36]:
# taking subset of the data
tslaData = tslaData[tslaData.Open<tslaData.Close]
msftData = msftData[msftData.Open<msftData.Close]

In [37]:
# Join on the indices from the left hand table (msft in this case, right hand is tsla)
x = msftData.join(tslaData, how='left', rsuffix='_tsla', lsuffix='_msft') # left, right, outer, inner
x.dropna()  # Drop the NaN values

Unnamed: 0_level_0,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2010-07-19,19.670458,19.938405,19.631055,19.883240,38181800,0.0,0.0,4.274000,4.450000,4.184000,4.382000,12432500.0,0.0,0.0
2010-07-22,20.103902,20.482179,20.072378,20.363968,73016400,0.0,0.0,4.100000,4.250000,4.074000,4.200000,4789000.0,0.0,0.0
2010-08-02,20.482181,20.789531,20.293042,20.750128,55044600,0.0,0.0,4.100000,4.194000,4.066000,4.184000,3590500.0,0.0,0.0
2010-08-13,19.189732,19.441917,19.103043,19.229136,45263500,0.0,0.0,3.636000,3.690000,3.532000,3.664000,3170000.0,0.0,0.0
2010-08-16,19.197607,19.394627,19.150321,19.307938,40909700,0.0,0.0,3.690000,3.760000,3.652000,3.756000,2429000.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-05,207.220001,210.410004,206.979996,210.380005,21331600,0.0,0.0,423.350006,433.640015,419.329987,425.679993,44722800.0,0.0,0.0
2020-10-07,207.059998,210.110001,206.720001,209.830002,25681100,0.0,0.0,419.869995,429.899994,413.850006,425.299988,43127700.0,0.0,0.0
2020-10-09,211.229996,215.860001,211.229996,215.809998,26458000,0.0,0.0,430.130005,434.589996,426.459991,434.000000,28925700.0,0.0,0.0
2020-10-12,218.789993,223.860001,216.809998,221.399994,40461400,0.0,0.0,442.000000,448.739990,438.579987,442.299988,38791100.0,0.0,0.0


In [38]:
# Inner: intesect of data. Outer = union of data
msftData.join(tslaData, how='outer', rsuffix='_tsla', lsuffix='_msft') # left, right, outer, inner

Unnamed: 0_level_0,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1.031789e+09,0.0,0.0,,,,,,,
1986-03-14,0.062055,0.065379,0.062055,0.064271,3.081600e+08,0.0,0.0,,,,,,,
1986-03-17,0.064271,0.065933,0.064271,0.065379,1.331712e+08,0.0,0.0,,,,,,,
1986-03-25,0.057623,0.058731,0.057068,0.058731,3.208320e+07,0.0,0.0,,,,,,,
1986-03-26,0.058731,0.060947,0.058177,0.060393,2.275200e+07,0.0,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-13,222.720001,225.210007,220.429993,222.860001,2.895080e+07,0.0,0.0,443.350006,448.890015,436.600006,446.649994,34463700.0,0.0,0.0
2020-10-14,,,,,,,,449.779999,465.899994,447.350006,461.299988,48045400.0,0.0,0.0
2020-10-15,217.100006,220.360001,216.009995,219.660004,2.273310e+07,0.0,0.0,,,,,,,
2020-10-21,213.119995,216.919998,213.119995,214.800003,2.272490e+07,0.0,0.0,,,,,,,


In [39]:
msftData.reset_index().join(tslaData, on='Date', how='right', lsuffix='_tsla', rsuffix='_msft') # left, right, outer, inner

Unnamed: 0,Date,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft
2978.0,2010-07-19,19.670458,19.938405,19.631055,19.883240,38181800.0,0.0,0.0,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0
2980.0,2010-07-22,20.103902,20.482179,20.072378,20.363968,73016400.0,0.0,0.0,4.100000,4.250000,4.074000,4.200000,4789000,0,0.0
2984.0,2010-08-02,20.482181,20.789531,20.293042,20.750128,55044600.0,0.0,0.0,4.100000,4.194000,4.066000,4.184000,3590500,0,0.0
2989.0,2010-08-13,19.189732,19.441917,19.103043,19.229136,45263500.0,0.0,0.0,3.636000,3.690000,3.532000,3.664000,3170000,0,0.0
2990.0,2010-08-16,19.197607,19.394627,19.150321,19.307938,40909700.0,0.0,0.0,3.690000,3.760000,3.652000,3.756000,2429000,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,2020-09-04,,,,,,,,402.809998,428.000000,372.019989,418.320007,110321900,0,0.0
,2020-09-16,,,,,,,,439.869995,457.790009,435.309998,441.760010,72279300,0,0.0
,2020-09-29,,,,,,,,416.000000,428.500000,411.600006,419.070007,50219300,0,0.0
,2020-10-01,,,,,,,,440.760010,448.880005,434.420013,448.160004,50741500,0,0.0


In [40]:
tslaData.merge(msftData, how='left', left_on='Date', right_on='Date', suffixes=['_tsla', '_msft'])  # more flexible

Unnamed: 0_level_0,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0,,,,,,,
2010-07-08,3.228000,3.504000,3.114000,3.492000,38557000,0,0.0,,,,,,,
2010-07-13,3.478000,3.728000,3.380000,3.628000,13400500,0,0.0,,,,,,,
2010-07-14,3.588000,4.030000,3.552000,3.968000,20976000,0,0.0,,,,,,,
2010-07-19,4.274000,4.450000,4.184000,4.382000,12432500,0,0.0,19.670458,19.938405,19.631055,19.883240,38181800.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-07,419.869995,429.899994,413.850006,425.299988,43127700,0,0.0,207.059998,210.110001,206.720001,209.830002,25681100.0,0.0,0.0
2020-10-09,430.130005,434.589996,426.459991,434.000000,28925700,0,0.0,211.229996,215.860001,211.229996,215.809998,26458000.0,0.0,0.0
2020-10-12,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0,218.789993,223.860001,216.809998,221.399994,40461400.0,0.0,0.0
2020-10-13,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0,222.720001,225.210007,220.429993,222.860001,28950800.0,0.0,0.0


In [41]:
pd.merge_asof(msftData, tslaData, left_on='Date', right_on='Date', suffixes=['_msft', '_tsla'], direction='nearest')  # useful for timeseries, try to join on on the closest data

Unnamed: 0,Date,Open_msft,High_msft,Low_msft,Close_msft,Volume_msft,Dividends_msft,Stock Splits_msft,Open_tsla,High_tsla,Low_tsla,Close_tsla,Volume_tsla,Dividends_tsla,Stock Splits_tsla
0,1986-03-13,0.056514,0.064825,0.056514,0.062055,1031788800,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
1,1986-03-14,0.062055,0.065379,0.062055,0.064271,308160000,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2,1986-03-17,0.064271,0.065933,0.064271,0.065379,133171200,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
3,1986-03-25,0.057623,0.058731,0.057068,0.058731,32083200,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
4,1986-03-26,0.058731,0.060947,0.058177,0.060393,22752000,0.0,0.0,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4305,2020-10-12,218.789993,223.860001,216.809998,221.399994,40461400,0.0,0.0,442.000000,448.739990,438.579987,442.299988,38791100,0,0.0
4306,2020-10-13,222.720001,225.210007,220.429993,222.860001,28950800,0.0,0.0,443.350006,448.890015,436.600006,446.649994,34463700,0,0.0
4307,2020-10-15,217.100006,220.360001,216.009995,219.660004,22733100,0.0,0.0,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0
4308,2020-10-21,213.119995,216.919998,213.119995,214.800003,22724900,0.0,0.0,449.779999,465.899994,447.350006,461.299988,48045400,0,0.0


### MultiIndex

In [42]:
import yfinance as yf
tickers = yf.Tickers('msft aapl goog tsla')
hist = tickers.history(group_by="ticker")

[*********************100%***********************]  4 of 4 completed


In [43]:
hist.head()

Unnamed: 0_level_0,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,MSFT,MSFT,MSFT,...,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Dividends,Stock Splits,Open,High,Low,...,Volume,Dividends,Stock Splits,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-09-23,1458.780029,1460.959961,1407.699951,1415.209961,1657400,0,0,207.899994,208.100006,200.029999,...,95074200,0,0,111.620003,112.110001,106.769997,107.120003,150718700,0,0
2020-09-24,1411.030029,1443.708984,1409.849976,1428.290039,1450200,0,0,199.850006,205.570007,199.199997,...,96561100,0,0,105.169998,110.25,105.0,108.220001,167743300,0,0
2020-09-25,1432.630005,1450.0,1413.339966,1444.959961,1323000,0,0,203.550003,209.039993,202.539993,...,67208500,0,0,108.43,112.440002,107.669998,112.279999,149981400,0,0
2020-09-28,1474.209961,1476.800049,1449.301025,1464.52002,2007900,0,0,210.880005,212.570007,208.059998,...,49719600,0,0,115.010002,115.32,112.779999,114.959999,137672400,0,0
2020-09-29,1470.390015,1476.662964,1458.805054,1469.329956,978200,0,0,209.350006,210.070007,206.809998,...,50219300,0,0,114.550003,115.309998,113.57,114.089996,99382200,0,0


In [44]:
hist.columns

MultiIndex([('GOOG',         'Open'),
            ('GOOG',         'High'),
            ('GOOG',          'Low'),
            ('GOOG',        'Close'),
            ('GOOG',       'Volume'),
            ('GOOG',    'Dividends'),
            ('GOOG', 'Stock Splits'),
            ('MSFT',         'Open'),
            ('MSFT',         'High'),
            ('MSFT',          'Low'),
            ('MSFT',        'Close'),
            ('MSFT',       'Volume'),
            ('MSFT',    'Dividends'),
            ('MSFT', 'Stock Splits'),
            ('TSLA',         'Open'),
            ('TSLA',         'High'),
            ('TSLA',          'Low'),
            ('TSLA',        'Close'),
            ('TSLA',       'Volume'),
            ('TSLA',    'Dividends'),
            ('TSLA', 'Stock Splits'),
            ('AAPL',         'Open'),
            ('AAPL',         'High'),
            ('AAPL',          'Low'),
            ('AAPL',        'Close'),
            ('AAPL',       'Volume'),
            

In [45]:
hist.shape

(22, 28)

In [46]:
hist

Unnamed: 0_level_0,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,MSFT,MSFT,MSFT,...,TSLA,TSLA,TSLA,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Dividends,Stock Splits,Open,High,Low,...,Volume,Dividends,Stock Splits,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-09-23,1458.780029,1460.959961,1407.699951,1415.209961,1657400,0,0,207.899994,208.100006,200.029999,...,95074200,0,0,111.620003,112.110001,106.769997,107.120003,150718700,0,0
2020-09-24,1411.030029,1443.708984,1409.849976,1428.290039,1450200,0,0,199.850006,205.570007,199.199997,...,96561100,0,0,105.169998,110.25,105.0,108.220001,167743300,0,0
2020-09-25,1432.630005,1450.0,1413.339966,1444.959961,1323000,0,0,203.550003,209.039993,202.539993,...,67208500,0,0,108.43,112.440002,107.669998,112.279999,149981400,0,0
2020-09-28,1474.209961,1476.800049,1449.301025,1464.52002,2007900,0,0,210.880005,212.570007,208.059998,...,49719600,0,0,115.010002,115.32,112.779999,114.959999,137672400,0,0
2020-09-29,1470.390015,1476.662964,1458.805054,1469.329956,978200,0,0,209.350006,210.070007,206.809998,...,50219300,0,0,114.550003,115.309998,113.57,114.089996,99382200,0,0
2020-09-30,1466.800049,1489.75,1459.880005,1469.599976,1700600,0,0,207.729996,211.979996,206.539993,...,48145600,0,0,113.790001,117.260002,113.620003,115.809998,142675200,0,0
2020-10-01,1484.27002,1499.040039,1479.209961,1490.089966,1779500,0,0,213.490005,213.990005,211.320007,...,50741500,0,0,117.639999,117.720001,115.830002,116.790001,116120400,0,0
2020-10-02,1462.030029,1483.199951,1450.920044,1458.420044,1284100,0,0,208.0,210.990005,205.539993,...,71430000,0,0,112.889999,115.370003,112.220001,113.019997,144712000,0,0
2020-10-05,1466.209961,1488.209961,1464.27002,1486.02002,1113300,0,0,207.220001,210.410004,206.979996,...,44722800,0,0,113.910004,116.650002,113.550003,116.5,106243800,0,0
2020-10-06,1475.579956,1486.76001,1448.589966,1453.439941,1245400,0,0,208.820007,210.179993,204.820007,...,49146300,0,0,115.699997,116.120003,112.25,113.160004,161498200,0,0


In [47]:
# Stack the table
hist.stack(level=0).reset_index().rename(columns = {'level_1':'Ticker'})

# Rename column level_1

Unnamed: 0,Date,Ticker,Close,Dividends,High,Low,Open,Stock Splits,Volume
0,2020-09-23,AAPL,107.120003,0,112.110001,106.769997,111.620003,0,150718700
1,2020-09-23,GOOG,1415.209961,0,1460.959961,1407.699951,1458.780029,0,1657400
2,2020-09-23,MSFT,200.589996,0,208.100006,200.029999,207.899994,0,30803800
3,2020-09-23,TSLA,380.359985,0,412.149994,375.880005,405.160004,0,95074200
4,2020-09-24,AAPL,108.220001,0,110.250000,105.000000,105.169998,0,167743300
...,...,...,...,...,...,...,...,...,...
83,2020-10-21,TSLA,422.640015,0,432.950012,421.250000,422.700012,0,32370500
84,2020-10-22,AAPL,115.750000,0,118.040001,114.589996,117.449997,0,101709700
85,2020-10-22,GOOG,1615.329956,0,1621.989990,1585.000000,1593.050049,0,1433600
86,2020-10-22,MSFT,214.889999,0,216.059998,211.699997,213.929993,0,22334100


In [48]:
hist.stack(level=0).unstack()

Unnamed: 0_level_0,Close,Close,Close,Close,Dividends,Dividends,Dividends,Dividends,High,High,...,Open,Open,Stock Splits,Stock Splits,Stock Splits,Stock Splits,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AAPL,GOOG,MSFT,TSLA,AAPL,GOOG,MSFT,TSLA,AAPL,GOOG,...,MSFT,TSLA,AAPL,GOOG,MSFT,TSLA,AAPL,GOOG,MSFT,TSLA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-09-23,107.120003,1415.209961,200.589996,380.359985,0,0,0,0,112.110001,1460.959961,...,207.899994,405.160004,0,0,0,0,150718700,1657400,30803800,95074200
2020-09-24,108.220001,1428.290039,203.190002,387.790009,0,0,0,0,110.25,1443.708984,...,199.850006,363.799988,0,0,0,0,167743300,1450200,31202500,96561100
2020-09-25,112.279999,1444.959961,207.820007,407.339996,0,0,0,0,112.440002,1450.0,...,203.550003,393.470001,0,0,0,0,149981400,1323000,29437300,67208500
2020-09-28,114.959999,1464.52002,209.440002,421.200012,0,0,0,0,115.32,1476.800049,...,210.880005,424.619995,0,0,0,0,137672400,2007900,32004900,49719600
2020-09-29,114.089996,1469.329956,207.259995,419.070007,0,0,0,0,115.309998,1476.662964,...,209.350006,416.0,0,0,0,0,99382200,978200,24221900,50219300
2020-09-30,115.809998,1469.599976,210.330002,429.01001,0,0,0,0,117.260002,1489.75,...,207.729996,421.320007,0,0,0,0,142675200,1700600,33780700,48145600
2020-10-01,116.790001,1490.089966,212.460007,448.160004,0,0,0,0,117.720001,1499.040039,...,213.490005,440.76001,0,0,0,0,116120400,1779500,27158400,50741500
2020-10-02,113.019997,1458.420044,206.190002,415.089996,0,0,0,0,115.370003,1483.199951,...,208.0,421.390015,0,0,0,0,144712000,1284100,33154800,71430000
2020-10-05,116.5,1486.02002,210.380005,425.679993,0,0,0,0,116.650002,1488.209961,...,207.220001,423.350006,0,0,0,0,106243800,1113300,21331600,44722800
2020-10-06,113.160004,1453.439941,205.910004,413.980011,0,0,0,0,116.120003,1486.76001,...,208.820007,423.790009,0,0,0,0,161498200,1245400,28554300,49146300


## Transformations

In [49]:
import pandas as pd
import yfinance as yf
tsla = yf.Ticker("TSLA")
tslaData = tsla.history(period="max")

msft = yf.Ticker("MSFT")
msftData = msft.history(period="max")
                        


In [50]:
# Calculated columns
msftData['Daily Gain %'] = 100*(msftData['Close']-msftData['Open'])/msftData['Open'] # Approach 1 (preferred generally)
msftData = msftData.assign(Daily_Gain=lambda x:x['Close']-x['Open']) # Approach 2

msftData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1031788800,0.0,0.0,9.803927,0.005541
1986-03-14,0.062055,0.065379,0.062055,0.064271,308160000,0.0,0.0,3.571430,0.002216
1986-03-17,0.064271,0.065933,0.064271,0.065379,133171200,0.0,0.0,1.724131,0.001108
1986-03-18,0.065379,0.065933,0.063163,0.063717,67766400,0.0,0.0,-2.542371,-0.001662
1986-03-19,0.063717,0.064271,0.062055,0.062609,47894400,0.0,0.0,-1.739131,-0.001108
...,...,...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26057900,0.0,0.0,-0.222571,-0.489990
2020-10-19,220.419998,222.300003,213.720001,214.220001,27625800,0.0,0.0,-2.812811,-6.199997
2020-10-20,215.800003,217.369995,213.089996,214.649994,22753500,0.0,0.0,-0.532905,-1.150009
2020-10-21,213.119995,216.919998,213.119995,214.800003,22724900,0.0,0.0,0.788292,1.680008


In [51]:
tslaData.apply(lambda x:x*1e-6)  # scale back every column by 1e-6

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,0.000004,0.000005,0.000004,0.000005,93.8315,0.0,0.0
2010-06-30,0.000005,0.000006,0.000005,0.000005,85.9355,0.0,0.0
2010-07-01,0.000005,0.000005,0.000004,0.000004,41.0940,0.0,0.0
2010-07-02,0.000005,0.000005,0.000004,0.000004,25.6990,0.0,0.0
2010-07-06,0.000004,0.000004,0.000003,0.000003,34.3345,0.0,0.0
...,...,...,...,...,...,...,...
2020-10-16,0.000454,0.000456,0.000439,0.000440,32.7759,0.0,0.0
2020-10-19,0.000446,0.000447,0.000429,0.000431,36.2878,0.0,0.0
2020-10-20,0.000432,0.000432,0.000419,0.000422,31.6563,0.0,0.0
2020-10-21,0.000423,0.000433,0.000421,0.000423,32.3705,0.0,0.0


In [52]:
tslaData['Volume'] = tslaData['Volume'].apply(lambda x:x*1e-6)  # scale back the volume column bby 1e-6
tslaData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0,0.0
...,...,...,...,...,...,...,...
2020-10-16,454.440002,455.950012,438.850006,439.670013,32.7759,0,0.0
2020-10-19,446.239990,447.000000,428.869995,430.829987,36.2878,0,0.0
2020-10-20,431.750000,431.750000,419.049988,421.940002,31.6563,0,0.0
2020-10-21,422.700012,432.950012,421.250000,422.640015,32.3705,0,0.0


In [53]:
msftData['Volume'] = msftData['Volume'].transform(lambda x:x*1e-6)  # simple case, not much different from apply
msftData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1986-03-13,0.056514,0.064825,0.056514,0.062055,1031.7888,0.0,0.0,9.803927,0.005541
1986-03-14,0.062055,0.065379,0.062055,0.064271,308.1600,0.0,0.0,3.571430,0.002216
1986-03-17,0.064271,0.065933,0.064271,0.065379,133.1712,0.0,0.0,1.724131,0.001108
1986-03-18,0.065379,0.065933,0.063163,0.063717,67.7664,0.0,0.0,-2.542371,-0.001662
1986-03-19,0.063717,0.064271,0.062055,0.062609,47.8944,0.0,0.0,-1.739131,-0.001108
...,...,...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26.0579,0.0,0.0,-0.222571,-0.489990
2020-10-19,220.419998,222.300003,213.720001,214.220001,27.6258,0.0,0.0,-2.812811,-6.199997
2020-10-20,215.800003,217.369995,213.089996,214.649994,22.7535,0.0,0.0,-0.532905,-1.150009
2020-10-21,213.119995,216.919998,213.119995,214.800003,22.7249,0.0,0.0,0.788292,1.680008


In [54]:
tslaData['Ticker'] = 'TSLA'
msftData['Ticker'] = 'MSFT'
allData = pd.concat([tslaData, msftData], axis=0)  # combine tsla and msft data to a single dataframe


In [55]:
grouped = allData.groupby(['Ticker']).agg({'Volume': np.sum})  # sum total volume by the ticker
grouped

Unnamed: 0_level_0,Volume
Ticker,Unnamed: 1_level_1
MSFT,523460.4601
TSLA,80932.4551


In [56]:
grouped = allData.groupby(['Ticker']).transform(np.sum)  # place it back to the original table, maintain table structure
grouped

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Gain %,Daily_Gain
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-06-29,135150.002138,138110.768142,132069.666042,135226.393926,80932.4551,0.00,5.0,0.000000,0.000000
2010-06-30,135150.002138,138110.768142,132069.666042,135226.393926,80932.4551,0.00,5.0,0.000000,0.000000
2010-07-01,135150.002138,138110.768142,132069.666042,135226.393926,80932.4551,0.00,5.0,0.000000,0.000000
2010-07-02,135150.002138,138110.768142,132069.666042,135226.393926,80932.4551,0.00,5.0,0.000000,0.000000
2010-07-06,135150.002138,138110.768142,132069.666042,135226.393926,80932.4551,0.00,5.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
2020-10-16,235752.107050,238261.895755,233183.594182,235811.042973,523460.4601,19.15,17.0,597.715627,58.935924
2020-10-19,235752.107050,238261.895755,233183.594182,235811.042973,523460.4601,19.15,17.0,597.715627,58.935924
2020-10-20,235752.107050,238261.895755,233183.594182,235811.042973,523460.4601,19.15,17.0,597.715627,58.935924
2020-10-21,235752.107050,238261.895755,233183.594182,235811.042973,523460.4601,19.15,17.0,597.715627,58.935924


In [57]:
#allData['TickerAvgVolume'] = allData.groupby(['Ticker']).agg({'Volume': 'sum'}) # doesnt work
# For each ticker, what's the average volume is and put that volume back to the table
allData.join(allData.groupby(['Ticker']).agg({'Volume': 'mean'}).rename(columns={'Volume':'TickerAvgVolume'}), on='Ticker')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0.0,0.0,TSLA,,,31.139844
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0.0,0.0,TSLA,,,31.139844
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0.0,0.0,TSLA,,,31.139844
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0.0,0.0,TSLA,,,31.139844
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0.0,0.0,TSLA,,,31.139844
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26.0579,0.0,0.0,MSFT,-0.222571,-0.489990,59.988593
2020-10-19,220.419998,222.300003,213.720001,214.220001,27.6258,0.0,0.0,MSFT,-2.812811,-6.199997,59.988593
2020-10-20,215.800003,217.369995,213.089996,214.649994,22.7535,0.0,0.0,MSFT,-0.532905,-1.150009,59.988593
2020-10-21,213.119995,216.919998,213.119995,214.800003,22.7249,0.0,0.0,MSFT,0.788292,1.680008,59.988593


In [58]:
allData.join(allData.groupby(['Ticker']).agg({'Volume': 'mean'}).rename(columns={'Volume':'TickerAvgVolume'}), on='Ticker')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0.0,0.0,TSLA,,,31.139844
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0.0,0.0,TSLA,,,31.139844
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0.0,0.0,TSLA,,,31.139844
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0.0,0.0,TSLA,,,31.139844
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0.0,0.0,TSLA,,,31.139844
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26.0579,0.0,0.0,MSFT,-0.222571,-0.489990,59.988593
2020-10-19,220.419998,222.300003,213.720001,214.220001,27.6258,0.0,0.0,MSFT,-2.812811,-6.199997,59.988593
2020-10-20,215.800003,217.369995,213.089996,214.649994,22.7535,0.0,0.0,MSFT,-0.532905,-1.150009,59.988593
2020-10-21,213.119995,216.919998,213.119995,214.800003,22.7249,0.0,0.0,MSFT,0.788292,1.680008,59.988593


In [59]:
allData.groupby(['Ticker']).transform('mean')[['Volume']] # vs agg

Unnamed: 0_level_0,Volume
Date,Unnamed: 1_level_1
2010-06-29,31.139844
2010-06-30,31.139844
2010-07-01,31.139844
2010-07-02,31.139844
2010-07-06,31.139844
...,...
2020-10-16,59.988593
2020-10-19,59.988593
2020-10-20,59.988593
2020-10-21,59.988593


In [60]:
# Cleaner!
allData['TickerAvgVolume'] = allData.groupby(['Ticker']).transform('mean')[['Volume']].rename(columns={'Volume':'TickerAvgVolume'})
allData

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily Gain %,Daily_Gain,TickerAvgVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93.8315,0.0,0.0,TSLA,,,31.139844
2010-06-30,5.158000,6.084000,4.660000,4.766000,85.9355,0.0,0.0,TSLA,,,31.139844
2010-07-01,5.000000,5.184000,4.054000,4.392000,41.0940,0.0,0.0,TSLA,,,31.139844
2010-07-02,4.600000,4.620000,3.742000,3.840000,25.6990,0.0,0.0,TSLA,,,31.139844
2010-07-06,4.000000,4.000000,3.166000,3.222000,34.3345,0.0,0.0,TSLA,,,31.139844
...,...,...,...,...,...,...,...,...,...,...,...
2020-10-16,220.149994,222.289993,219.320007,219.660004,26.0579,0.0,0.0,MSFT,-0.222571,-0.489990,59.988593
2020-10-19,220.419998,222.300003,213.720001,214.220001,27.6258,0.0,0.0,MSFT,-2.812811,-6.199997,59.988593
2020-10-20,215.800003,217.369995,213.089996,214.649994,22.7535,0.0,0.0,MSFT,-0.532905,-1.150009,59.988593
2020-10-21,213.119995,216.919998,213.119995,214.800003,22.7249,0.0,0.0,MSFT,0.788292,1.680008,59.988593


### Time Series

#### Timestamps

In [61]:
# Using datetime
import datetime
import pytz

dateToConvert = datetime.datetime(2020,7,1,15,31,24) # This is a 'naive' time without timezone
dateToConvert

datetime.datetime(2020, 7, 1, 15, 31, 24)

In [62]:
dateToConvert.astimezone(pytz.UTC) # Set to UTC, Time stays the same
#dateToConvert.astimezone(pytz.timezone('US/Eastern')) # Set to Eastern, Time stays the same

datetime.datetime(2020, 7, 1, 22, 31, 24, tzinfo=<UTC>)

In [63]:
pytz.timezone('US/Eastern').localize(dateToConvert)

datetime.datetime(2020, 7, 1, 15, 31, 24, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>)

In [64]:
#dateToConvert = dateToConvert.astimezone(pytz.UTC)
# localize take the original datetime and localize to the specified timezone
pytz.timezone('America/Chicago').localize(dateToConvert).astimezone(pytz.UTC) # America/Chicago

datetime.datetime(2020, 7, 1, 20, 31, 24, tzinfo=<UTC>)

In [65]:
start = pd.to_datetime('2015-01-01')  # convert string to an np.datetime64
end = pd.to_datetime('2018-01-01')

def random_dates(start, end, n=10):

    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

dates = random_dates(start, end, 1000000)
pyDates = list(dates.to_pydatetime())


In [66]:
pyDates

[datetime.datetime(2016, 12, 3, 8, 0, 4),
 datetime.datetime(2016, 11, 28, 20, 55, 36),
 datetime.datetime(2017, 5, 19, 11, 2, 58),
 datetime.datetime(2016, 4, 1, 13, 9, 3),
 datetime.datetime(2015, 9, 3, 18, 53, 27),
 datetime.datetime(2015, 6, 4, 7, 0, 31),
 datetime.datetime(2016, 4, 21, 7, 16, 5),
 datetime.datetime(2016, 1, 13, 15, 2, 33),
 datetime.datetime(2016, 7, 20, 20, 51, 16),
 datetime.datetime(2015, 12, 16, 7, 19, 44),
 datetime.datetime(2015, 4, 20, 4, 59, 54),
 datetime.datetime(2015, 5, 6, 23, 15, 9),
 datetime.datetime(2016, 12, 3, 16, 46),
 datetime.datetime(2015, 11, 6, 0, 36, 39),
 datetime.datetime(2017, 10, 5, 3, 23, 37),
 datetime.datetime(2016, 12, 22, 1, 59, 13),
 datetime.datetime(2017, 2, 22, 21, 24, 18),
 datetime.datetime(2015, 5, 27, 9, 10, 46),
 datetime.datetime(2017, 9, 16, 18, 24, 11),
 datetime.datetime(2015, 1, 28, 3, 41, 38),
 datetime.datetime(2017, 1, 3, 11, 6, 44),
 datetime.datetime(2017, 1, 6, 7, 42, 25),
 datetime.datetime(2017, 1, 28, 9, 31,

In [67]:
from timer import Timer

with Timer('ConvertDatesPy') as t:
    res = [pytz.timezone('US/Eastern').localize(r).astimezone(pytz.UTC) for r in pyDates]

with Timer('ConvertDatesNP') as t:
    res = dates.tz_localize('US/Eastern', nonexistent='shift_forward', ambiguous='NaT').tz_convert('UTC')  # if unclear, shift forward, if not a datetime, return NaT i.e. Not a Time

ImportError: cannot import name 'Timer' from 'timer' (D:\anaconda3\lib\site-packages\win32\timer.pyd)

In [None]:
dates

In [None]:
res

In [None]:
# Add/Subtract business day
pd.to_datetime('2020.01.03') + pd.offsets.BDay()

In [None]:
# Date ranges
pd.date_range('1/1/2010', periods=10, freq=2 * pd.offsets.BDay())

In [None]:
#asfreq

In [None]:
dayMinuteBars = tsla.history(period='2d', interval='1m')

In [None]:
dayMinuteBars

In [None]:
# volume weighted average price
def vwap(bars): 
    return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)

vwap(dayMinuteBars[dayMinuteBars.index.date==pd.to_datetime('2020-07-15')])

In [None]:
dayMinuteBars.eval(
        'wgtd = Close * Volume', inplace=False
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / Volume')

In [None]:
vwap = dayMinuteBars.assign(  # create a calculator column
    vwap=dayMinuteBars.eval(
        'wgtd = Close * Volume', inplace=False  # interim column
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / Volume')
).groupby(dayMinuteBars.index.date).agg({'vwap':lambda x: x.iloc[-1]})
vwap

In [None]:
dayMinuteBars

In [None]:
dayMinuteBars.index.to_series().diff().shift(-1)/ np.timedelta64(1, 'm')

In [None]:
dayMinuteBars['TimeDiff'] = dayMinuteBars.index.to_series().diff().shift(-1)/ np.timedelta64(1, 's')
dayMinuteBars

In [None]:
twap = dayMinuteBars.assign(
    twap=dayMinuteBars.eval(
        'wgtd = Close * TimeDiff', inplace=False
    ).groupby(dayMinuteBars.index.date).cumsum().eval('wgtd / TimeDiff')
).groupby(dayMinuteBars.index.date).agg({'twap':lambda x: x.iloc[-1]})
twap

#### Functionality

In [None]:
tslaData.cumsum()

In [None]:
tslaData['Volume'].cumsum()

In [None]:
tslaData['CumVolume'] = tslaData['Volume'].cumsum()
tslaData

In [None]:
tslaData['Volume'].cummax()  # running max over the entire table, useful to calculate drawdown

In [None]:
tslaData['Volume'].cummin()

In [None]:
tslaData['Volume'].cumprod()  # cumulative product

In [None]:
# useful to calculate moving avg
tslaData['Volume'].rolling(10).sum() # sum, mean, median, var, std, min, max, corr, cov, skew, kurt, quantile

In [None]:
zscore = lambda x: (x[-1] - x.mean()) / x.std(ddof=1)
tslaData['Volume'].rolling(10).apply(zscore)

In [None]:
# Expanding is very similar to cumulative, but 'waits' (NaN) until the minimum number specified

tslaData['Volume'].expanding(10).sum() # sum, mean, median, var, std, min, max, corr, cov, skew, kurt, quantile

In [None]:
# Exponential weighted moving average
tslaData['Volume'].ewm(2).mean() # std, var, corr, cob

In [None]:
# Deltas
tslaData['OpenDelta'] = tslaData.Open.diff().shift(-1)
tslaData

In [None]:
# Deltas
tslaData['OpenDelta'] = tslaData.Open.diff() # .shift(-1)
tslaData