### FINA 4380 with Marius Popescu

## Pandas - Part II

In [4]:
# Toy example: the list is extended and modified while it is being looped over.
lst2 = [0]
for pos in range(5):
    # Grow the list by one element and show the result.
    lst2 += [pos + 1]
    print(lst2)
    # Copy the element that was just appended (now the last one) into slot `pos`.
    lst2[pos] = lst2[-1]
    print(lst2)
lst2

[0, 1]
[1, 1]
[1, 1, 2]
[1, 2, 2]
[1, 2, 2, 3]
[1, 2, 3, 3]
[1, 2, 3, 3, 4]
[1, 2, 3, 4, 4]
[1, 2, 3, 4, 4, 5]
[1, 2, 3, 4, 5, 5]


[1, 2, 3, 4, 5, 5]

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read only the five columns used in this notebook; 'date' becomes the row index
# and pandas is asked to parse that index as dates.
bank_df = pd.read_csv(
    'bank_data.csv',
    usecols=['date', 'TICKER', 'PRC', 'RET', 'VOL'],
    index_col='date',
    parse_dates=True,
)
bank_df.head()

Unnamed: 0_level_0,TICKER,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-03,WFC,56.0,22063790,0.01615
2017-01-04,WFC,56.05,15036148,0.000893
2017-01-05,WFC,55.18,18831628,-0.015522
2017-01-06,WFC,55.04,18715908,-0.002537
2017-01-09,WFC,54.24,21794061,-0.014535


In [29]:
# Build a second copy of the data with a two-level (TICKER, date) row index:
# 'date' is first moved back into the columns, then both columns become the index.
bank_df2 = (
    bank_df
    .reset_index()
    .set_index(['TICKER', 'date'])
)
bank_df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PRC,VOL,RET
TICKER,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WFC,2017-01-03,56.0,22063790,0.01615
WFC,2017-01-04,56.05,15036148,0.000893
WFC,2017-01-05,55.18,18831628,-0.015522
WFC,2017-01-06,55.04,18715908,-0.002537
WFC,2017-01-09,54.24,21794061,-0.014535


### 1. Sorting Pandas DataFrames

#### We can use the `df_name.sort_index()` method to sort the DataFrame by its index. The default is `axis = 0`, which sorts the rows by their index labels.

In [28]:
# sort_index() returns a new DataFrame ordered by the row labels; bank_df itself is not modified.
# ascending=True is the default and is spelled out here only for clarity.
# Full (long) result: bank_df.sort_index()
bank_df.sort_index(ascending=True).head()

Unnamed: 0_level_0,TICKER,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-03,WFC,56.0,22063790,0.01615
2017-01-03,JPM,87.23,20550714,0.010894
2017-01-03,C,60.59,18322398,0.019519
2017-01-03,USB,51.46,7859031,0.001752
2017-01-03,BAC,22.53,99298080,0.019457


In [13]:
# The same method applied to the MultiIndex version of the data: rows are ordered
# by the TICKER level first and then by date (the output below starts with BAC).
#bank_df2.sort_index()
bank_df2.sort_index().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PRC,VOL,RET
TICKER,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BAC,2017-01-03,22.53,99298080,0.019457
BAC,2017-01-04,22.95,76875052,0.018642
BAC,2017-01-05,22.68,86826447,-0.011765
BAC,2017-01-06,22.68,66281476,0.0
BAC,2017-01-09,22.55,75901509,-0.005732


#### We can use the `df_name.sort_values()` method to sort the DataFrame by its values in one or multiple columns.

In [14]:
# sort_values() returns a new DataFrame ordered by the 'TICKER' column (ascending by default);
# bank_df itself is not modified.
# Full (long) result: bank_df.sort_values(by = 'TICKER')
bank_df.sort_values('TICKER').head()

Unnamed: 0_level_0,TICKER,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-29,BAC,29.52,40672012,-0.009396
2018-05-15,BAC,31.22,64433546,0.003213
2018-05-14,BAC,31.12,41791621,0.006468
2018-05-11,BAC,30.92,42024768,0.000971
2018-05-10,BAC,30.89,54773215,0.005534


### 2. Other Commonly Used Methods with Series and DataFrames

#### We can use the `series_name.unique()` method to return a NumPy array with the unique values in the series.

In [71]:
# Distinct ticker symbols found in the TICKER column.
bank_df['TICKER'].unique()

array(['WFC', 'JPM', 'BAC', 'USB', 'C'], dtype=object)

#### We can use the `series_name.nunique()` method to return the number of unique values in a Series. Missing values will be dropped from the computation.

In [72]:
# How many distinct ticker symbols the TICKER column contains.
bank_df['TICKER'].nunique()

5

#### We can use the `series_name.value_counts()` method to return a Series containing the number of times each value is present. Missing values will be dropped from the computation.

In [73]:
# Number of rows recorded for each ticker symbol.
bank_df['TICKER'].value_counts()

WFC    502
JPM    502
BAC    502
USB    502
C      502
Name: TICKER, dtype: int64

#### We can use the `series_name.count()` method to return the number of non-missing (non-NaN) values in a Series. In the case of a DataFrame, the method returns the number of non-missing values in each column (all columns by default, not only the numeric ones).

In [74]:
# Number of non-missing entries in the TICKER column.
bank_df['TICKER'].count()

2510

#### We can use either `obj_name.isnull().sum()` or `obj_name.isna().sum()` to count the number of missing values in a Series or DataFrame.

In [77]:
# Missing values per column. isnull() is an alias of isna(), so the commented
# line is an equivalent spelling.
# NOTE(review): the saved output lists a D_VOL column, which is only created in
# section 4 further down, so this cell was last run after those cells. On a
# top-to-bottom run D_VOL will not appear here.
#bank_df.isnull().sum()
bank_df.isna().sum()

TICKER    0
PRC       0
VOL       0
RET       0
D_VOL     0
dtype: int64

In [78]:
# Same count for a single column (a Series), which gives one number instead of
# one number per column. The commented line is the isnull() spelling.
#bank_df['TICKER'].isnull().sum()
bank_df['TICKER'].isna().sum()

0

### 3. Selecting Data from Pandas Series and DataFrames

#### *Sorting and Selecting Data from a Series*

In [60]:
# Keep the TICKER column as a stand-alone Series (indexed by date) for the
# examples in this section.
tickers = bank_df['TICKER']
tickers.head()

date
2017-01-03    WFC
2017-01-04    WFC
2017-01-05    WFC
2017-01-06    WFC
2017-01-09    WFC
Name: TICKER, dtype: object

##### We can also use the `.sort_index()` and `.sort_values()` methods to sort a Series.

In [38]:
# Order the Series by its index labels (the dates); tickers itself is not modified.
#tickers.sort_index()
tickers.sort_index().head()

date
2017-01-03    WFC
2017-01-03    JPM
2017-01-03      C
2017-01-03    USB
2017-01-03    BAC
Name: TICKER, dtype: object

In [40]:
# Order the Series by its values (the ticker symbols); tickers itself is not modified.
#tickers.sort_values()
tickers.sort_values().head()

date
2017-12-29    BAC
2018-05-15    BAC
2018-05-14    BAC
2018-05-11    BAC
2018-05-10    BAC
Name: TICKER, dtype: object

In [49]:
# Label-based selection from a Series: every entry whose index label is 2018-12-27.
# The same date appears once per ticker in the index, so the result is itself a Series.
tickers.loc['2018-12-27']

date
2018-12-27    WFC
2018-12-27    JPM
2018-12-27    BAC
2018-12-27    USB
2018-12-27      C
Name: TICKER, dtype: object

#### *Selecting Columns from a DataFrame*

In [15]:
# Selecting one column as a Series: a single column label inside one pair of brackets.
#bank_df['TICKER']
bank_df['TICKER'].head()

date
2017-01-03    WFC
2017-01-04    WFC
2017-01-05    WFC
2017-01-06    WFC
2017-01-09    WFC
Name: TICKER, dtype: object

In [17]:
# Selecting one column as a DataFrame: the label is wrapped in a list
# (note the double brackets), so the result keeps its column header.
#bank_df[['TICKER']]
bank_df[['TICKER']].head()

Unnamed: 0_level_0,TICKER
date,Unnamed: 1_level_1
2017-01-03,WFC
2017-01-04,WFC
2017-01-05,WFC
2017-01-06,WFC
2017-01-09,WFC


In [18]:
# Selecting multiple columns requires passing a list of column names;
# the result is a DataFrame with the columns in the order listed.
#bank_df[['TICKER', 'RET']]
bank_df[['TICKER', 'RET']].head()

Unnamed: 0_level_0,TICKER,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-03,WFC,0.01615
2017-01-04,WFC,0.000893
2017-01-05,WFC,-0.015522
2017-01-06,WFC,-0.002537
2017-01-09,WFC,-0.014535


#### *Selecting Rows from Single Index DataFrames*

In [19]:
# Selecting rows using index labels: all rows whose date label is 2018-12-31
# (one row per ticker in the output below).
# The commented lines spell the same date in other string formats.
bank_df.loc['2018-12-31']
#bank_df.loc['20181231']
#bank_df.loc['2018/12/31']

Unnamed: 0_level_0,TICKER,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-12-31,WFC,46.08,16544485,0.006553
2018-12-31,JPM,97.62,13237180,0.008159
2018-12-31,BAC,24.64,56257342,0.01025
2018-12-31,USB,45.7,7388314,0.010838
2018-12-31,C,52.06,19317506,0.004438


In [20]:
# Label slice from 2017-01-01 to 2017-12-31 on a date-sorted copy of the data.
# The commented lines show the same slice without sorting first, and the
# sorted slice without the final head().
#bank_df.loc['20170101':'20171231']
#bank_df.sort_index().loc['20170101':'20171231']
(
    bank_df
    .sort_index()
    .loc['20170101':'20171231']
    .head()
)

Unnamed: 0_level_0,TICKER,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-03,WFC,56.0,22063790,0.01615
2017-01-03,JPM,87.23,20550714,0.010894
2017-01-03,C,60.59,18322398,0.019519
2017-01-03,USB,51.46,7859031,0.001752
2017-01-03,BAC,22.53,99298080,0.019457


In [21]:
# Partial-string selection: a year-month or a year string in place of a full date.
# Both examples are commented out, so this cell produces no output as written.
#bank_df.loc['2018-12']
#bank_df.loc['2018']

#### *Selecting Rows from Multi Index DataFrames*

##### Method 1

In [24]:
# Passing a label of the outer index level (TICKER) to .loc returns the associated
# sub-DataFrame, which is indexed by the remaining level (date).
#bank_df2.loc['BAC']
bank_df2.loc['BAC'].head()

Unnamed: 0_level_0,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-03,22.53,99298080,0.019457
2017-01-04,22.95,76875052,0.018642
2017-01-05,22.68,86826447,-0.011765
2017-01-06,22.68,66281476,0.0
2017-01-09,22.55,75901509,-0.005732


In [25]:
# Two-step selection: first the BAC sub-DataFrame, then a single date inside it.
# The result is one row, returned as a Series of its column values.
# Alternative with both labels in a single .loc call:
#bank_df2.loc['BAC','2018-12-24']
(
    bank_df2
    .loc['BAC']
    .loc['2018-12-24']
)

PRC    2.273000e+01
VOL    6.485906e+07
RET   -2.738600e-02
Name: 2018-12-24 00:00:00, dtype: float64

##### Method 2 - Use the `.xs()` method

In [150]:
# .xs() with a label and the level it belongs to returns the associated sub-DataFrame.
# The level can be given by name or by position (TICKER is level 0):
#bank_df2.xs('BAC', level = 0)
#bank_df2.xs('BAC', level = 'TICKER')
bank_df2.xs('BAC', level='TICKER').head()

Unnamed: 0_level_0,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-03,22.53,99298080,0.019457
2017-01-04,22.95,76875052,0.018642
2017-01-05,22.68,86826447,-0.011765
2017-01-06,22.68,66281476,0.0
2017-01-09,22.55,75901509,-0.005732


In [31]:
# One label per index level, so a single (TICKER, date) combination is selected.
# Same call with the levels given by position:
#bank_df2.xs(('BAC','2018-12-24'), level = [0,1])
bank_df2.xs(('BAC', '2018-12-24'), level=['TICKER', 'date'])

Unnamed: 0_level_0,Unnamed: 1_level_0,PRC,VOL,RET
TICKER,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BAC,2018-12-24,22.73,64859064,-0.027386


##### In contrast to the first method, the second method allows us to easily select data based on the inner index level (here, `date`).

In [32]:
# Cross-section on the inner level: every ticker's row for 2018-12-24.
# Same call with the level given by position (date is level 1):
#bank_df2.xs('2018-12-24', level = 1)
bank_df2.xs('2018-12-24', level='date')

Unnamed: 0_level_0,PRC,VOL,RET
TICKER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
WFC,43.6,18049856,-0.033688
JPM,92.14,17009280,-0.021557
BAC,22.73,64859064,-0.027386
USB,43.76,6830457,-0.019054
C,49.26,21970833,-0.019506


### 4. Adding, Changing the Name and Removing Columns from a DataFrame

In [64]:
# Creating a new column based on current columns: the element-wise (row-by-row)
# product of PRC and VOL is stored in bank_df under the new label 'DOLLAR_VOL'.
# This assignment modifies bank_df itself.
bank_df['DOLLAR_VOL'] = bank_df['PRC']*bank_df['VOL']

In [65]:
# Check the result: DOLLAR_VOL now appears as the last column.
bank_df.head()

Unnamed: 0_level_0,TICKER,PRC,VOL,RET,DOLLAR_VOL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-03,WFC,56.0,22063790,0.01615,1235572000.0
2017-01-04,WFC,56.05,15036148,0.000893,842776100.0
2017-01-05,WFC,55.18,18831628,-0.015522,1039129000.0
2017-01-06,WFC,55.04,18715908,-0.002537,1030124000.0
2017-01-09,WFC,54.24,21794061,-0.014535,1182110000.0


#### We can use the `df_name.rename()` method to change the name of one or multiple columns in a DataFrame.

In [69]:
# Rename the DOLLAR_VOL column to D_VOL. rename() returns a new DataFrame, which is
# bound back to the name bank_df rather than mutating the existing object with
# inplace=True; `columns=` states the target axis directly instead of axis=1.
# Labels in the mapping that are not found are ignored by default, so re-running
# this cell after the rename has happened is harmless.
bank_df = bank_df.rename(columns={'DOLLAR_VOL': 'D_VOL'})
bank_df.head()

Unnamed: 0_level_0,TICKER,PRC,VOL,RET,D_VOL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-03,WFC,56.0,22063790,0.01615,1235572000.0
2017-01-04,WFC,56.05,15036148,0.000893,842776100.0
2017-01-05,WFC,55.18,18831628,-0.015522,1039129000.0
2017-01-06,WFC,55.04,18715908,-0.002537,1030124000.0
2017-01-09,WFC,54.24,21794061,-0.014535,1182110000.0


#### We can use the `df_name.drop(column_label,axis=1)` method to remove one or more columns. As seen below, the removal is not permanent (not in place).

In [70]:
# drop() with axis = 1 returns a new DataFrame without the D_VOL column.
# The result is only displayed, not assigned, so bank_df keeps the column.
#bank_df.drop('D_VOL', axis = 1)
bank_df.drop('D_VOL', axis = 1).head()

Unnamed: 0_level_0,TICKER,PRC,VOL,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-03,WFC,56.0,22063790,0.01615
2017-01-04,WFC,56.05,15036148,0.000893
2017-01-05,WFC,55.18,18831628,-0.015522
2017-01-06,WFC,55.04,18715908,-0.002537
2017-01-09,WFC,54.24,21794061,-0.014535
