In [2]:
# References: 
# 10 minutes to Pandas: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html
# Docs - https://pandas.pydata.org/pandas-docs/stable/reference/index.html
# Datacamp cheat sheet: http://datacamp-community-prod.s3.amazonaws.com/dbed353d-2757-4617-8206-8767ab379ab3

In [3]:
# Imports
import pandas as pd
import numpy as np
#path = 'C:\\Users\\K\\AIDeepDiveMaterials\\' 
path = 'C:\\Users\\K\\AIDeepDiveMaterials\\' # Replace with the folder containing the materials for this class
pd.__version__ 

'0.25.0'

In [4]:
# Pandas
# Tabular and human-friendly (excel/SQL-like interface)
# Homogeneous or heterogeneous data
# Row and column labels!
# Similarities to R's data.frame...but with more

In [5]:
# Efficiency gains vs. base python

#pandas_fun = pd.Series(np.arange(10000))

#% timeit pandas_fun*500 # microseconds (us) vs. milliseconds (ms)```

In [6]:
# Pandas Series & DataFrames (2 primary data structures)
# https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#dsintro

my_series = pd.Series([1,2,np.nan,5,4])

print(my_series)
print(type(my_series))

0    1.0
1    2.0
2    NaN
3    5.0
4    4.0
dtype: float64
<class 'pandas.core.series.Series'>


In [7]:
my_series.values # converts to numpy array

array([ 1.,  2., nan,  5.,  4.])

In [8]:
# DataFrame 

my_df = pd.DataFrame(np.random.randn(6,4),columns=list('ABCD')) # created from a 2D numpy array, could use list of lists

print()
print(my_df)
print(type(my_df))


          A         B         C         D
0  0.732648 -1.156832  0.365486 -1.159664
1 -0.234164 -1.866204 -0.790839  0.903253
2  1.076773  1.786251 -1.430559 -1.030840
3  0.091828  1.934300  2.144404  0.176385
4 -0.161901 -0.575755 -1.558379  0.605511
5  0.964239 -1.216262 -0.422605  1.135774
<class 'pandas.core.frame.DataFrame'>


In [9]:
my_df.values # will convert to numpy array regardless of how created

array([[ 0.73264818, -1.15683249,  0.36548564, -1.15966417],
       [-0.23416434, -1.86620364, -0.79083889,  0.90325349],
       [ 1.07677338,  1.78625123, -1.43055853, -1.03084003],
       [ 0.09182831,  1.93430049,  2.14440388,  0.17638525],
       [-0.16190125, -0.57575482, -1.55837914,  0.60551089],
       [ 0.9642389 , -1.21626179, -0.42260484,  1.13577352]])

In [10]:
print(my_df['A'])
type(my_df['A']) # an individual column of a dataframe is a pandas series

0    0.732648
1   -0.234164
2    1.076773
3    0.091828
4   -0.161901
5    0.964239
Name: A, dtype: float64


pandas.core.series.Series

In [11]:
# Index - row labels
print(my_series.index)

my_df.index

RangeIndex(start=0, stop=5, step=1)


RangeIndex(start=0, stop=6, step=1)

In [12]:
print(my_df.index.values)

[0 1 2 3 4 5]


In [13]:
# Indicating indices on creation
my_series=pd.Series([1,2,np.nan,5,4], index=list('ABCDE'))
print(my_series)

my_df=pd.DataFrame(np.random.randn(6,4),columns=list('ABCD'), index=list('ABCDEF'))
print()
print(my_df)

A    1.0
B    2.0
C    NaN
D    5.0
E    4.0
dtype: float64

          A         B         C         D
A -0.914360 -0.091910 -1.738797  2.503396
B -0.112808 -0.303286  0.303880  1.032969
C -0.377410  2.880770  0.871770  0.653901
D  1.537640  0.588826 -0.147002  0.217592
E  0.530197  1.105229 -2.137768 -0.246727
F -0.866567 -0.196847 -0.226044  0.054202


In [14]:
# Columns
my_df.columns # [0,..,3] if automatically assigned

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
my_series.name # returns none - unnamed series

In [16]:
my_series.name='MyFavouriteSeries'
my_series.name

'MyFavouriteSeries'

In [17]:
# Alternative ways to create a DataFrame
# Create a dataframe from a dictionary (or a json file)
keys=list('ABCD')
vals= np.random.randn(6,4)
my_dict = dict(zip(keys, vals))
print(my_dict)
print()

print(pd.DataFrame(my_dict))

{'A': array([-0.24920851,  0.50085561, -1.48515837,  0.29374407]), 'B': array([-0.5131129 ,  0.95392831,  0.33345521, -0.83164472]), 'C': array([ 0.65834035, -0.76329906, -0.72936957,  0.153754  ]), 'D': array([ 0.88668524, -0.02555665,  1.20627158, -0.99409099])}

          A         B         C         D
0 -0.249209 -0.513113  0.658340  0.886685
1  0.500856  0.953928 -0.763299 -0.025557
2 -1.485158  0.333455 -0.729370  1.206272
3  0.293744 -0.831645  0.153754 -0.994091


In [18]:
# SQL database: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html
# Interfaces well with excel, accomodates multiple sheets: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

In [19]:
# Read-in from csv
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
df = pd.read_csv(path+'SPTSXComposite.csv') # path defined in cell 2

In [20]:
# Viewing DataFrames
df.shape # (rows, columns)

(132, 6)

In [21]:
df.head() # View first 5 rows

Unnamed: 0,Name,Ticker,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
0,Agnico Eagle Mines Limited (NYSE:AEM),NYSE:AEM,10715.5,2990.0,5.0,Materials
1,Air Canada (TSX:AC),TSX:AC,19197.0,18065.0,5.0,Industrials
2,Alamos Gold Inc. (TSX:AGI),TSX:AGI,4455.5,889.4,4.0,Materials
3,Algonquin Power & Utilities Corp. (TSX:AQN),TSX:AQN,12811.6,2247.9,2.0,Utilities
4,Alimentation Couche-Tard Inc. (TSX:ATD.B),TSX:ATD.B,30426.3,79562.8,3.0,Consumer Staples


In [22]:
df.tail() # Last 5

Unnamed: 0,Name,Ticker,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
127,West Fraser Timber Co. Ltd. (TSX:WFT),TSX:WFT,4791.0,6118.0,5.0,Materials
128,WestJet Airlines Ltd. (TSX:WJA),TSX:WJA,6758.1,4728.4,4.0,Industrials
129,Wheaton Precious Metals Corp. (TSX:WPM),TSX:WPM,8828.6,1083.5,9.0,Materials
130,WSP Global Inc. (TSX:WSP),TSX:WSP,7766.6,7908.1,8.0,Industrials
131,Yamana Gold Inc. (TSX:YRI),TSX:YRI,10933.9,2454.1,5.0,Materials


In [23]:
# Column names
df.columns

Index(['Name', 'Ticker', 'TotalAssets', 'TotalRevenue', 'GeographicSegments',
       'PrimarySector'],
      dtype='object')

In [24]:
# Current index
df.index

RangeIndex(start=0, stop=132, step=1)

In [25]:
# Aside - index.values = numpy array
df.index.values[:5]

array([0, 1, 2, 3, 4], dtype=int64)

In [26]:
# Set new index
df.Ticker.is_unique    # Check series for uniqueness

True

In [27]:
df = df.set_index('Ticker')  # Set ticker as new index
print(df.index)

Index(['NYSE:AEM', 'TSX:AC', 'TSX:AGI', 'TSX:AQN', 'TSX:ATD.B', 'TSX:AP.UN',
       'TSX:ALA', 'TSX:ACO.X', 'TSX:ACB', 'TSX:BTO',
       ...
       'TSX:TIH', 'TSX:TOU', 'TSX:RNW', 'TSX:VET', 'NYSE:WCN', 'TSX:WFT',
       'TSX:WJA', 'TSX:WPM', 'TSX:WSP', 'TSX:YRI'],
      dtype='object', name='Ticker', length=132)


In [28]:
df.head()

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NYSE:AEM,Agnico Eagle Mines Limited (NYSE:AEM),10715.5,2990.0,5.0,Materials
TSX:AC,Air Canada (TSX:AC),19197.0,18065.0,5.0,Industrials
TSX:AGI,Alamos Gold Inc. (TSX:AGI),4455.5,889.4,4.0,Materials
TSX:AQN,Algonquin Power & Utilities Corp. (TSX:AQN),12811.6,2247.9,2.0,Utilities
TSX:ATD.B,Alimentation Couche-Tard Inc. (TSX:ATD.B),30426.3,79562.8,3.0,Consumer Staples


In [29]:
# Indicate the column name to use as the index when reading in the CSV (index_col=)
# Limit the columns read in, this saves space (usecols=)

print(pd.read_csv(path+'SPTSXComposite.csv',index_col='Ticker',usecols=['Name', 'PrimarySector','Ticker']).head(3))

                                           Name PrimarySector
Ticker                                                       
NYSE:AEM  Agnico Eagle Mines Limited (NYSE:AEM)     Materials
TSX:AC                      Air Canada (TSX:AC)   Industrials
TSX:AGI              Alamos Gold Inc. (TSX:AGI)     Materials


In [30]:
# Datatypes
df.dtypes

Name                   object
TotalAssets           float64
TotalRevenue          float64
GeographicSegments    float64
PrimarySector          object
dtype: object

In [31]:
# Converting types
df = df.astype({'PrimarySector':'category', 'TotalAssets':'float32', 'TotalRevenue':'float32'})
# {Column_name: datatype,...}
df.dtypes

Name                    object
TotalAssets            float32
TotalRevenue           float32
GeographicSegments     float64
PrimarySector         category
dtype: object

In [32]:
# Exercise
# Important: Create a copy of "df" and assign it a different variable name (hint: think NumPy) 

new_df = df.copy()

# Perform the following steps using this new dataframe...


In [33]:
# Change the type of "TotalAssets" to int

new_df = new_df.astype({'TotalAssets':'int'})
print(new_df.dtypes)
    
new_df.TotalAssets = new_df.TotalAssets.astype('int')  # alternative approach when you only need to change the dtype of one column

# Check that it worked 
print(new_df.dtypes)


Name                    object
TotalAssets              int32
TotalRevenue           float32
GeographicSegments     float64
PrimarySector         category
dtype: object
Name                    object
TotalAssets              int32
TotalRevenue           float32
GeographicSegments     float64
PrimarySector         category
dtype: object


In [34]:
# Make "Name" the index instead of "Ticker"
new_df = new_df.set_index('Name') # Note - set_index replaces and drops the current ticker

# In order to avoid replacing and removing "Ticker", do the following INSTEAD:
#new_df = new_df.reindex(df.Name) 

In [35]:
# Set the columns of the new dataframe equal to list('ABCDE')
new_df.columns = list('ABCD')

In [36]:
# Check the top 5 rows and all of the column names
new_df.head()

Unnamed: 0_level_0,A,B,C,D
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Agnico Eagle Mines Limited (NYSE:AEM),10715,2990.0,5.0,Materials
Air Canada (TSX:AC),19197,18065.0,5.0,Industrials
Alamos Gold Inc. (TSX:AGI),4455,889.400024,4.0,Materials
Algonquin Power & Utilities Corp. (TSX:AQN),12811,2247.899902,2.0,Utilities
Alimentation Couche-Tard Inc. (TSX:ATD.B),30426,79562.796875,3.0,Consumer Staples


In [37]:
# Describe and summarize
df.describe().round(1) # automatically selects numeric columns, ignores nans

Unnamed: 0,TotalAssets,TotalRevenue,GeographicSegments
count,132.0,132.0,124.0
mean,79793.3,11634.7,4.8
std,218163.2,15739.7,4.2
min,261.1,15.7,1.0
25%,5077.7,1683.9,2.0
50%,11690.3,5005.6,3.5
75%,35186.8,14151.5,6.0
max,1334903.0,79975.7,24.0


In [38]:
# The describe method is also implemented for pandas series
# See what happens when you ask pandas to describe a categorical variable (same result for any object type series)
df.PrimarySector.describe()

count            132
unique            11
top       Financials
freq              21
Name: PrimarySector, dtype: object

In [39]:
df.PrimarySector.value_counts()

Financials                21
Materials                 19
Energy                    18
Industrials               17
Utilities                 12
Real Estate               11
Consumer Discretionary     9
Consumer Staples           8
Information Technology     6
Communication Services     6
Health Care                5
Name: PrimarySector, dtype: int64

In [40]:
# Integer position based indexing - "iloc"
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html

print(df.iloc[1]) # first row
print()
print(df.iloc[-1]) # last row
print()
print(df.iloc[:5,1]) # first five rows of first column

Name                  Air Canada (TSX:AC)
TotalAssets                         19197
TotalRevenue                        18065
GeographicSegments                      5
PrimarySector                 Industrials
Name: TSX:AC, dtype: object

Name                  Yamana Gold Inc. (TSX:YRI)
TotalAssets                              10933.9
TotalRevenue                              2454.1
GeographicSegments                             5
PrimarySector                          Materials
Name: TSX:YRI, dtype: object

Ticker
NYSE:AEM     10715.500000
TSX:AC       19197.000000
TSX:AGI       4455.500000
TSX:AQN      12811.599609
TSX:ATD.B    30426.300781
Name: TotalAssets, dtype: float32


In [41]:
# Label based indexing
# Reference:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html

print(df.loc['TSX:RY']) #get row with info on RBC
print()
print(df.loc['TSX:RY','GeographicSegments']) # get number of geographic segments at RBC

Name                  Royal Bank of Canada (TSX:RY)
TotalAssets                             1.33473e+06
TotalRevenue                                  41269
GeographicSegments                                3
PrimarySector                            Financials
Name: TSX:RY, dtype: object

3.0


In [42]:
df.loc['TSX:RY','TotalRevenue':'PrimarySector']

TotalRevenue               41269
GeographicSegments             3
PrimarySector         Financials
Name: TSX:RY, dtype: object

In [43]:
# Select a column - three ways to do the same thing
print(df.Name[:3])
print(df['Name'][:3])
print(df.loc[:,'Name'][:3])

Ticker
NYSE:AEM    Agnico Eagle Mines Limited (NYSE:AEM)
TSX:AC                        Air Canada (TSX:AC)
TSX:AGI                Alamos Gold Inc. (TSX:AGI)
Name: Name, dtype: object
Ticker
NYSE:AEM    Agnico Eagle Mines Limited (NYSE:AEM)
TSX:AC                        Air Canada (TSX:AC)
TSX:AGI                Alamos Gold Inc. (TSX:AGI)
Name: Name, dtype: object
Ticker
NYSE:AEM    Agnico Eagle Mines Limited (NYSE:AEM)
TSX:AC                        Air Canada (TSX:AC)
TSX:AGI                Alamos Gold Inc. (TSX:AGI)
Name: Name, dtype: object


In [44]:
# Select multiple columns
print(df[['Name','PrimarySector']].head(3))
print()
print(df.filter(like='Total').head(3))

                                           Name PrimarySector
Ticker                                                       
NYSE:AEM  Agnico Eagle Mines Limited (NYSE:AEM)     Materials
TSX:AC                      Air Canada (TSX:AC)   Industrials
TSX:AGI              Alamos Gold Inc. (TSX:AGI)     Materials

          TotalAssets  TotalRevenue
Ticker                             
NYSE:AEM      10715.5   2990.000000
TSX:AC        19197.0  18065.000000
TSX:AGI        4455.5    889.400024


In [45]:
# Exercise:
# Get info for the following tickers: 'TSX:RY', 'TSX:BMO', 'TSX:TD'
tickers = ['TSX:RY', 'TSX:BMO', 'TSX:TD']
df.loc[tickers,:]

# Get the 25th and 26th rows and the Name & PrimarySector columns


df.iloc[24:26][['Name','PrimarySector']]
df[['Name','PrimarySector']].iloc[24:26]
df.iloc[24:26].loc[:,['Name','PrimarySector']]

df.iloc[24:26][['Name','PrimarySector']]
df[['Name','PrimarySector']].iloc[24:26]

Unnamed: 0_level_0,Name,PrimarySector
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
TSX:CCO,Cameco Corporation (TSX:CCO),Energy
TSX:GOOS,Canada Goose Holdings Inc. (TSX:GOOS),Consumer Discretionary


In [46]:
df.TotalAssets.describe()

count    1.320000e+02
mean     7.979331e+04
std      2.181632e+05
min      2.611000e+02
25%      5.077725e+03
50%      1.169030e+04
75%      3.518675e+04
max      1.334903e+06
Name: TotalAssets, dtype: float64

In [47]:
# Sorting
df.sort_values('TotalAssets').head()

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TSX:CRON,Cronos Group Inc. (TSX:CRON),261.100006,15.7,2.0,Health Care
TSX:GOOS,Canada Goose Holdings Inc. (TSX:GOOS),725.400024,830.5,3.0,Consumer Discretionary
TSX:DSG,The Descartes Systems Group Inc (TSX:DSG),858.900024,361.700012,4.0,Information Technology
TSX:BYD.UN,Boyd Group Income Fund (TSX:BYD.UN),1233.5,1864.599976,2.0,Industrials
TSX:FSV,FirstService Corporation (TSX:FSV),1374.699951,2635.600098,2.0,Real Estate


In [48]:
print(df.nlargest(3,'TotalRevenue')[['Name','TotalRevenue']])
print(df.nsmallest(3,'TotalRevenue')[['Name','TotalRevenue']])

                                                   Name  TotalRevenue
Ticker                                                               
TSX:BAM.A  Brookfield Asset Management Inc. (TSX:BAM.A)  79975.703125
TSX:ATD.B     Alimentation Couche-Tard Inc. (TSX:ATD.B)  79562.796875
TSX:MG                Magna International Inc. (TSX:MG)  55710.101562
                                          Name  TotalRevenue
Ticker                                                      
TSX:CRON          Cronos Group Inc. (TSX:CRON)     15.700000
TSX:ACB         Aurora Cannabis Inc. (TSX:ACB)     55.200001
TSX:WEED  Canopy Growth Corporation (TSX:WEED)    226.300003


In [49]:
# Boolean indexing
bool_series=df.TotalAssets>9000
bool_series[:5]

Ticker
NYSE:AEM      True
TSX:AC        True
TSX:AGI      False
TSX:AQN       True
TSX:ATD.B     True
Name: TotalAssets, dtype: bool

In [50]:
# Slicing based on conditions
df.loc[df.TotalAssets>9000,['Name','TotalAssets']].head()

Unnamed: 0_level_0,Name,TotalAssets
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
NYSE:AEM,Agnico Eagle Mines Limited (NYSE:AEM),10715.5
TSX:AC,Air Canada (TSX:AC),19197.0
TSX:AQN,Algonquin Power & Utilities Corp. (TSX:AQN),12811.599609
TSX:ATD.B,Alimentation Couche-Tard Inc. (TSX:ATD.B),30426.300781
TSX:ALA,AltaGas Ltd. (TSX:ALA),23487.699219


In [51]:
# Exercise
# 1. Get the ticker and name ONLY for the company that has the most geographic segments
most_segments = df.nlargest(1, 'GeographicSegments')
print(most_segments.Name)
most_segments_ticker = most_segments.index
print(most_segments_ticker)



Ticker
TSX:FM    First Quantum Minerals Ltd. (TSX:FM)
Name: Name, dtype: object
Index(['TSX:FM'], dtype='object', name='Ticker')


In [52]:
# 2. Use the index of the company found in step 1 to get its primary sector 

print(most_segments.PrimarySector)

m = df.loc[most_segments_ticker,'PrimarySector'].values

Ticker
TSX:FM    Materials
Name: PrimarySector, dtype: category
Categories (11, object): [Communication Services, Consumer Discretionary, Consumer Staples, Energy, ..., Information Technology, Materials, Real Estate, Utilities]


In [53]:
# 3. Create a new dataframe containing only companies in the same sector as the company in question
# 4. What is the shape of this new dataframe? 

materials = df.loc[df.PrimarySector=='Materials']
materials.shape

(19, 5)

In [54]:
# 5. Does the shape match the information shown by df.PrimarySector.value_counts()?

df.PrimarySector.value_counts()

Financials                21
Materials                 19
Energy                    18
Industrials               17
Utilities                 12
Real Estate               11
Consumer Discretionary     9
Consumer Staples           8
Information Technology     6
Communication Services     6
Health Care                5
Name: PrimarySector, dtype: int64

In [55]:
# 6. Sort the dataframe created in step 3 by total revenue (in place) and view the top 5 companies

materials = materials.sort_values('TotalRevenue')
materials.head(3)

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TSX:FNV,Franco-Nevada Corporation (TSX:FNV),6729.600098,888.200012,7.0,Materials
TSX:AGI,Alamos Gold Inc. (TSX:AGI),4455.5,889.400024,4.0,Materials
TSX:DGC,Detour Gold Corporation (TSX:DGC),3367.699951,1058.900024,1.0,Materials


In [56]:
# 7. Create a third dataframe from the sorted dataframe (step 6) with ..
#only two columns: "TotalAssets" and "TotalRevenue"
# and exclude companies with less than 4 geographic segments

#mini_sector_df = materials.loc[materials.GeographicSegments>=4., ['TotalAssets','TotalRevenue']]



In [58]:
# 8. How many companies were excluded? 

'''print(mini_sector_df.shape)
print('Number of companies excluded is ', len(sector_df)-len(mini_sector_df))
print()
'''

"print(mini_sector_df.shape)\nprint('Number of companies excluded is ', len(sector_df)-len(mini_sector_df))\nprint()\n"

In [None]:
# 9. Print the summary statistics (count, mean, std etc.) of the dataframe created in step 7.

print(mini_sector_df.describe())

In [None]:
# Dealing with missing values
df.isna().any() # identify which variables contain missing values

In [None]:
df.loc[df.GeographicSegments.isna(),['Name','GeographicSegments']]

In [None]:
df.GeographicSegments = df.GeographicSegments.fillna(0) 

df.GeographicSegments.isna().any() # check if we succeeded

In [None]:
df.GeographicSegments = df.GeographicSegments.astype('int')

In [None]:
# Arithmetic & aggregation

numeric_df = df.select_dtypes(include='number')
print(numeric_df.head())

In [None]:
numeric_df.apply(np.log).head().round(1) # applies function to each cell

In [None]:
numeric_df.sum() # defaults to summing over axis=0/axis='rows'

In [None]:
numeric_df.sum(axis='columns')[:5]

In [None]:
numeric_df.sum(axis=1)[:5] # same as NumPy (and as above)

In [None]:
print(numeric_df.loc['NYSE:AEM'])
10715.5+2990+5 # check pandas sum of numeric variables

In [None]:
a=df.TotalAssets*2
print(a[:5])

In [None]:
b=df.TotalAssets*df.GeographicSegments # elementwise multiplication
b[:5]

In [None]:
pd.DataFrame([[1,1],[2,2]]) @ pd.DataFrame([[1,1],[2,2]]) # Refer to NumPy materials for more

In [None]:
# Groupby
# References: 
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html, 
# https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

group = df.groupby(['PrimarySector']).median().round(1).sort_values('TotalAssets', ascending=False)
print(type(df.groupby(['PrimarySector'])))
print(type(group))
group

In [None]:
# Pivot table
pivot_df = df.pivot_table(values=['TotalAssets','TotalRevenue'],index=['PrimarySector'],aggfunc=np.mean).round(1).sort_values('TotalAssets', ascending=False)
print(type(pivot_df))
pivot_df

In [None]:
df.pivot_table(values=['TotalAssets','TotalRevenue'],columns=['PrimarySector'],aggfunc=np.mean).round(1)

In [59]:
# Exercise
# Get the median value total assets value for each sector in the dataset, use either groupby or pivot_table


df.pivot_table(values=['TotalAssets'],columns=['PrimarySector'],aggfunc=np.median).round(1)

PrimarySector,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology,Materials,Real Estate,Utilities
TotalAssets,23171.0,4094.800049,10403.900391,16199.5,271827.0,3318.800049,7766.600098,4589.399902,7027.600098,10453.099609,23415.800781


In [66]:
df.groupby(['PrimarySector'])['TotalAssets'].median()

PrimarySector
Communication Services     23171.000000
Consumer Discretionary      4094.800049
Consumer Staples           10403.900391
Energy                     16199.500000
Financials                271827.000000
Health Care                 3318.800049
Industrials                 7766.600098
Information Technology      4589.399902
Materials                   7027.600098
Real Estate                10453.099609
Utilities                  23415.849609
Name: TotalAssets, dtype: float32

In [70]:
df_group = df.groupby(['PrimarySector'])
type(df_group)

pandas.core.groupby.generic.DataFrameGroupBy

In [64]:
df[['PrimarySector','TotalAssets']].groupby(['PrimarySector']).median()

Unnamed: 0_level_0,TotalAssets
PrimarySector,Unnamed: 1_level_1
Communication Services,23171.0
Consumer Discretionary,4094.800049
Consumer Staples,10403.900391
Energy,16199.5
Financials,271827.0
Health Care,3318.800049
Industrials,7766.600098
Information Technology,4589.399902
Materials,7027.600098
Real Estate,10453.099609


In [82]:
# Assigning new data
df['AssetsInBillions'] = np.round(df.TotalAssets/1000) # create a new column
df.head()

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector,AssetsInBillions
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NYSE:AEM,Agnico Eagle Mines Limited (NYSE:AEM),10715.5,2990.0,5,Materials,11.0
TSX:AC,Air Canada (TSX:AC),19197.0,18065.0,5,Industrials,19.0
TSX:AGI,Alamos Gold Inc. (TSX:AGI),4455.5,889.400024,4,Materials,4.0
TSX:AQN,Algonquin Power & Utilities Corp. (TSX:AQN),12811.599609,2247.899902,2,Utilities,13.0
TSX:ATD.B,Alimentation Couche-Tard Inc. (TSX:ATD.B),30426.300781,79562.796875,3,Consumer Staples,30.0


In [83]:
# If a company has the number of geographic segments listed as 0, change it to it's median

print(len(df.loc[df.GeographicSegments==0]))
df.loc[df.GeographicSegments==0,'GeographicSegments'] = df.GeographicSegments.median()
print(len(df.loc[df.GeographicSegments==0]))

8
0


In [72]:
# Exercise:
# Correct the total assets figure for TSX:RNW from 3747 to 3477

df.loc['TSX:RNW','TotalAssets'] = 3477
print(df.loc['TSX:RNW','TotalAssets']

3477.0


In [85]:
df.index=='TSX:RNW'

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False,

In [84]:
df.index

Index(['NYSE:AEM', 'TSX:AC', 'TSX:AGI', 'TSX:AQN', 'TSX:ATD.B', 'TSX:AP.UN',
       'TSX:ALA', 'TSX:ACO.X', 'TSX:ACB', 'TSX:BTO',
       ...
       'TSX:TIH', 'TSX:TOU', 'TSX:RNW', 'TSX:VET', 'NYSE:WCN', 'TSX:WFT',
       'TSX:WJA', 'TSX:WPM', 'TSX:WSP', 'TSX:YRI'],
      dtype='object', name='Ticker', length=132)

In [76]:
df.loc['TSX:RNW'].iloc[1]

3477.0

In [79]:
# Create a new column that gives the average revenue per geographic segment (Revenue/Segment)

df['AverageRevenue'] = df.TotalRevenue/df.GeographicSegments

print(df.head())


                                                  Name   TotalAssets  \
Ticker                                                                 
NYSE:AEM         Agnico Eagle Mines Limited (NYSE:AEM)  10715.500000   
TSX:AC                             Air Canada (TSX:AC)  19197.000000   
TSX:AGI                     Alamos Gold Inc. (TSX:AGI)   4455.500000   
TSX:AQN    Algonquin Power & Utilities Corp. (TSX:AQN)  12811.599609   
TSX:ATD.B    Alimentation Couche-Tard Inc. (TSX:ATD.B)  30426.300781   

           TotalRevenue  GeographicSegments     PrimarySector  AverageRevenue  
Ticker                                                                         
NYSE:AEM    2990.000000                 5.0         Materials      598.000000  
TSX:AC     18065.000000                 5.0       Industrials     3613.000000  
TSX:AGI      889.400024                 4.0         Materials      222.350006  
TSX:AQN     2247.899902                 2.0         Utilities     1123.949951  
TSX:ATD.B  7956

In [81]:
for element in df.TotalRevenue:
    print(element)
    print(type(element))

2990.0
<class 'float'>
18065.0
<class 'float'>
889.4000244140625
<class 'float'>
2247.89990234375
<class 'float'>
79562.796875
<class 'float'>
436.3999938964844
<class 'float'>
4256.7001953125
<class 'float'>
4888.0
<class 'float'>
55.20000076293945
<class 'float'>
1671.5999755859375
<class 'float'>
22375.0
<class 'float'>
9883.400390625
<class 'float'>
11434.7998046875
<class 'float'>
23468.0
<class 'float'>
1190.300048828125
<class 'float'>
22154.69921875
<class 'float'>
1864.5999755859375
<class 'float'>
79975.703125
<class 'float'>
50717.19921875
<class 'float'>
6347.7998046875
<class 'float'>
11170.099609375
<class 'float'>
4069.10009765625
<class 'float'>
5243.7998046875
<class 'float'>
3304.10009765625
<class 'float'>
2091.699951171875
<class 'float'>
830.5
<class 'float'>
688.5999755859375
<class 'float'>
16964.0
<class 'float'>
14321.0
<class 'float'>
21027.0
<class 'float'>
7316.0
<class 'float'>
13031.5
<class 'float'>
4377.0
<class 'float'>
226.3000030517578
<class 'float'>

In [82]:
df.loc[df.TotalRevenue>10000]

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector,AverageRevenue
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TSX:AC,Air Canada (TSX:AC),19197.0,18065.0,5.0,Industrials,3613.0
TSX:ATD.B,Alimentation Couche-Tard Inc. (TSX:ATD.B),30426.3,79562.796875,3.0,Consumer Staples,26520.932292
TSX:BMO,Bank of Montreal (TSX:BMO),774048.0,22375.0,3.0,Financials,7458.333333
NYSE:BHC,Bausch Health Companies Inc. (NYSE:BHC),44336.6,11434.799805,17.0,Health Care,672.635283
TSX:BCE,BCE Inc. (TSX:BCE),57100.0,23468.0,1.0,Communication Services,23468.0
TSX:BBD.B,Bombardier Inc. (TSX:BBD.B),34056.2,22154.699219,13.0,Industrials,1704.207632
TSX:BAM.A,Brookfield Asset Management Inc. (TSX:BAM.A),349705.7,79975.703125,9.0,Financials,8886.189236
NYSE:BBU,Brookfield Business Partners L.P. (NYSE:BBU),37276.5,50717.199219,8.0,Industrials,6339.649902
NasdaqGS:BPY,Brookfield Property Partners L.P. (NasdaqGS:BPY),167183.4,11170.099609,9.0,Real Estate,1241.122179
TSX:CM,Canadian Imperial Bank of Commerce (TSX:CM),597099.0,16964.0,4.0,Financials,4241.0
