In [2]:
# References: 
# 10 minutes to Pandas: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html
# Docs - https://pandas.pydata.org/pandas-docs/stable/reference/index.html
# Datacamp cheat sheet: http://datacamp-community-prod.s3.amazonaws.com/dbed353d-2757-4617-8206-8767ab379ab3

In [3]:
# Imports
import pandas as pd
import numpy as np
pd.__version__ # Check that you have the same version of pandas

'0.25.0'

In [4]:
# Pandas Series & DataFrames
# https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#dsintro
my_series=pd.Series([1,2,np.nan,5,4])
print(my_series)
print(type(my_series))

# Create a dataframe from a numpy array
my_df=pd.DataFrame(np.random.randn(6,4),columns=list('ABCD'))
print()
print(my_df)
print(type(my_df))

0    1.0
1    2.0
2    NaN
3    5.0
4    4.0
dtype: float64
<class 'pandas.core.series.Series'>

          A         B         C         D
0 -0.658915  1.658887  2.805993 -0.577777
1  1.573727  1.199413  1.181562 -0.220740
2  0.072988  0.355671  1.267864  0.501981
3 -0.772058 -1.314699 -1.715834 -0.355858
4 -0.874362  0.300280  0.887505  0.947620
5  0.027241  0.060413 -0.918045  0.221320
<class 'pandas.core.frame.DataFrame'>


In [5]:
# Index
my_series=pd.Series([1,2,np.nan,5,4], index=list('ABCDE'))
print(my_series)
print(type(my_series))

my_df=pd.DataFrame(np.random.randn(6,4),columns=list('ABCD'), index=list('ABCDEF'))
print()
print(my_df)
print(type(my_df))

A    1.0
B    2.0
C    NaN
D    5.0
E    4.0
dtype: float64
<class 'pandas.core.series.Series'>

          A         B         C         D
A -0.293373  0.382822  1.044245 -0.128176
B  0.525672  0.339466 -0.792176 -1.161377
C -0.508241  0.336184 -0.022091  1.058840
D  0.448545 -1.592467 -1.076913  0.607434
E -0.756569 -0.762593 -0.418113  0.153808
F  0.701783  1.517367 -0.365328 -0.779116
<class 'pandas.core.frame.DataFrame'>


In [6]:
# Convert dataframe to NumPy array
print(my_df.values)
print(type(my_df.values))

[[-0.29337297  0.38282172  1.04424521 -0.1281761 ]
 [ 0.52567241  0.33946571 -0.79217613 -1.16137679]
 [-0.50824111  0.336184   -0.02209092  1.05884016]
 [ 0.4485452  -1.592467   -1.07691317  0.60743359]
 [-0.75656855 -0.76259281 -0.4181126   0.15380795]
 [ 0.70178323  1.51736701 -0.36532774 -0.77911614]]
<class 'numpy.ndarray'>


In [5]:
# Create a dataframe from a dictionary
labels=list('ABCD')
vals= np.random.randn(6,4)
my_dict = dict(zip(labels, vals))
print(my_dict)
print()
print(pd.DataFrame(my_dict))

{'A': array([ 1.36372414,  0.03891867, -0.74510362, -1.16317942]), 'B': array([ 1.35101874, -0.5962577 , -0.52426846, -0.41543683]), 'C': array([-0.13295734,  0.34379269, -1.11615439,  0.45283801]), 'D': array([ 0.05065542, -0.19208929,  0.16038389, -0.85861959])}

          A         B         C         D
0  1.363724  1.351019 -0.132957  0.050655
1  0.038919 -0.596258  0.343793 -0.192089
2 -0.745104 -0.524268 -1.116154  0.160384
3 -1.163179 -0.415437  0.452838 -0.858620


In [6]:
# Read-in from database: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html

In [7]:
# Read-in from csv
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
path = 'C:\\Users\\K\\AIDeepDiveMaterials\\'
df = pd.read_csv(path+'SPTSXComposite.csv')

In [8]:
#view
print(df.shape)

print(df.head())

(132, 6)
                                          Name     Ticker  TotalAssets  \
0        Agnico Eagle Mines Limited (NYSE:AEM)   NYSE:AEM      10715.5   
1                          Air Canada (TSX:AC)     TSX:AC      19197.0   
2                   Alamos Gold Inc. (TSX:AGI)    TSX:AGI       4455.5   
3  Algonquin Power & Utilities Corp. (TSX:AQN)    TSX:AQN      12811.6   
4    Alimentation Couche-Tard Inc. (TSX:ATD.B)  TSX:ATD.B      30426.3   

   TotalRevenue  GeographicSegments     PrimarySector  
0        2990.0                 5.0         Materials  
1       18065.0                 5.0       Industrials  
2         889.4                 4.0         Materials  
3        2247.9                 2.0         Utilities  
4       79562.8                 3.0  Consumer Staples  


In [9]:
df.tail()

Unnamed: 0,Name,Ticker,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
127,West Fraser Timber Co. Ltd. (TSX:WFT),TSX:WFT,4791.0,6118.0,5.0,Materials
128,WestJet Airlines Ltd. (TSX:WJA),TSX:WJA,6758.1,4728.4,4.0,Industrials
129,Wheaton Precious Metals Corp. (TSX:WPM),TSX:WPM,8828.6,1083.5,9.0,Materials
130,WSP Global Inc. (TSX:WSP),TSX:WSP,7766.6,7908.1,8.0,Industrials
131,Yamana Gold Inc. (TSX:YRI),TSX:YRI,10933.9,2454.1,5.0,Materials


In [10]:
# Column names
df.columns

Index(['Name', 'Ticker', 'TotalAssets', 'TotalRevenue', 'GeographicSegments',
       'PrimarySector'],
      dtype='object')

In [11]:
# Current index
print(df.index)
print()
print(df.index.values[:5]) # returns numpy array
print()
#set new index
df.set_index('Ticker', inplace=True) 
print(df.index)


RangeIndex(start=0, stop=132, step=1)

[0 1 2 3 4]

Index(['NYSE:AEM', 'TSX:AC', 'TSX:AGI', 'TSX:AQN', 'TSX:ATD.B', 'TSX:AP.UN',
       'TSX:ALA', 'TSX:ACO.X', 'TSX:ACB', 'TSX:BTO',
       ...
       'TSX:TIH', 'TSX:TOU', 'TSX:RNW', 'TSX:VET', 'NYSE:WCN', 'TSX:WFT',
       'TSX:WJA', 'TSX:WPM', 'TSX:WSP', 'TSX:YRI'],
      dtype='object', name='Ticker', length=132)


In [12]:
df.head()

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NYSE:AEM,Agnico Eagle Mines Limited (NYSE:AEM),10715.5,2990.0,5.0,Materials
TSX:AC,Air Canada (TSX:AC),19197.0,18065.0,5.0,Industrials
TSX:AGI,Alamos Gold Inc. (TSX:AGI),4455.5,889.4,4.0,Materials
TSX:AQN,Algonquin Power & Utilities Corp. (TSX:AQN),12811.6,2247.9,2.0,Utilities
TSX:ATD.B,Alimentation Couche-Tard Inc. (TSX:ATD.B),30426.3,79562.8,3.0,Consumer Staples


In [13]:
# Indicate the column name to use as the index when reading in the CSV
# Limit the columns we read-in, this saves space
print(pd.read_csv(path+'SPTSXComposite.csv',index_col='Ticker',usecols=['Name', 'PrimarySector','Ticker']).head(3))

                                           Name PrimarySector
Ticker                                                       
NYSE:AEM  Agnico Eagle Mines Limited (NYSE:AEM)     Materials
TSX:AC                      Air Canada (TSX:AC)   Industrials
TSX:AGI              Alamos Gold Inc. (TSX:AGI)     Materials


In [14]:
# Datatypes
df.dtypes

Name                   object
TotalAssets           float64
TotalRevenue          float64
GeographicSegments    float64
PrimarySector          object
dtype: object

In [15]:
# Converting types
df = df.astype({'PrimarySector':'category'})

df.dtypes

Name                    object
TotalAssets            float64
TotalRevenue           float64
GeographicSegments     float64
PrimarySector         category
dtype: object

In [16]:
# Exercise
# Create a copy of "df" and assign it a different variable name (hint: df.copy()) - use this new dataframe 
# Change the type of "TotalAssets" to int
# Check that it worked 
# Make "Name" the index instead of "Ticker"
# Set the columns of the new dataframe equal to list('ABCDE')
# Check the top 5 rows and all of the column names
new_df = df.copy()
new_df = new_df.astype({'TotalAssets':int})
print(new_df.dtypes)
new_df.set_index("Name",inplace=True) # To avoid removing "Ticker", use new_df.reindex(df.Name), reset with new_df.reset_index()
new_df.columns = list('ABCD')
print(new_df.head())

Name                    object
TotalAssets              int32
TotalRevenue           float64
GeographicSegments     float64
PrimarySector         category
dtype: object
                                                 A        B    C  \
Name                                                               
Agnico Eagle Mines Limited (NYSE:AEM)        10715   2990.0  5.0   
Air Canada (TSX:AC)                          19197  18065.0  5.0   
Alamos Gold Inc. (TSX:AGI)                    4455    889.4  4.0   
Algonquin Power & Utilities Corp. (TSX:AQN)  12811   2247.9  2.0   
Alimentation Couche-Tard Inc. (TSX:ATD.B)    30426  79562.8  3.0   

                                                            D  
Name                                                           
Agnico Eagle Mines Limited (NYSE:AEM)               Materials  
Air Canada (TSX:AC)                               Industrials  
Alamos Gold Inc. (TSX:AGI)                          Materials  
Algonquin Power & Utilities Corp. 

In [17]:
# Describe and summarize
df.describe().round(1) # automatically selects numeric columns, ignores nans

# The describe method is also implemented for series

Unnamed: 0,TotalAssets,TotalRevenue,GeographicSegments
count,132.0,132.0,124.0
mean,79793.3,11634.7,4.8
std,218163.2,15739.7,4.2
min,261.1,15.7,1.0
25%,5077.7,1683.9,2.0
50%,11690.3,5005.6,3.5
75%,35186.8,14151.5,6.0
max,1334903.0,79975.7,24.0


In [18]:
# See what happens when you ask pandas to describe a categorical variable
df.PrimarySector.describe()

count            132
unique            11
top       Financials
freq              21
Name: PrimarySector, dtype: object

In [19]:
df.PrimarySector.value_counts()

Financials                21
Materials                 19
Energy                    18
Industrials               17
Utilities                 12
Real Estate               11
Consumer Discretionary     9
Consumer Staples           8
Information Technology     6
Communication Services     6
Health Care                5
Name: PrimarySector, dtype: int64

In [20]:
# Slicing and indexing
# Integer position based indexing - "iloc"
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html
print(df.iloc[1]) #first row
print()
print(df.iloc[-1]) #last row
print()
print(df.iloc[:5,1]) #first five rows of first column

Name                  Air Canada (TSX:AC)
TotalAssets                         19197
TotalRevenue                        18065
GeographicSegments                      5
PrimarySector                 Industrials
Name: TSX:AC, dtype: object

Name                  Yamana Gold Inc. (TSX:YRI)
TotalAssets                              10933.9
TotalRevenue                              2454.1
GeographicSegments                             5
PrimarySector                          Materials
Name: TSX:YRI, dtype: object

Ticker
NYSE:AEM     10715.5
TSX:AC       19197.0
TSX:AGI       4455.5
TSX:AQN      12811.6
TSX:ATD.B    30426.3
Name: TotalAssets, dtype: float64


In [21]:
# Label based indexing
# Reference:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html
print(df.loc['TSX:RY']) #get row with info on RBC
print()
print(df.loc['TSX:RY','GeographicSegments'])# get number of geographic segments at RBC

Name                  Royal Bank of Canada (TSX:RY)
TotalAssets                             1.33473e+06
TotalRevenue                                  41269
GeographicSegments                                3
PrimarySector                            Financials
Name: TSX:RY, dtype: object

3.0


In [22]:
df.loc['TSX:RY','TotalRevenue':'PrimarySector']

TotalRevenue               41269
GeographicSegments             3
PrimarySector         Financials
Name: TSX:RY, dtype: object

In [23]:
# Select a column - three ways to do the same thing
print(df.Name[:3])
print(df['Name'][:3])
print(df.loc[:,'Name'][:3])

Ticker
NYSE:AEM    Agnico Eagle Mines Limited (NYSE:AEM)
TSX:AC                        Air Canada (TSX:AC)
TSX:AGI                Alamos Gold Inc. (TSX:AGI)
Name: Name, dtype: object
Ticker
NYSE:AEM    Agnico Eagle Mines Limited (NYSE:AEM)
TSX:AC                        Air Canada (TSX:AC)
TSX:AGI                Alamos Gold Inc. (TSX:AGI)
Name: Name, dtype: object
Ticker
NYSE:AEM    Agnico Eagle Mines Limited (NYSE:AEM)
TSX:AC                        Air Canada (TSX:AC)
TSX:AGI                Alamos Gold Inc. (TSX:AGI)
Name: Name, dtype: object


In [24]:
# Select multiple columns
print(df[['Name','PrimarySector']].head(3))
print()
print(df.filter(like='Total').head(3))

                                           Name PrimarySector
Ticker                                                       
NYSE:AEM  Agnico Eagle Mines Limited (NYSE:AEM)     Materials
TSX:AC                      Air Canada (TSX:AC)   Industrials
TSX:AGI              Alamos Gold Inc. (TSX:AGI)     Materials

          TotalAssets  TotalRevenue
Ticker                             
NYSE:AEM      10715.5        2990.0
TSX:AC        19197.0       18065.0
TSX:AGI        4455.5         889.4


In [25]:
# Sorting
df.sort_values('TotalAssets').head()

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TSX:CRON,Cronos Group Inc. (TSX:CRON),261.1,15.7,2.0,Health Care
TSX:GOOS,Canada Goose Holdings Inc. (TSX:GOOS),725.4,830.5,3.0,Consumer Discretionary
TSX:DSG,The Descartes Systems Group Inc (TSX:DSG),858.9,361.7,4.0,Information Technology
TSX:BYD.UN,Boyd Group Income Fund (TSX:BYD.UN),1233.5,1864.6,2.0,Industrials
TSX:FSV,FirstService Corporation (TSX:FSV),1374.7,2635.6,2.0,Real Estate


In [26]:
print(df.nlargest(3,'TotalRevenue')[['Name','TotalRevenue']])
print(df.nsmallest(3,'TotalRevenue')[['Name','TotalRevenue']])

                                                   Name  TotalRevenue
Ticker                                                               
TSX:BAM.A  Brookfield Asset Management Inc. (TSX:BAM.A)       79975.7
TSX:ATD.B     Alimentation Couche-Tard Inc. (TSX:ATD.B)       79562.8
TSX:MG                Magna International Inc. (TSX:MG)       55710.1
                                          Name  TotalRevenue
Ticker                                                      
TSX:CRON          Cronos Group Inc. (TSX:CRON)          15.7
TSX:ACB         Aurora Cannabis Inc. (TSX:ACB)          55.2
TSX:WEED  Canopy Growth Corporation (TSX:WEED)         226.3


In [27]:
# Slicing based on conditions
df.loc[df.TotalAssets>9000,['Name','TotalAssets']].head()

Unnamed: 0_level_0,Name,TotalAssets
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
NYSE:AEM,Agnico Eagle Mines Limited (NYSE:AEM),10715.5
TSX:AC,Air Canada (TSX:AC),19197.0
TSX:AQN,Algonquin Power & Utilities Corp. (TSX:AQN),12811.6
TSX:ATD.B,Alimentation Couche-Tard Inc. (TSX:ATD.B),30426.3
TSX:ALA,AltaGas Ltd. (TSX:ALA),23487.7


In [28]:
bool_series=df.TotalAssets>9000
bool_series[:5]

Ticker
NYSE:AEM      True
TSX:AC        True
TSX:AGI      False
TSX:AQN       True
TSX:ATD.B     True
Name: TotalAssets, dtype: bool

In [29]:
# Exercise
# 1. Get the ticker and name ONLY for the company that has the most geographic segments
# 2. Use the index of the company found in step 1 to get its primary sector 
# 3. Craete a new dataframe containing only companies in the same sector as the company in question
# 4. What is the shape of this new dataframe? 
# 5. Does the shape match the information shown by df.PrimarySector.value_counts()?
# 6. Sort the dataframe created in step 3 by total revenue (in place) and view the top 5 companies
# 7. Create a third dataframe from the sorted dataframe (step 6) with only two columns: "TotalAssets" and "TotalRevenue"
# and exclude companies with less than 4 geographic segments
# 8. How many companies were excluded? 
# 9. Print the summary statistics (count, mean, std etc.) of the dataframe created in step 7.

most_segments = df.nlargest(1,'GeographicSegments')['Name']
print(most_segments)
print()
most_segments_ticker = most_segments.index
sector = df.loc[most_segments_ticker]['PrimarySector'].values
print(sector)
print()
sector_df = df.loc[df.PrimarySector==sector]
print(sector_df.shape)
print()
print(df.PrimarySector.value_counts())
print()
sector_df=sector_df.sort_values('TotalRevenue')
print(sector_df.head())
print()
mini_sector_df = sector_df.loc[sector_df.GeographicSegments>=4., ['TotalAssets','TotalRevenue']]
print(mini_sector_df.shape)
print('Number of companies excluded is ', len(sector_df)-len(mini_sector_df))
print()
print(mini_sector_df.describe())

Ticker
TSX:FM    First Quantum Minerals Ltd. (TSX:FM)
Name: Name, dtype: object

[Materials]
Categories (11, object): [Communication Services, Consumer Discretionary, Consumer Staples, Energy, ..., Information Technology, Materials, Real Estate, Utilities]

(19, 5)

Financials                21
Materials                 19
Energy                    18
Industrials               17
Utilities                 12
Real Estate               11
Consumer Discretionary     9
Consumer Staples           8
Information Technology     6
Communication Services     6
Health Care                5
Name: PrimarySector, dtype: int64

                                             Name  TotalAssets  TotalRevenue  \
Ticker                                                                         
TSX:FNV       Franco-Nevada Corporation (TSX:FNV)       6729.6         888.2   
TSX:AGI                Alamos Gold Inc. (TSX:AGI)       4455.5         889.4   
TSX:DGC         Detour Gold Corporation (TSX:DGC)       336

In [30]:
# Dealing with missing values
df.isna().any() # identify which variables contain missing values

Name                  False
TotalAssets           False
TotalRevenue          False
GeographicSegments     True
PrimarySector         False
dtype: bool

In [31]:
df.loc[df.GeographicSegments.isna(),['Name','GeographicSegments']]

Unnamed: 0_level_0,Name,GeographicSegments
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
TSX:CNQ,Canadian Natural Resources Limited (TSX:CNQ),
TSX:CSH.UN,Chartwell Retirement Residences (TSX:CSH.UN),
TSX:EMP.A,Empire Company Limited (TSX:EMP.A),
TSX:IAG,iA Financial Corporation Inc. (TSX:IAG),
TSX:IPL,Inter Pipeline Ltd. (TSX:IPL),
TSX:PAAS,Pan American Silver Corp. (TSX:PAAS),
TSX:PPL,Pembina Pipeline Corporation (TSX:PPL),
TSX:SLF,Sun Life Financial Inc. (TSX:SLF),


In [32]:
df.GeographicSegments=df.GeographicSegments.fillna(0) #df.GeographicSegments.fillna(0,inplace=True)
df.GeographicSegments.isna().any() # check if we succeeded

False

In [33]:
df.GeographicSegments = df.GeographicSegments.astype('int')

In [34]:
# Arithmetic & aggregation

numeric_df = df.select_dtypes(include='number')
print(numeric_df.head())

           TotalAssets  TotalRevenue  GeographicSegments
Ticker                                                  
NYSE:AEM       10715.5        2990.0                   5
TSX:AC         19197.0       18065.0                   5
TSX:AGI         4455.5         889.4                   4
TSX:AQN        12811.6        2247.9                   2
TSX:ATD.B      30426.3       79562.8                   3


In [35]:
numeric_df.apply(np.log).head().round(1) # applies function to each cell

Unnamed: 0_level_0,TotalAssets,TotalRevenue,GeographicSegments
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NYSE:AEM,9.3,8.0,1.6
TSX:AC,9.9,9.8,1.6
TSX:AGI,8.4,6.8,1.4
TSX:AQN,9.5,7.7,0.7
TSX:ATD.B,10.3,11.3,1.1


In [36]:
numeric_df.sum() # defaults to summing over axis 0/axis='rows'

TotalAssets           10532718.0
TotalRevenue           1535777.6
GeographicSegments         597.0
dtype: float64

In [37]:
numeric_df.sum(axis='columns')[:5]

Ticker
NYSE:AEM      13710.5
TSX:AC        37267.0
TSX:AGI        5348.9
TSX:AQN       15061.5
TSX:ATD.B    109992.1
dtype: float64

In [38]:
numeric_df.sum(axis=1)[:5] # same as NumPy (and as above)

Ticker
NYSE:AEM      13710.5
TSX:AC        37267.0
TSX:AGI        5348.9
TSX:AQN       15061.5
TSX:ATD.B    109992.1
dtype: float64

In [51]:
print(numeric_df.loc['NYSE:AEM'])
10715.5+2990+5 # check pandas sum of numeric variables

TotalAssets           10715.5
TotalRevenue           2990.0
GeographicSegments        5.0
Name: NYSE:AEM, dtype: float64


13710.5

In [40]:
a=df.TotalAssets*2
print(a[:5])

Ticker
NYSE:AEM     21431.0
TSX:AC       38394.0
TSX:AGI       8911.0
TSX:AQN      25623.2
TSX:ATD.B    60852.6
Name: TotalAssets, dtype: float64


In [41]:
b=df.TotalAssets*df.GeographicSegments # elementwise multiplication
b[:5]

Ticker
NYSE:AEM     53577.5
TSX:AC       95985.0
TSX:AGI      17822.0
TSX:AQN      25623.2
TSX:ATD.B    91278.9
dtype: float64

In [42]:
# Groupby
# References: 
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html, 
# https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

df.groupby(['PrimarySector']).median().round(1).sort_values('TotalAssets', ascending=False)

Unnamed: 0_level_0,TotalAssets,TotalRevenue,GeographicSegments
PrimarySector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Financials,271827.0,24231.1,3.0
Utilities,23415.8,4316.8,3.0
Communication Services,23171.0,9667.0,1.0
Energy,16199.5,7398.6,2.0
Real Estate,10453.1,1083.2,2.0
Consumer Staples,10403.9,19816.4,2.0
Industrials,7766.6,6996.0,4.0
Materials,7027.6,2454.1,5.0
Information Technology,4589.4,2608.5,5.5
Consumer Discretionary,4094.8,3964.0,6.0


In [53]:
# Pivot table
df.pivot_table(values=['TotalAssets','TotalRevenue'],columns=['PrimarySector'],aggfunc=np.mean).round(1)

PrimarySector,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology,Materials,Real Estate,Utilities
TotalAssets,25534.3,11894.1,17547.7,35740.5,396182.0,11711.8,14655.5,5902.0,13551.0,23424.5,25867.3
TotalRevenue,10750.4,10514.8,29310.0,13658.6,24718.1,2520.0,10309.5,3741.9,4859.9,2188.7,4208.4


In [44]:
# Assigning new data
df['AssetsInBillions']= np.round(df.TotalAssets/1000)
df.head()

Unnamed: 0_level_0,Name,TotalAssets,TotalRevenue,GeographicSegments,PrimarySector,AssetsInBillions
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NYSE:AEM,Agnico Eagle Mines Limited (NYSE:AEM),10715.5,2990.0,5,Materials,11.0
TSX:AC,Air Canada (TSX:AC),19197.0,18065.0,5,Industrials,19.0
TSX:AGI,Alamos Gold Inc. (TSX:AGI),4455.5,889.4,4,Materials,4.0
TSX:AQN,Algonquin Power & Utilities Corp. (TSX:AQN),12811.6,2247.9,2,Utilities,13.0
TSX:ATD.B,Alimentation Couche-Tard Inc. (TSX:ATD.B),30426.3,79562.8,3,Consumer Staples,30.0


In [45]:
# If a company has the number of geographic segments listed as 0, change it to 1
print(len(df.loc[df.GeographicSegments==0]))
df.loc[df.GeographicSegments==0,'GeographicSegments']=1
print(len(df.loc[df.GeographicSegments==0]))

8
0


In [47]:
# Correct the total assets figure for TSX:RNW from 3747 to 3477
print(df.loc['TSX:RNW','TotalAssets'])
df.loc['TSX:RNW','TotalAssets'] = 3477.
print(df.loc['TSX:RNW','TotalAssets'])

3747.0
3477.0
