# Class Two Notes

### Review of Pandas and Numpy

In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:,.2f}'.format


### Why we like Numpy
- Multi-dimensional arrays
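- Vectorized operations that are much faster than looping over Python lists
- Rich built-in functionality (reshaping, slicing, boolean masks, statistics)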

In [24]:
np.__version__

'1.21.5'

In [25]:
pd.__version__

'1.5.3'

In [26]:
# 4x3 integer matrix holding 1..12, filled row by row.
x = np.arange(1, 13).reshape(4, 3)

In [27]:
# Row 0, column 1 in a single indexing step (x[0][1] first builds a temporary row array).
x[0, 1]

2

In [28]:
x[1:,1:]

array([[ 5,  6],
       [ 8,  9],
       [11, 12]])

In [29]:
x.reshape(2, 6)

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12]])

In [30]:
big_array = np.arange(10000000)

In [31]:
big_array + 1 #this is fast

array([       1,        2,        3, ...,  9999998,  9999999, 10000000])

### List Comprehensions

In [32]:
# Build the integers 0..9 with a list comprehension, then wrap them in an array.
# The loop variable is named `i` so it does not shadow the array `x` defined earlier.
y = np.array([i for i in range(10)])

In [33]:
y.mean()

4.5

In [34]:
y.std()

2.8722813232690143

In [35]:
y.sum()

45

### Boolean Operations

In [36]:
x[ x > 2].reshape(2, 5)

array([[ 3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12]])

In [37]:
np.count_nonzero(x)

12

In [38]:
np.size(x)

12

In [39]:
x.shape

(4, 3)

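`np.where(condition, a, b)` builds a new array that takes values from `a` where the condition is true and from `b` elsewhere, so it can be used to replace elements in an array based on a condition.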

In [40]:
# Sample salaries to classify; defined in this cell so it runs on a fresh kernel.
salary = np.array([455000, 722321, 95223, 135000, 132033, 700000, 832123,
                   78123.11, 13243.32, 456122.17, 912321.22, 31123])

# Nested np.where works like an element-wise if / elif / else:
#   > 350000 -> "rich", > 100000 -> "middle", otherwise "poor".
salary_band = np.where(salary > 350000, "rich",
                       np.where(salary > 100000, "middle", "poor"))
salary_band

### Pandas

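- pandas provides labeled, tabular data structures (`Series` and `DataFrame`)
- it can do a lot: selecting, filtering, grouping, joining, and reading data from many sources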

In [41]:
pd.__version__

'1.5.3'

In [43]:
# Five records of (manager, city, revenue, expenses). No column names are given,
# so pandas assigns the default integer labels 0..3.
df = pd.DataFrame(
    data=[
        ["Rao", "Tucson", 33422.12, 27127.22],
        ["Montalbano", "Chicago", 45233.27, 41322.13],
        ["Zhang", "Miami", 36234.22, 39123.45],
        ["Brown", "New York", 57322.83, 41486.28],
        ["Achebe", "Los Angeles", 23490.81, 22540.36],
    ]
)

df

Unnamed: 0,0,1,2,3
0,Rao,Tucson,33422.12,27127.22
1,Montalbano,Chicago,45233.27,41322.13
2,Zhang,Miami,36234.22,39123.45
3,Brown,New York,57322.83,41486.28
4,Achebe,Los Angeles,23490.81,22540.36


Pandas is useful because different columns can have different data types. However, each column has a single dtype that applies to all of its rows.

In [46]:
df.columns = ["Manager","City","Revenue","Expenses"]

In [47]:
df

Unnamed: 0,Manager,City,Revenue,Expenses
0,Rao,Tucson,33422.12,27127.22
1,Montalbano,Chicago,45233.27,41322.13
2,Zhang,Miami,36234.22,39123.45
3,Brown,New York,57322.83,41486.28
4,Achebe,Los Angeles,23490.81,22540.36


In [48]:
df['Revenue']

0    33422.12
1    45233.27
2    36234.22
3    57322.83
4    23490.81
Name: Revenue, dtype: float64

In [49]:
df['Profit'] = df['Revenue'] - df['Expenses']

In [50]:
df

Unnamed: 0,Manager,City,Revenue,Expenses,Profit
0,Rao,Tucson,33422.12,27127.22,6294.9
1,Montalbano,Chicago,45233.27,41322.13,3911.14
2,Zhang,Miami,36234.22,39123.45,-2889.23
3,Brown,New York,57322.83,41486.28,15836.55
4,Achebe,Los Angeles,23490.81,22540.36,950.45


In [52]:
df['Revenue'].iloc[4]

23490.81

In [57]:
# Look up row label 0 and column 'City' in one .loc call instead of chained indexing.
df.loc[0, 'City']

'Tucson'

In [60]:
# The comparison already yields a boolean Series; wrapping it in np.where(..., True, False) is redundant.
df['profit_bool'] = df['Profit'] >= 0

In [61]:
df

Unnamed: 0,Manager,City,Revenue,Expenses,Profit,profit_bool
0,Rao,Tucson,33422.12,27127.22,6294.9,True
1,Montalbano,Chicago,45233.27,41322.13,3911.14,True
2,Zhang,Miami,36234.22,39123.45,-2889.23,False
3,Brown,New York,57322.83,41486.28,15836.55,True
4,Achebe,Los Angeles,23490.81,22540.36,950.45,True


In [62]:
df[['Revenue','Expenses']]

Unnamed: 0,Revenue,Expenses
0,33422.12,27127.22
1,45233.27,41322.13
2,36234.22,39123.45
3,57322.83,41486.28
4,23490.81,22540.36


In [85]:
# Raw employee data, one array per field; position i in every array describes the same employee.
emp_id = np.array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111])
names = np.array(['Bill', 'Ludovica', 'Qing', 'Savitri', 'Giovanni', "Birgit",
                  "Bercù", "Elodie", "Gurumul", "Kwame", "Rosa", "João"])
bonus = np.array([232300.56, 478123.45, 3891.24, 98012.36, 52123.50, 0,
                  321000.23, 37345.22, 121200, 59621.33, 94123.5, 45123.2])
gender = np.array(['M', 'F', 'F', 'F', 'M', 'F', 'F', 'F', "M", "M", "F", "M"])
city = np.array(["New York", "Catania", "Paris", "New York", "Sydney", "Sydney",
                 "Paris", "New York", "Sydney", "Paris", "New York", "Paris"])
salary = np.array([455000, 722321, 95223, 135000, 132033, 700000, 832123,
                   78123.11, 13243.32, 456122.17, 912321.22, 31123])
columns = ["name", "gender", "city", "salary", "bonus"]

# Build the frame column-wise from a dict so salary and bonus keep their float64 dtype.
# The row-wise alternative, pd.DataFrame([names, gender, ...]).transpose(), flips rows
# into columns but leaves the numeric columns as object dtype, so it needs astype('float64')
# afterwards.
df = pd.DataFrame(
    dict(zip(columns, [names, gender, city, salary, bonus])),
    index=emp_id,  # employee ids become the row labels
)

df


Unnamed: 0,name,gender,city,salary,bonus
100,Bill,M,New York,455000.0,232300.56
101,Ludovica,F,Catania,722321.0,478123.45
102,Qing,F,Paris,95223.0,3891.24
103,Savitri,F,New York,135000.0,98012.36
104,Giovanni,M,Sydney,132033.0,52123.5
105,Birgit,F,Sydney,700000.0,0.0
106,Bercù,F,Paris,832123.0,321000.23
107,Elodie,F,New York,78123.11,37345.22
108,Gurumul,M,Sydney,13243.32,121200.0
109,Kwame,M,Paris,456122.17,59621.33


In [77]:
# Mean and median salary per gender. String names select pandas' own groupby
# implementations; passing np.mean / np.median callables is deprecated in newer pandas.
avg_salary_by_gender = df.groupby('gender')['salary'].agg(['mean', 'median'])

In [78]:
avg_salary_by_gender

Unnamed: 0_level_0,mean,median
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,496444.475714,700000.0
M,217504.298,132033.0


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 100 to 111
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    12 non-null     object 
 1   gender  12 non-null     object 
 2   city    12 non-null     object 
 3   salary  12 non-null     float64
 4   bonus   12 non-null     float64
dtypes: float64(2), object(3)
memory usage: 576.0+ bytes


In [88]:
# Mean salary for every (city, gender) pair; the result is a Series with a two-level index.
city_gender_group = df.groupby(by=['city', 'gender'])
cgg = city_gender_group['salary'].agg('mean')
cgg

city      gender
Catania   F        722,321.00
New York  F        375,148.11
          M        455,000.00
Paris     F        463,673.00
          M        243,622.58
Sydney    F        700,000.00
          M         72,638.16
Name: salary, dtype: float64

### Apply and Lambda Functions

In [89]:
df

Unnamed: 0,name,gender,city,salary,bonus
100,Bill,M,New York,455000.0,232300.56
101,Ludovica,F,Catania,722321.0,478123.45
102,Qing,F,Paris,95223.0,3891.24
103,Savitri,F,New York,135000.0,98012.36
104,Giovanni,M,Sydney,132033.0,52123.5
105,Birgit,F,Sydney,700000.0,0.0
106,Bercù,F,Paris,832123.0,321000.23
107,Elodie,F,New York,78123.11,37345.22
108,Gurumul,M,Sydney,13243.32,121200.0
109,Kwame,M,Paris,456122.17,59621.33


In [90]:
def ratio_group(df, index, threshold):
    """Label one row of ``df`` by its bonus-to-salary ratio.

    Parameters
    ----------
    df : DataFrame
        Must provide ``'bonus'`` and ``'salary'`` columns.
    index : label
        Row label looked up with ``df.loc``.
    threshold : number
        Ratio that must be strictly exceeded for a "High Bonus" label.

    Returns
    -------
    str
        ``"High Bonus"`` when bonus / salary > threshold, otherwise
        ``"Low Bonus"``. Rows that cannot be evaluated (missing label or
        column, non-numeric values, a non-scalar lookup, or a Python-level
        division by zero) are also labelled ``"Low Bonus"``.
    """
    try:
        # One .loc lookup per value instead of chained df.loc[index]['col'].
        ratio = df.loc[index, 'bonus'] / df.loc[index, 'salary']
        is_high = bool(ratio > threshold)
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # Best-effort grouping: only data/lookup problems are absorbed here;
        # anything else (e.g. KeyboardInterrupt) still propagates.
        is_high = False
    return "High Bonus" if is_high else "Low Bonus"

In [96]:
ratio_group(df, 110, .5)

'Low Bonus'

In [122]:
# ratio_group is already defined in an earlier cell, so it is not redefined here.
# groupby calls the lambda on each index label and groups rows by the label it returns.
groups = df.groupby(lambda x: ratio_group(df, x, 0.5))


In [115]:
# Remove any existing 'bonus_level' column. errors='ignore' keeps this cell safe to run
# before the column has been created (e.g. on a fresh kernel), and re-binding df
# avoids inplace mutation.
df = df.drop(columns=['bonus_level'], errors='ignore')

In [121]:
# Row-wise apply: label each employee by whether bonus / salary exceeds 0.5.
df['bonus_level'] = df.apply(
    lambda row: 'High Bonus' if row['bonus'] / row['salary'] > 0.5 else 'Low Bonus',
    axis='columns',
)

In [125]:
df

Unnamed: 0,name,gender,city,salary,bonus,bonus_level
100,Bill,M,New York,455000.0,232300.56,High Bonus
101,Ludovica,F,Catania,722321.0,478123.45,High Bonus
102,Qing,F,Paris,95223.0,3891.24,Low Bonus
103,Savitri,F,New York,135000.0,98012.36,High Bonus
104,Giovanni,M,Sydney,132033.0,52123.5,Low Bonus
105,Birgit,F,Sydney,700000.0,0.0,Low Bonus
106,Bercù,F,Paris,832123.0,321000.23,Low Bonus
107,Elodie,F,New York,78123.11,37345.22,Low Bonus
108,Gurumul,M,Sydney,13243.32,121200.0,High Bonus
109,Kwame,M,Paris,456122.17,59621.33,Low Bonus


In [126]:
# Overwrite bonus_level with three tiers based on the absolute bonus amount:
#   > 100000 -> High, > 50000 -> Medium, otherwise Low.
df['bonus_level'] = df.apply(
    lambda row: (
        "High" if row['bonus'] > 100000
        else 'Medium' if row['bonus'] > 50000
        else 'Low'
    ),
    axis='columns',
)

In [127]:
df

Unnamed: 0,name,gender,city,salary,bonus,bonus_level
100,Bill,M,New York,455000.0,232300.56,High
101,Ludovica,F,Catania,722321.0,478123.45,High
102,Qing,F,Paris,95223.0,3891.24,Low
103,Savitri,F,New York,135000.0,98012.36,Medium
104,Giovanni,M,Sydney,132033.0,52123.5,Medium
105,Birgit,F,Sydney,700000.0,0.0,Low
106,Bercù,F,Paris,832123.0,321000.23,High
107,Elodie,F,New York,78123.11,37345.22,Low
108,Gurumul,M,Sydney,13243.32,121200.0,High
109,Kwame,M,Paris,456122.17,59621.33,Medium


### Joins

In [128]:
# Department lookup table indexed by department Id, with a long name and a Head value per department.
depts = pd.DataFrame(
    {
        'LongName': ['Accounting', 'Admin', 'Sales', 'Quant Trading'],
        'Head': [102, 105, 108, 101],
    },
    index=pd.Index(['A1', 'A2', 'S1', 'Q1'], name='Id'),
)
depts

Unnamed: 0_level_0,LongName,Head
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1,Accounting,102
A2,Admin,105
S1,Sales,108
Q1,Quant Trading,101


In [129]:
# Left join: match each department's Head value against df's index and append that row's columns.
heads = depts.join(other=df, on='Head', how='left')


In [130]:
heads

Unnamed: 0_level_0,LongName,Head,name,gender,city,salary,bonus,bonus_level
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A1,Accounting,102,Qing,F,Paris,95223.0,3891.24,Low
A2,Admin,105,Birgit,F,Sydney,700000.0,0.0,Low
S1,Sales,108,Gurumul,M,Sydney,13243.32,121200.0,High
Q1,Quant Trading,101,Ludovica,F,Catania,722321.0,478123.45,High


### Reading HTML
Pandas has a function, `read_html`, for retrieving HTML tables from websites. This can save time because you don't have to scrape the page yourself.

In [137]:
# read_html downloads the page and returns its HTML tables as a list of DataFrames.
df_list = pd.read_html(io='https://www.x-rates.com/table/?from=USD&amount=1')

In [142]:
df_list[0]

Unnamed: 0,US Dollar,1.00 USD,inv. 1.00 USD
0,Euro,0.92,1.09
1,British Pound,0.81,1.23
2,Indian Rupee,81.76,0.01
3,Australian Dollar,1.42,0.71
4,Canadian Dollar,1.33,0.75
5,Singapore Dollar,1.31,0.76
6,Swiss Franc,0.92,1.09
7,Malaysian Ringgit,4.27,0.23
8,Japanese Yen,130.06,0.01
9,Chinese Yuan Renminbi,6.75,0.15


In [143]:
df_list[1]

Unnamed: 0,US Dollar,1.00 USD,inv. 1.00 USD
0,Argentine Peso,186.97,0.01
1,Australian Dollar,1.42,0.71
2,Bahraini Dinar,0.38,2.66
3,Botswana Pula,12.85,0.08
4,Brazilian Real,5.07,0.2
5,Bruneian Dollar,1.31,0.76
6,Bulgarian Lev,1.8,0.56
7,Canadian Dollar,1.33,0.75
8,Chilean Peso,796.17,0.0
9,Chinese Yuan Renminbi,6.75,0.15
