## Function Application and Mapping

In [1]:
import pandas as pd
import numpy as np

In [3]:
# NumPy ufuncs (element-wise array methods) also work with pandas objects
frame = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
frame

Unnamed: 0,b,d,e
Utah,-2.594322,-1.336589,1.273112
Ohio,-0.343685,-1.410702,1.74753
Texas,0.172009,0.368404,-0.851919
Oregon,-0.964851,1.844986,1.389498


In [4]:
# The ufunc is applied to every element; row and column labels are carried over unchanged
np.abs(frame)

Unnamed: 0,b,d,e
Utah,2.594322,1.336589,1.273112
Ohio,0.343685,1.410702,1.74753
Texas,0.172009,0.368404,0.851919
Oregon,0.964851,1.844986,1.389498


In [6]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is whatever ``DataFrame.apply`` hands over: one column per call by
    default, or one row per call when ``axis='columns'`` is passed.
    """
    return x.max() - x.min()
# apply calls f once per column by default, so the result has one entry per column (b, d, e)
frame.apply(f)

b    2.766330
d    3.255688
e    2.599450
dtype: float64

In [7]:
# Passing axis='columns' invokes the function once per row instead
frame.apply(f, axis='columns')

Utah      3.867434
Ohio      3.158232
Texas     1.220323
Oregon    2.809837
dtype: float64

In [16]:
frame.max(axis='columns').max()  # largest value found anywhere in frame

1.8449860448817152

In [17]:
# The function handed to apply is not limited to scalars: it may return a Series carrying several values
def f(x):
    """Return the smallest and largest value of ``x`` as a Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series(data=[lowest, highest], index=['min', 'max'])

# Each column now yields a two-entry Series, so the result is a DataFrame with rows 'min' and 'max'
frame.apply(f)

Unnamed: 0,b,d,e
min,-2.594322,-1.410702,-0.851919
max,0.172009,1.844986,1.74753


In [24]:
# Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from
# each floating-point value in frame. You can do this with applymap.
# NOTE: the name `format` shadows the Python builtin; it is kept because later cells call it by this name.
def format(x):
    """Render a number as a string with two decimal places, e.g. 1.844986 -> '1.84'."""
    return '%.2f' % x
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap now emits a FutureWarning);
# call whichever element-wise method this pandas version provides.
(frame.map if hasattr(frame, 'map') else frame.applymap)(format)

Unnamed: 0,b,d,e
Utah,-2.59,-1.34,1.27
Ohio,-0.34,-1.41,1.75
Texas,0.17,0.37,-0.85
Oregon,-0.96,1.84,1.39


In [30]:
# Series.map applies the same element-wise function to a single column
frame['e'].map(format)

Utah       1.27
Ohio       1.75
Texas     -0.85
Oregon     1.39
Name: e, dtype: object

## Sorting and Ranking

In [32]:
# Small Series whose index labels are not in alphabetical order
obj = pd.Series(data=range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [33]:
# sort_index orders the entries by index label (a, b, c, d here)
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [36]:
# 2 x 4 frame whose row labels and column labels are both unsorted
frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [38]:
# With no arguments a DataFrame is sorted by its row labels
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [39]:
# axis=1 sorts by the column labels instead
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [40]:
# Data is sorted in ascending order by default, but can be sorted in descending order too
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [41]:
# To sort a Series by its values, use its sort_values method
obj = pd.Series(data=[4, 7, -3, 2])
obj.sort_values(ascending=True)

2   -3
3    2
0    4
1    7
dtype: int64

In [42]:
# Missing values are placed at the end of the sorted Series by default
obj = pd.Series(data=[4, np.nan, 7, np.nan, -3, 2])
obj.sort_values(na_position='last')

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [45]:
# When sorting a DataFrame, you can use the data in one or more columns as the sort key
frame = pd.DataFrame(data=dict(b=[4, 7, -3, 2], a=[0, 1, 2, 1]))
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,2
3,2,1


In [46]:
# by= names the column whose values determine the row order
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,2
3,2,1
0,4,0
1,7,1


In [47]:
# To sort by multiple columns, pass a list of names; later names break ties in earlier ones
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
0,4,0
3,2,1
1,7,1
2,-3,2


## Axis Indexes with Duplicate Labels

In [48]:
# Index labels do not have to be unique: 'a' and 'b' each appear twice here
obj = pd.Series(data=range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [49]:
# is_unique reports whether every index label occurs only once (here 'a' and 'b' repeat)
obj.index.is_unique

False

In [50]:
# Selecting a duplicated label returns a Series holding every matching entry, not a scalar
obj['a']

a    0
a    1
dtype: int64

In [73]:
# Small frame with missing values in both columns (row 'c' is entirely NaN)
df = pd.DataFrame(
    data={
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [71]:
# Not run: df[pd.isnull(df)] = 0 would overwrite every NaN with 0, and the cells below rely on the NaNs being present

In [74]:
# Calling DataFrame's sum method returns a Series containing column sums;
# the NaN entries are skipped (e.g. one: 1.4 + 7.1 + 0.75 = 9.25)
df.sum()

one    9.25
two   -5.80
dtype: float64

In [75]:
# NA values are excluded unless the entire slice (row or column in this case) is NA.
# This can be disabled with the skipna option: with skipna=False, any row containing a NaN yields NaN.
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [76]:
# Some methods, like idxmax, return indirect statistics: the index label at which the maximum occurs
df.idxmax()

one    b
two    d
dtype: object

In [77]:
# Running total down each column; positions that were NaN stay NaN and do not reset the total
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [78]:
# describe produces several summary statistics in one call (count excludes the NaN entries)
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [79]:
# On non-numeric data, describe produces alternative summary statistics
obj = pd.Series(data=list('aabc') * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## Correlation and Covariance

In [12]:
import pandas_datareader as web

# Download one history frame per ticker through pandas-datareader's Yahoo reader (needs network access)
all_data = dict(
    (symbol, web.get_data_yahoo(symbol))
    for symbol in ['AAPL', 'IBM', 'MSFT', 'GOOG']
)

# One column per ticker: adjusted closing prices and traded volumes
price = pd.DataFrame(
    {symbol: history['Adj Close'] for symbol, history in all_data.items()}
)
volume = pd.DataFrame(
    {symbol: history['Volume'] for symbol, history in all_data.items()}
)

In [13]:
# Percent change of each price relative to the previous row, i.e. the period-over-period return
returns = price.pct_change()

In [14]:
# Show the last rows of the returns table
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-09-12,0.038508,0.011379,0.008281,0.000805
2022-09-13,-0.05868,-0.026098,-0.054978,-0.05864
2022-09-14,0.009555,0.003458,0.000913,0.005318
2022-09-15,-0.01893,-0.017229,-0.027119,-0.018608
2022-09-16,-0.01096,0.014184,-0.002608,-0.002599


The `corr` method of Series computes the correlation of the overlapping, non-NA, aligned-by-index values in two Series.
Relatedly, `cov` computes the covariance.

In [16]:
# Correlation between the MSFT and IBM return series
returns['MSFT'].corr(returns['IBM'])

0.481158727158707

In [18]:
# Covariance between the MSFT and IBM return series
returns['MSFT'].cov(returns['IBM'])

0.0001561756037318457

In [20]:
# DataFrame's corr and cov methods, on the other hand, return a full correlation or
# covariance matrix as a DataFrame
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.440124,0.763195,0.688895
IBM,0.440124,1.0,0.481159,0.448498
MSFT,0.763195,0.481159,1.0,0.790191
GOOG,0.688895,0.448498,0.790191,1.0


In [21]:
# Full covariance matrix across the four tickers
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000416,0.000154,0.000294,0.000266
IBM,0.000154,0.000296,0.000156,0.000146
MSFT,0.000294,0.000156,0.000356,0.000282
GOOG,0.000266,0.000146,0.000282,0.000358


In [22]:
# corrwith correlates every column with the given Series; the IBM entry is 1.0 because it is paired with itself
returns.corrwith(returns.IBM)

AAPL    0.440124
IBM     1.000000
MSFT    0.481159
GOOG    0.448498
dtype: float64