# 5.3 Summarizing and Computing Descriptive Statistics

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small frame with missing values, reused by the reduction examples below.
rows = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(rows, index=list('abcd'), columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# Column totals: the default reduction runs down the rows and skips NaN.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [4]:
# Row totals (sum across the columns); the all-NaN row 'c' sums to 0.0 here.
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [5]:
# Row means with NaN entries ignored; only the all-NaN row 'c' yields NaN.
df.mean(axis='columns', skipna=True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [6]:
# Row means with NaN propagated: any NaN in a row makes that row's mean NaN.
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

![Options for reduction methods](./snaps/5.3.PNG)

In [7]:
# Index label at which each column attains its maximum value.
df.idxmax()

one    b
two    d
dtype: object

In [8]:
# Running total down each column; NaN positions stay NaN but the accumulation
# carries on past them.
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [9]:
# Several summary statistics at once (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [10]:
# Sixteen string values: the pattern a, a, b, c repeated four times.
obj = pd.Series(list('aabc') * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [11]:
# On non-numeric data describe() reports count / unique / top / freq instead.
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

![Descriptive and summary statistics](./snaps/5.3a.PNG)

## 5.3.1 Correlation and Covariance

In [12]:
import pandas_datareader.data as web

# Pin the download window so a re-run fetches the same dates as the outputs
# shown below (first row 2017-05-01, last row 2022-04-28). Without start/end
# the reader falls back to a window relative to the day the cell is executed.
START, END = '2017-05-01', '2022-04-28'
TICKERS = ['AAPL', 'IBM', 'MSFT', 'GOOG']

# One daily frame per ticker, fetched from Yahoo Finance (needs network access).
all_data = {ticker: web.get_data_yahoo(ticker, start=START, end=END) for ticker in TICKERS}

# Re-shape into one column per ticker for adjusted close and for volume.
price = pd.DataFrame({ticker: frame['Adj Close'] for ticker, frame in all_data.items()})
volume = pd.DataFrame({ticker: frame['Volume'] for ticker, frame in all_data.items()})

In [13]:
# Percent change between consecutive rows of price; show the last five rows.
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-04-22,-0.027821,-0.011441,-0.024144,-0.042609
2022-04-25,0.006737,0.006148,0.024413,0.030398
2022-04-26,-0.037328,-0.02092,-0.037404,-0.030377
2022-04-27,-0.001467,-0.008297,0.048109,-0.037534
2022-04-28,0.045155,0.005035,0.022633,0.038176


In [14]:
# Correlation between the MSFT and IBM return Series.
returns['MSFT'].corr(returns['IBM'])

0.4769986851074962

In [15]:
# Covariance between the MSFT and IBM return Series.
returns['MSFT'].cov(returns['IBM'])

0.00014533999652160604

In [16]:
# Same computation as returns['MSFT'].corr(returns['IBM']), written with
# attribute-style column access.
returns.MSFT.corr(returns.IBM)

0.4769986851074962

In [17]:
# Full correlation matrix across all columns of returns.
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.427851,0.746122,0.671685
IBM,0.427851,1.0,0.476999,0.450884
MSFT,0.746122,0.476999,1.0,0.782726
GOOG,0.671685,0.450884,0.782726,1.0


In [18]:
# Full covariance matrix across all columns of returns.
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000387,0.000142,0.000265,0.000236
IBM,0.000142,0.000284,0.000145,0.000136
MSFT,0.000265,0.000145,0.000327,0.000253
GOOG,0.000236,0.000136,0.000253,0.000318


In [19]:
# Correlate every column of returns with the single IBM column.
returns.corrwith(returns.IBM)

AAPL    0.427851
IBM     1.000000
MSFT    0.476999
GOOG    0.450884
dtype: float64

In [20]:
# Passing a DataFrame pairs up columns by name: each ticker's returns are
# correlated with that same ticker's volume.
returns.corrwith(volume)

AAPL   -0.079991
IBM    -0.101966
MSFT   -0.067466
GOOG   -0.098366
dtype: float64

In [21]:
# axis="columns" computes the correlation row by row instead, giving one
# value per Date.
returns.corrwith(volume, axis="columns")

Date
2017-05-01         NaN
2017-05-02    0.637537
2017-05-03   -0.407847
2017-05-04   -0.702918
2017-05-05    0.738403
                ...   
2022-04-22    0.012549
2022-04-25   -0.464784
2022-04-26   -0.758451
2022-04-27    0.600149
2022-04-28    0.618941
Length: 1259, dtype: float64

## 5.3.2 Unique Values, Value Counts, and Membership

In [22]:
# Sample Series with repeated labels for the unique / value_counts examples.
obj = pd.Series(list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [23]:
# Distinct values as an array, in order of first appearance (not sorted).
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [24]:
# ndarray.sort() reorders the array in place, so display it on its own line.
uniques.sort()
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [25]:
# Frequency of each distinct value, most frequent first.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [26]:
# Count occurrences of the raw values without ordering by frequency.
# The Series method is used because the top-level pd.value_counts function
# is deprecated in recent pandas releases.
pd.Series(obj.values).value_counts(sort=False)

c    3
a    3
d    1
b    2
dtype: int64

In [27]:
# Redisplay obj for reference before building the membership mask.
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [28]:
# Boolean mask: True where the value is 'b' or 'c'.
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [29]:
# Keep only the entries where mask is True.
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [30]:
# For each element of to_match, find its position within unique_vals.
unique_vals = pd.Series(list('cba'))
to_match = pd.Series(list('cabbca'))
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

![Unique, value counts, and set membership methods](./snaps/5.3b.PNG)

In [31]:
# Three columns of integer responses, one row per respondent.
data = pd.DataFrame(
    {
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    }
)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [32]:
# Tabulate how often each value occurs in every column. A value that never
# appears in a column comes back as NaN after alignment, so fill it with 0.
# pd.Series.value_counts is passed instead of the top-level pd.value_counts,
# which is deprecated in recent pandas releases.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
