# Summarizing and Computing Descriptive Statistics

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small 4x2 float frame with deliberate gaps: row 'a' has no 'two' value and
# row 'c' has no values at all, so NaN handling is visible in every example.
dataframe = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)

In [3]:
# Display the frame; the np.nan entries show up as missing values
dataframe

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


## 1. Sum and Mean

In [4]:
# Default direction (axis=0 / 'index'): add down the rows, giving one total per
# column. Missing values are skipped.
dataframe.sum(axis='index')

one    9.25
two   -5.80
dtype: float64

In [5]:
# axis=1 / 'columns': add across the columns, giving one total per row.
# A row made up entirely of NaN (row 'c') sums to 0.
dataframe.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [6]:
# skipna=False: any NaN in a row makes that row's mean NaN instead of being ignored
dataframe.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [7]:
# idxmax returns, for each column, the index label of the row holding the
# maximum value (idxmin is the counterpart for the minimum).
dataframe.idxmax()

one    b
two    d
dtype: object

In [8]:
# Running total down each column; positions that are NaN stay NaN in the result
dataframe.cumsum(axis='index')

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


## 2. Describe

In [9]:
# Summary statistics for each column (axis=0): count, mean, std, min,
# quartiles and max. count excludes NaN, hence 3 for 'one' and 2 for 'two'.
dataframe.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [10]:
# A string-valued Series: describe() on non-numeric data yields a different
# set of summary statistics (count / unique / top / freq).
series = pd.Series(4 * ['a', 'a', 'b', 'c'])
series

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [11]:
# Convert the describe() result to a one-column frame so it renders as a table
series.describe().to_frame()

Unnamed: 0,0
count,16
unique,3
top,a
freq,8


## 3. Correlation and Covariance
* Correlation and covariance are computed from pairs of arguments (here, pairs of columns).

In [12]:
# The next cell imports pandas_datareader. If the package is missing, install it
# into the kernel's environment first, e.g.: %pip install pandas-datareader

In [13]:
import pandas_datareader.data as web

# Download one frame per ticker through pandas-datareader (needs network access).
# No start/end is passed, so the date range is whatever the library defaults to.
# NOTE(review): pin start/end if the outputs below must be reproducible.
all_data = {symbol: web.get_data_yahoo(symbol)
            for symbol in ('AAPL', 'IBM', 'MSFT', 'GOOG')}

# Adjusted close prices, one column per ticker
price = pd.DataFrame({symbol: frame['Adj Close']
                      for symbol, frame in all_data.items()})

# Traded volume, one column per ticker
volume = pd.DataFrame({symbol: frame['Volume']
                       for symbol, frame in all_data.items()})

In [14]:
# Row-over-row fractional change of each ticker's adjusted close price
returns = price.pct_change()
# Show the most recent rows
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-24,-0.004864,-0.004097,0.001149,0.002152
2022-01-25,-0.011385,0.056513,-0.026588,-0.027893
2022-01-26,-0.000563,-0.01352,0.028493,0.019762
2022-01-27,-0.002943,-0.01296,0.010549,-0.000921
2022-01-28,0.069778,0.014941,0.028082,0.032284


### 3.1 corr()

In [15]:
# Correlation between two columns, selected with bracket indexing
returns['MSFT'].corr(returns['IBM'])

0.4848624930907788

In [16]:
# Same computation with attribute-style column access (works because the
# column names are valid Python identifiers); the result is identical.
returns.MSFT.corr(returns.IBM)

0.4848624930907788

### 3.2 cov()

In [17]:
# Covariance between two columns, selected with bracket indexing
returns['MSFT'].cov(returns['IBM'])

0.00014110988809420624

In [18]:
# Same covariance with attribute-style column access; the result is identical
returns.MSFT.cov(returns.IBM)

0.00014110988809420624

### 3.3 Full matrix of correlation and covariance

In [19]:
# Correlation for every pair of columns: a symmetric matrix with 1.0 on the diagonal
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.426739,0.736716,0.661593
IBM,0.426739,1.0,0.484862,0.458422
MSFT,0.736716,0.484862,1.0,0.782259
GOOG,0.661593,0.458422,0.782259,1.0


In [20]:
# Covariance for every pair of columns, returned as a symmetric matrix
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000374,0.000138,0.000249,0.000219
IBM,0.000138,0.000278,0.000141,0.000131
MSFT,0.000249,0.000141,0.000305,0.000234
GOOG,0.000219,0.000131,0.000234,0.000293


In [21]:
# Correlation of each column with a single Series (here the IBM column);
# IBM against itself is therefore 1.0
returns.corrwith(returns.IBM)

AAPL    0.426739
IBM     1.000000
MSFT    0.484862
GOOG    0.458422
dtype: float64

## 4. Unique Values, Value Counts, and Membership

### 4.1 Series

In [22]:
# Nine single-letter labels; unique() lists the distinct values in order of
# first appearance.
series = pd.Series(list('cadaabbcc'))
series.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [23]:
# Frequency of each distinct value, most frequent first
series.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [24]:
# value_counts() orders its result by frequency by default; sort=False skips
# that ordering. It does NOT sort lexicographically -- chain .sort_index() to
# get the counts in label order.
# The Series method is used because the top-level pd.value_counts function is
# deprecated in recent pandas releases.
series.value_counts(sort=False)

b    2
d    1
c    3
a    3
dtype: int64

In [25]:
# Boolean Series: True where the element is one of the listed values
mask = series.isin(list('bc'))
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [26]:
# Keep only the elements flagged True in the mask; original index labels are preserved
series.loc[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

### 4.2 DataFrame

In [27]:
# Five rows of integer values across three columns, written row by row
data = pd.DataFrame(
    [[1, 2, 1],
     [3, 3, 5],
     [4, 1, 2],
     [3, 2, 4],
     [4, 3, 4]],
    columns=['Qu1', 'Qu2', 'Qu3'],
)

In [28]:
# Display the frame built in the previous cell
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [29]:
# Count how often each value occurs in every column. The per-column counts are
# aligned on the union of observed values, so a value that never occurs in a
# column comes back as NaN; fillna(0) turns those gaps into zero counts.
# pd.Series.value_counts is passed instead of the top-level pd.value_counts,
# which is deprecated in recent pandas releases.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
