# Descriptive Statistics: 
### Measure of central tendancy: Mean, Median, Mode
### Measure of variation: SD, var, z-score, CV
### Measure of position: quartiles

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as st
import scipy
from scipy import stats
from scipy.stats import zscore, kurtosis, variation, scoreatpercentile
from scipy.stats.mstats import gmean  

In [2]:
import statistics
print(st.mean([1, 2, 3, 4, 4]))
print(st.stdev([1, 2, 3, 4, 4]))

2.8
1.3038404810405297


## Read the data

In [3]:
df = pd.read_csv("Data.csv")
df

Unnamed: 0,Annual income
0,62000.0
1,64000.0
2,49000.0
3,324000.0
4,1264000.0
5,54330.0
6,64000.0
7,51000.0
8,55000.0
9,48000.0


In [4]:
df.describe()

Unnamed: 0,Annual income
count,11.0
mean,189848.2
std,365285.4
min,48000.0
25%,52000.0
50%,55000.0
75%,64000.0
max,1264000.0


In [5]:
df.mean()

Annual income    189848.181818
dtype: float64

In [6]:
df.mean().round()

Annual income    189848.0
dtype: float64

## Geometric & Harmonic mean

In [7]:
stats.gmean(df)

array([86292.96812607])

In [17]:
stats.gmean(df.iloc[:, 0:1], axis=0) # axis=0 gives you the gm col wise

array([86292.96812607])

In [18]:
stats.gmean(df.iloc[:, 0:1], axis=1) # axis=1 gives you the gm row wise

array([  62000.,   64000.,   49000.,  324000., 1264000.,   54330.,
         64000.,   51000.,   55000.,   48000.,   53000.])

In [23]:
stats.hmean(df) # harmonic mean

array([65647.45574851])

## Median

In [24]:
df.median()

Annual income    55000.0
dtype: float64

### Mode

In [25]:
df.mode()

Unnamed: 0,Annual income
0,64000.0


In [14]:
range=df.max()-df.min()
range

Annual income    1216000.0
dtype: float64

## Variance Sample

In [33]:
df.var()

Annual income    1.334334e+11
dtype: float64

In [None]:
## Variance Population 

https://www.statisticshowto.datasciencecentral.com/population-variance/

## Standard Deviation

In [11]:
#df['Annual income'].std()
df.std()

Annual income    365285.380951
dtype: float64

In [34]:
s=df.var()
np.sqrt(s) ## SD using formula

Annual income    365285.380951
dtype: float64

In [20]:
np.std(df)

Annual income    348285.945135
dtype: float64

## Measure of Position

In [18]:
df.quantile() # default quantile is Q2

Annual income    55000.0
Name: 0.5, dtype: float64

In [38]:
df.quantile(q=0.50, axis=0,)
# default axis=0

Annual income    55000.0
Name: 0.5, dtype: float64

In [22]:
df.describe().loc['50%']

Annual income    55000.0
Name: 50%, dtype: float64

In [20]:
df.quantile(q=0.2, axis=0)

Annual income    51000.0
Name: 0.2, dtype: float64

In [4]:
df.quantile([0.2, 0.5])

Unnamed: 0,Annual income
0.2,51000.0
0.5,55000.0


In [15]:
IQR=df.quantile(0.75) - df.quantile(0.25)
IQR

Annual income    12000.0
dtype: float64

In [5]:
scoreatpercentile(df, 50)

55000.0

## z-scores

In [39]:
zscore(df) # using scipy
# directly using the function

array([[-0.36707821],
       [-0.3613358 ],
       [-0.40440386],
       [ 0.38517724],
       [ 3.084109  ],
       [-0.38910035],
       [-0.3613358 ],
       [-0.39866146],
       [-0.38717664],
       [-0.40727507],
       [-0.39291905]])

In [21]:
stats.zscore(df)

array([[-0.36707821],
       [-0.3613358 ],
       [-0.40440386],
       [ 0.38517724],
       [ 3.084109  ],
       [-0.38910035],
       [-0.3613358 ],
       [-0.39866146],
       [-0.38717664],
       [-0.40727507],
       [-0.39291905]])

In [4]:
zscore=(df-df.mean())/df.std()
zscore

Unnamed: 0,Annual income
0,-0.349995
1,-0.34452
2,-0.385584
3,0.367252
4,2.940583
5,-0.370993
6,-0.34452
7,-0.380109
8,-0.369158
9,-0.388322


## Multivariate analysis

In [34]:
import pandas as pd
iris = pd.read_csv('iris.data.csv', header=None, 
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'])

In [35]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [36]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [37]:
iris.describe(include="all")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,Iris-virginica
freq,,,,,50
mean,5.843333,3.054,3.758667,1.198667,
std,0.828066,0.433594,1.76442,0.763161,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [38]:
np.cov(iris['sepal_length'],iris['sepal_width'])[0,1]

-0.03926845637583892

In [39]:
iris.corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.109369,0.871754,0.817954
sepal_width,-0.109369,1.0,-0.420516,-0.356544
petal_length,0.871754,-0.420516,1.0,0.962757
petal_width,0.817954,-0.356544,0.962757,1.0


In [42]:
iris_specis = pd.DataFrame(iris['species'].value_counts())
iris_specis

Unnamed: 0,species
Iris-virginica,50
Iris-setosa,50
Iris-versicolor,50


In [46]:
iris_specis['Relative Frequency'] = iris_specis['species'] 
iris_specis['Relative Frequency'] = iris_specis['Relative Frequency'].apply(lambda x: (x/iris_specis['species'].sum())*100)
iris_specis

Unnamed: 0,species,Relative Frequency
Iris-virginica,50,33.333333
Iris-setosa,50,33.333333
Iris-versicolor,50,33.333333


In [51]:
iris_specis['Relative Frequency'].sum()

99.99999999999999

In [50]:
iris_specis['Cumulative Frequency'] = iris_specis['Relative Frequency'].cumsum()
iris_specis

Unnamed: 0,species,Relative Frequency,Cumulative Frequency
Iris-virginica,50,33.333333,33.333333
Iris-setosa,50,33.333333,66.666667
Iris-versicolor,50,33.333333,100.0


In [32]:
plt.figure(figsize=(6,5))
sns.heatmap(corr,annot=True, cmap='plasma',vmin=-1,vmax=1, ) # corr instead of cor if all columns involved, cmap:coolwarm is good
# annot = True : mention correlation values in the heatmap.
plt.show()

NameError: name 'corr' is not defined

<Figure size 432x360 with 0 Axes>

In [8]:
pd.crosstab(iris['petal_length'] > 3.758667, iris['petal_width'] > 1.198667)

petal_width,False,True
petal_length,Unnamed: 1_level_1,Unnamed: 2_level_1
False,56,1
True,4,89
