## Python Plot

### Quantitative
- [Describe](#discribe)
- [Histogram](#histogram)
- [Density Plot](#density-plot)
- [Box Plot](#boxplots)
- [QQ Plot](#qq-plot)
- [Scatterplot Matrices](#scatterplot-matrices)
- [Correlation Plot](#correlation-plot)

### Categorical
- [Contingency Table](#contingency-table)
- [Mosaic Plot](#mosaic-plot)
- [Chi Square Test](#chi-square-test)
- [Fisher Exact Test](#fisher-exact-test)
- [Odds Ratio](#odds-ratio)
- [Tau for Ordinal Variable](#tau-for-ordinal-variable)

### Tutorial 
- [Regression](#regression)

In [None]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy import special
import statsmodels.api as sm # -> for QQ plot
from statsmodels.graphics.mosaicplot import mosaic # -> for mosaic plot

# Base directory that every data path below is appended to.
path = "../src/"

# The student file is semicolon-delimited, so read_csv's default sep=","
# has to be overridden (delimiter=";" is an accepted alias for sep=";").
stud_pref = pd.read_csv(path + "data/student/student-mat.csv", sep=";")

# Read with read_csv's default comma separator.
concrete = pd.read_csv(path + "data/concrete+slump+test/slump_test.data")

# Load the arrivals table and convert its 'date' column to datetimes.
er_arrivals = pd.read_csv(path + "data/er_arrivals.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

#### Discribe

`DataFrame.describe(percentiles=None, include=None, exclude=None)` — the `datetime_is_numeric` argument existed only in pandas 1.x and was removed in pandas 2.0.

In [None]:
stud_pref["G3"].describe()  # count, mean, std, min, quartiles and max of column G3

In [None]:
# Summary statistics for the two columns, one column of output per variable.
stud_pref.loc[:, ['Medu', 'G3']].describe()

In [None]:
# Summary statistics of G3 computed separately for each value of Medu.
stud_pref.groupby('Medu')[['G3']].describe()

In [None]:
# Same per-Medu summary, now for two columns (G3 first, then G1).
stud_pref.groupby('Medu')[['G3', 'G1']].describe()

#### Numerical Summary

In [None]:
# .loc label slices include both endpoints, so this selects every column from
# 'G1' through 'G3'; .shape is the (rows, columns) tuple of that selection.
stud_pref.loc[:,'G1':'G3'].shape

In [None]:
# Number of rows for each distinct G3 value, most frequent value first.
stud_pref["G3"].value_counts()

#### info

Prints information about a DataFrame (or, as in the cell below, a single Series), including the index dtype and columns, non-null values and memory usage.

In [None]:
stud_pref["G3"].info()  # prints the summary; the call itself returns None

### Histogram

In [None]:
# Series.hist returns a matplotlib Axes (not a Figure), so despite its name
# `fig` is the axes object whose title is set on the next line.
fig = stud_pref.G3.hist(grid=False)
fig.set_title('G3 Histogram')

In [None]:
# One G3 histogram per distinct Medu value on a 2x3 grid of panels;
# density=True normalises each panel so its bars integrate to 1.
stud_pref.G3.hist(by=stud_pref.Medu, figsize=(15, 10), density=True, layout=(2,3))

In [None]:
# Histograms of G1, G2 and G3 side by side (1 row x 3 columns), without grid lines.
stud_pref[['G1', 'G2', 'G3']].hist(layout=(1,3), figsize=(15,4), grid=False)

#### Density-Plot

In [None]:
# One kernel-density estimate of G3 per level of Medu, on a 2x3 grid of axes.
# squeeze=False guarantees `axs` is a 2-D array, so `axs.flat` walks the
# panels row by row.
fig, axs = plt.subplots(2, 3, squeeze=False, figsize=(15, 6))
out2 = stud_pref.groupby("Medu")
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
for index, (medu, df) in enumerate(out2):
    # Draw on the axes created above instead of requesting a new one from
    # pyplot; more than six groups raises IndexError rather than being dropped.
    tmp = axs.flat[index]
    df.G3.plot(kind='kde', ax=tmp)
    tmp.set_title(medu)

#### Boxplots

In [None]:
# Box plots of G3, one box per distinct value of 'goout'.
stud_pref.plot.box(column='G3', by='goout')

#### QQ Plot

In [None]:
concrete  # display the table as loaded, with its original column names

In [None]:
# Give the two awkwardly named columns short, attribute-friendly names.
# The renamed frame is bound back to `concrete` rather than mutated in place.
concrete = concrete.rename(
    columns={
        'No': 'id',
        'Compressive Strength (28-day)(Mpa)': 'Comp_Strength',
    }
)

In [None]:
# Q-Q plot against qqplot's default reference distribution (standard normal);
# line='q' adds a reference line fitted through the quartiles.
sm.qqplot(concrete.Comp_Strength, line='q')

#### Scatterplot Matrices

In [None]:
# Grid of pairwise scatter plots for the five selected columns; with the
# default settings the diagonal panels show each column's histogram.
pd.plotting.scatter_matrix(concrete[['Cement', 'Slag', 'Water', 'SLUMP(cm)', 'FLOW(cm)']], figsize=(8,8))

#### Correlation Plot

In [None]:
# Pairwise correlations of the five columns (DataFrame.corr defaults to Pearson).
corr = concrete[['Cement', 'Slag', 'Water', 'SLUMP(cm)', 'FLOW(cm)']].corr()
# Render the matrix as a colour-shaded table using the reversed 'coolwarm' colormap.
corr.style.background_gradient(cmap='coolwarm_r')

#### Contingency Table

In [None]:
# Observed counts: one row per treatment, one column per outcome
# (see the index/columns labels given to the DataFrame below).
tab = np.array([[4, 184], [2, 260]])
# Row proportions: divide every row by its own total. sum(axis=1) alone returns
# a 1-D array of shape (n_rows,), which would broadcast across columns instead
# of rows; keepdims=True keeps it as an (n_rows, 1) column for any table size.
prop = tab / tab.sum(axis=1, keepdims=True)
xx = pd.DataFrame(prop, columns=['nervous', 'not nervous'], index=['claritin', 'placebo'])
xx

In [None]:
# Grouped (side-by-side) bars of the row proportions.
# NOTE(review): `rot` is the tick-label rotation in degrees, so 1.0 and 0.5 are
# almost horizontal — presumably rot=0 was intended; confirm.
ax = xx.plot(kind='bar', stacked=False, rot=1.0, figsize=(8,4))
# Stacked version in a second figure. This assignment replaces `ax`, so the
# legend call below applies to the stacked chart only.
ax = xx.plot(kind='bar', stacked=True, rot=0.5, figsize=(5,4))
ax.legend(loc='upper left')

#### Mosaic Plot

In [None]:
# Observed counts for a 2x3 table (this rebinds the notebook-level name `tab`).
tab = np.asarray([[762,327,468], [484,239,477]])
# Tile areas are proportional to the cell counts. Per the statsmodels docs,
# statistic=True colours tiles by their deviation from an independence model;
# gap sets the spacing between tiles.
mosaic(tab, statistic=True, gap=0.05)

#### Chi Square Test

$$H_0: \text{The two variables are independent}$$
$$H_1: \text{The two variables are not independent}$$

Set Significance level $5\%$

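- If $p\text{-value} < 0.05$, reject $H_0$: there is evidence that the two variables are not independent.
- If $p\text{-value} \ge 0.05$, there is not enough evidence to reject $H_0$. This does not prove that the two variables are independent; it only means the data are consistent with independence.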

In [None]:
def chisq_test(array, alpha=0.05):
    """Run a chi-square test of independence and print a short report.

    Prints the p-value, the test statistic, whether every expected cell
    count is at least 5, and the conclusion at significance level ``alpha``.
    Nothing is returned.

    Parameters
    ----------
    array : array_like
        Contingency table of observed counts. It is passed unchanged to
        ``scipy.stats.chi2_contingency``, called with its default settings.
    alpha : float, optional
        Significance level, strictly between 0 and 1. Defaults to 0.05.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")

    chisq_output = stats.chi2_contingency(array)
    print(f"The p-value is {chisq_output.pvalue:.4f}")
    print(f"The test statistic value is {chisq_output.statistic:.4f}.")

    # The chi-square approximation is usually considered unreliable when any
    # expected count is below 5, so report whether that assumption holds.
    if np.any(chisq_output.expected_freq < 5):
        print("Expected cell count assumption violated!")
    else:
        print("Expected cell counts all at least 5.")

    # Conclusion: a large p-value means H0 could not be rejected, which is
    # not the same as having shown that the variables are independent.
    if chisq_output.pvalue < alpha:
        print("Conclusion: The two variables are not independent.")
    else:
        print("Conclusion: Not enough evidence to reject independence of the two variables.")
    print("---")

In [None]:
array = np.array([[46, 474], [37, 516]])
chisq_output = stats.chi2_contingency(array)
# The result exposes its fields as attributes. In a notebook only the value of
# the last expression in a cell is displayed, so only expected_freq is shown here.
chisq_output.pvalue # pvalue
chisq_output.statistic # test statistic value
chisq_output.expected_freq # expected cell counts

$\chi^2$ Test for an $r \times c$ table

In [None]:
# A 3x2 table: chi2_contingency accepts any r x c table of counts.
rc = np.array([[46, 474], [37, 516], [50, 75]])
rc_output = stats.chi2_contingency(rc)
rc_output.pvalue  # not displayed: it is not the last expression in the cell

# Print the report for the same table using the helper defined earlier in this notebook.
chisq_test(rc)

#### Fisher Exact Test

When any expected cell count is less than 5, use Fisher's exact test instead of the $\chi^2$ test.

In [None]:
def fisher_test(array, alpha=0.05):
    """Run Fisher's exact test on a contingency table and print a short report.

    Prints the p-value, the test statistic returned by
    ``scipy.stats.fisher_exact`` and the conclusion at significance level
    ``alpha``. Nothing is returned.

    Parameters
    ----------
    array : array_like
        Contingency table of observed counts. It is passed unchanged to
        ``scipy.stats.fisher_exact``, called with its default settings.
    alpha : float, optional
        Significance level, strictly between 0 and 1. Defaults to 0.05.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")

    fisher_output = stats.fisher_exact(array)
    print(f"The p-value is {fisher_output.pvalue:.4f}")
    print(f"The test statistic value is {fisher_output.statistic:.4f}.")

    # Conclusion: a large p-value means H0 could not be rejected, which is
    # not the same as having shown that the variables are independent.
    if fisher_output.pvalue < alpha:
        print("Conclusion: The two variables are not independent.")
    else:
        print("Conclusion: Not enough evidence to reject independence of the two variables.")
    print("---")

In [None]:
# Same 2x2 counts as the contingency-table example (rebinds the name `tab`).
tab = np.array([[4, 184], [2, 260]])
fisher_output = stats.fisher_exact(tab)
# Neither attribute is displayed: they are not the last expression in the cell.
fisher_output.statistic
fisher_output.pvalue

# Print the report for the same table using the helper defined earlier in this notebook.
fisher_test(tab)

#### Odds Ratio

In [None]:
array = np.array([[46, 474], [37, 516]])
# Table2x2 wraps a 2x2 table of counts; summary() displays its estimates.
tab2 = sm.stats.Table2x2(array)
tab2.summary()
# odds ratio is 1.353  (= 46*516 / (474*37))

#### Tau for Ordinal Variable

In [None]:
# Cross-tabulated counts of two ordinal variables (rows x columns).
us_svy_tab = np.array([[1, 3, 10, 6],
                       [2, 3, 10, 7],
                       [1, 6, 14, 12],
                       [0, 1,  9, 11]])

dim1 = us_svy_tab.shape
# kendalltau needs one (x, y) pair per observation, so expand the table:
# cell (i, j) with count c contributes c copies of the pair (i, j).
# np.indices gives every cell's row and column index; ravel() flattens them
# in row-major order and np.repeat duplicates each index by its cell count.
row_idx, col_idx = np.indices(dim1)
cell_counts = us_svy_tab.ravel()
x = np.repeat(row_idx.ravel(), cell_counts).tolist()
y = np.repeat(col_idx.ravel(), cell_counts).tolist()

kt_output = stats.kendalltau(x, y)
print(f"The estimate of tau-b is {kt_output.statistic:.4f}.")

#### Trimmed Mean

In [None]:
# Mean of G3 after discarding the lowest 10% and the highest 10% of the sorted values.
stats.trim_mean(stud_pref.G3, proportiontocut=0.1)

#### Winsorized Mean

In [None]:
# A scalar `limits` applies to both tails: the lowest and highest 10% of values
# are replaced by the nearest remaining value, and the result is then averaged.
stats.mstats.winsorize(stud_pref.G3, limits=0.1).mean()

#### Standard Deviation

In [None]:
stud_pref.G3.std()  # sample standard deviation (pandas defaults to ddof=1)

#### MAD

$$\hat{\sigma} = \frac{1}{0.6745} MAD(X)$$

In [None]:
# Raw median absolute deviation (default scale=1.0), i.e. MAD(X) itself and not
# the sigma estimate MAD/0.6745; passing scale='normal' applies that rescaling.
stats.median_abs_deviation(stud_pref.G3)

#### IQR

$$\hat{\sigma} = \frac{1}{1.35} IQR $$

In [None]:
# Raw interquartile range (75th minus 25th percentile), i.e. IQR itself and not
# the sigma estimate IQR/1.35; passing scale='normal' applies that rescaling.
stats.iqr(stud_pref.G3)

### Regression

In [None]:
# Step 1: Extract Yk and compute Xk
Yk = er_arrivals['num_arrivals']
Xk = Yk.value_counts().sort_index() # Frequency table
k = Xk.index.to_numpy() # Unique arrival values
Xk = Xk.to_numpy() # Counts for each unique value

# Step 2: Compute N and phi
N = len(Yk)
phi = special.gammaln(k + 1) + np.log(Xk / N)

# Step 3: Compute lam_hat from slope
slope, intercept, _, _, _ = stats.linregress(k, phi)
lam_hat = np.exp(slope)

In [None]:
# Step 4: Plotting
plt.scatter(k, phi, label=r'$\phi_k$', color='black', s=50);
plt.plot(k, slope * k + intercept, linestyle='--', color='blue', label=f"Slope: {slope:.2f}")
plt.xlabel('k')
plt.ylabel(r'$\phi_k$')
plt.title("Poisson-ness for E.R. arrivals")
plt.legend()