# Descriptive Statistics

---

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from matplotlib import rcParams

# Global matplotlib text settings, applied in one call (keys are set in the
# same order as before: family, size, minus-sign handling).
# NOTE(review): 'New Gulim' is presumably chosen so that Korean labels render;
# confirm the font is installed on the machine running the notebook.
rcParams.update({
    'font.family': 'New Gulim',
    'font.size': 10,
    # Draw negative tick labels with an ASCII hyphen rather than the Unicode
    # minus sign (presumably because the selected font lacks that glyph).
    'axes.unicode_minus': False,
})

## 1 Descriptive Statistics

In [None]:
# Visual Python: Data Analysis > File
# Read the descriptive-statistics sample data; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./data/02_1_기술통계.csv')

# Leave the frame as the cell's last expression so it is rendered as output.
df

In [None]:
# Visual Python: Descriptive Statistics
# All analysis below runs on a copy, so the loaded frame `df` is never touched.
vp_df = df.copy()

# Descriptive statistics
from IPython.display import display, Markdown
display(Markdown('### Descriptive statistics'))

# Boolean mask (one entry per column) marking the numeric columns; used to
# count missing values only where the other statistics are computed.
_is_numeric = vp_df.apply(pd.api.types.is_numeric_dtype)

# One entry per statistic. Every value is a Series indexed by column name,
# except the scalar row count, which pandas broadcasts to all columns.
_summary = {
    'N Total': vp_df.shape[0],
    'N Valid': vp_df.count(numeric_only=True),
    'N Missing': vp_df.loc[:, _is_numeric].isnull().sum(),
    'Mean': vp_df.mean(numeric_only=True),
    'Sum': vp_df.sum(numeric_only=True),
    'Std. deviation': vp_df.std(numeric_only=True),
}
for _label, _q in (
    ('Percentile: 25', 0.25),
    ('Percentile: 50', 0.50),
    ('Percentile: 75', 0.75),
):
    _summary[_label] = vp_df.quantile(q=_q, numeric_only=True)

# Round to 3 decimals and transpose: statistics become rows, variables columns.
display(pd.DataFrame(_summary).round(3).T)

# Frequency table
# For every column, show counts, percent of all rows, percent of non-missing
# rows, and the running total of the latter, followed by N Valid / N Missing /
# N Total summary rows.
display(Markdown('### Frequency table'))
for col in vp_df.columns:
    _values = vp_df[col]

    # Numeric columns with more than 10 distinct values are grouped into 10
    # equal-width bins; everything else is tabulated per distinct value.
    if pd.api.types.is_numeric_dtype(_values) and _values.nunique() > 10:
        _bins = 10
    else:
        _bins = None

    # Count once and reuse (missing values are excluded by value_counts).
    _counts = _values.value_counts(bins=_bins, sort=False)

    _dfr = pd.DataFrame({
        'Frequency': _counts,
        'Percent': 100 * (_counts / _values.size),
        'Valid percent': 100 * (_counts / _values.count()),
    })
    # Accumulate and total the *unrounded* percentages: rounding each row
    # first lets the error pile up, so the last cumulative value (and the
    # N Valid total) could drift away from 100.
    _dfr['Cumulative percent'] = _dfr['Valid percent'].cumsum()
    _dfr.loc['N Valid', :] = _dfr.iloc[:, :3].sum()
    _dfr.loc['N Missing', 'Frequency'] = _values.isnull().sum()
    _dfr.loc['N Total', 'Frequency'] = _values.size

    # Round only for presentation, after all arithmetic is done.
    _dfr = _dfr.round(2)
    display(_dfr)

# Charts
import seaborn as sns
import warnings
with warnings.catch_warnings():
    # Every warning category is silenced for the duration of the plotting
    # code only; the previous filters are restored when the block exits.
    warnings.simplefilter(action='ignore', category=Warning)

    # Histogram
    # One panel per column on 2x2 pages: a histogram with a KDE curve for
    # numeric columns with more than 10 distinct values, a count plot otherwise.
    idx = 1
    for col in vp_df.columns:
        plt.subplot(2, 2, idx)
        if pd.api.types.is_numeric_dtype(vp_df[col]) and vp_df[col].value_counts().size > 10:
            sns.histplot(data=vp_df, x=col, kde=True)
        else:
            sns.countplot(data=vp_df, x=col)

        if idx < 4:
            idx += 1
        else:
            # Page is full: render it and start the next one at slot 1.
            idx = 1
            plt.tight_layout()
            plt.show()

    # Render a partially filled last page. Without this, when the number of
    # columns is not a multiple of 4 the remaining panels are never laid out
    # and end up being emitted together with the scatter matrix below.
    if idx > 1:
        plt.tight_layout()
        plt.show()

    # Scatter matrix
    pd.plotting.scatter_matrix(vp_df, marker='o', hist_kwds={'bins': 30}, s=30, alpha=.8)
    plt.show()

    # Boxplot
    # Pass the frame by keyword: the first positional parameter of
    # sns.boxplot is not `data` in every seaborn release.
    sns.boxplot(data=vp_df)
    plt.show()

---

In [None]:
# End of file