In [2]:
import numpy as np
import pandas as pd

## `.skew()`

- skew == 0 indicates symmetry
- skew < 0 indicates the longer tail is on the left side
- skew > 0 indicates the longer tail is on the right side

In [None]:
# Skewness of the age column; the sign tells which side the longer tail is on.
age_skew = df['age'].skew()
age_skew

## Shapiro-Wilk test - Test to see if the distribution of your data set is Gaussian-like

- Returns p-value
- H0 is that data came from a normally distributed population

In [None]:
from scipy.stats import shapiro

# shapiro() returns (statistic, p-value); only the p-value is shown here.
# A p-value below 0.05 means the data is not Gaussian-like.
_statistic, p_value = shapiro(df['age'])
p_value

In [None]:
# Box plot of age to eyeball outliers beyond the whiskers.
df['age'].plot.box()

In [None]:
def interquartile(df, col, k = 1.5):
    """Drop outlier rows of `df` using the interquartile-range rule on `col`.

    The fences are ``q1 - k * IQR`` and ``q3 + k * IQR``, where ``q1`` and
    ``q3`` are the 25th and 75th percentiles of ``df[col]`` and
    ``IQR = q3 - q1``. Missing values are ignored when the percentiles are
    computed. Both fences are printed before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Label of the numeric column the fences are computed from and
        applied to.
    k : float, default 1.5
        Multiplier of the IQR that sets how far the fences sit from the
        quartiles.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose `col` value lies strictly between the two
        fences. Rows where `col` is missing are excluded, because a
        comparison against NaN is always false.
    """
    # nanpercentile so that a few missing values do not turn both fences into NaN.
    q1, q3 = np.nanpercentile(df[col], [25, 75])
    cutoff = (q3 - q1) * k
    lower_fence = q1 - cutoff
    upper_fence = q3 + cutoff
    print(lower_fence, upper_fence)
    # Filter on `col` itself (not a hard-coded column) with a boolean mask,
    # which also works for column labels that are not valid identifiers.
    inside_fences = (df[col] > lower_fence) & (df[col] < upper_fence)
    return df[inside_fences]
                    
# Despite its name, `iqr` holds the DataFrame left after removing age outliers.
iqr = interquartile(df, 'age')

# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with a KDE overlay on a density scale draws the equivalent figure.
sns.histplot(iqr['age'], kde=True, stat='density');

## Normalization

- scaling all variables so that they take values from the same range
- Typically this range is [0,1]
- Done to minimize effects of bias (sampling or measurement errors)


### `MinMaxScaler`

- `.min()` == 0
- `.max()` == 1
- Argument must be a 2D numpy array

In [None]:
# The raw values of a Series come back as a 1D array.
age_values = iqr['age'].values
age_values[:5]

In [3]:
# Example of a 2D array: five rows, one column of uniform random numbers.
np.random.random_sample((5, 1))

array([[0.31356227],
       [0.63541487],
       [0.69035064],
       [0.79447823],
       [0.45991638]])

In [None]:
# reshape(-1, 1) turns the 1D values into a single column; the -1 tells NumPy
# to infer the number of rows from the length of the array.
age_column = iqr['age'].values.reshape(-1, 1)
age_column[:5]

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# The scaler takes a 2D array, so age is reshaped into a single column first.
scaler = MinMaxScaler()
rescaled = scaler.fit_transform(iqr['age'].values.reshape(-1, 1))
rescaled[:5]

In [None]:
# Flatten the scaled column back to 1D and wrap it in a Series.
pd.Series(rescaled.ravel()).head()

## Review: `round()`

In [None]:
# Unrounded fares, for comparison with the rounded versions below.
iqr['fare'].head(5)

In [None]:
# Built-in round with no second argument, applied to each fare.
fare_whole = iqr['fare'].apply(round)
fare_whole.head()

In [None]:
def _round_two_decimals(value):
    """Round a single value to two decimal places."""
    return round(value, 2)

iqr['fare'].apply(_round_two_decimals).head()

## Review: `pd.get_dummies()` -> categorical data into numerical by making them binary

In [None]:
# One indicator column per distinct value found in 'sex'.
sex_dummies = pd.get_dummies(iqr['sex'])
sex_dummies.head()

## Review: `.sample()` -> randomly sample data

In [None]:
# iqr.sample(3) would draw 3 random rows; `frac` is the fraction of rows to
# sample instead, so frac=1 returns every row in shuffled order.
# random_state pins the shuffle so a re-run reproduces the same order.
shuffled = iqr.sample(frac=1, random_state=42)
shuffled.head()

## Review: `train_test_split()`

In [5]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same split is produced on every run.
features_train, features_test, target_train, target_test = train_test_split(
    iqr[['age', 'sex']],  # Features (X)
    iqr[['survived']],  # Target (y)
    test_size=0.2,
    random_state=42,
)

features_train.head()

In [None]:
# Training targets; the index labels match the rows of features_train.
target_train.head(5)