# Cheatsheet

In [None]:
# Import standard set of libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics 
np.random.seed(321) # just in case I forget later

In [None]:
# Turn the Google Drive "share" link into a direct-download link that pandas can read
url = 'https://drive.google.com/file/d/1Q9EgeAzyreI1tsCR7R9xBOXvJpQOEfWL/view?usp=sharing'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url)
# .copy() creates a real backup. A plain `df_raw = df` only creates a second name for
# the same object, so the column rename below would also change df_raw.
df_raw = df.copy()
df.columns = df.columns.str.replace(' ', '_') # change column names 

In [None]:
# BACKUP: load the data from a local file instead of the URL
df = pd.read_csv("NAME.csv")
# .copy() creates a real backup. A plain `df_raw = df` only creates a second name for
# the same object, so the column rename below would also change df_raw.
df_raw = df.copy()
df.columns = df.columns.str.replace(' ', '_') # change column names 

### Basics

In [None]:
# Select Columns except Target Variable
# np.setdiff1d returns the sorted, de-duplicated column names that are not in the
# second list; .tolist() turns the result into a plain Python list.
features = np.setdiff1d(df.columns, ['TARGET']).tolist()

In [None]:
# Count the missing cells in the feature columns, then drop every row with a missing value
N_null = df[features].isna().sum().sum()
df = df.dropna()
print(f"The raw_dataset contains {N_null} null values")

In [None]:
# Columns to discard from the DataFrame
to_remove = ['var1', 'var2', 'var3']
df = df.drop(columns=to_remove)

# Keep the feature list in sync with the remaining columns
features = np.setdiff1d(features, to_remove).tolist()

In [None]:
# Select labels: every column label except the first one
df_labels = df.columns[1:]

### Scaling

MAKE SURE YOUR SCALED VARIABLES ARE NUMERIC!

In [None]:
# Min-max scaling: fit the scaler on the feature columns and replace them with the scaled values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])  # fit and transform in one step
df[features]

In [None]:
# Z-transformation: fit the scaler on the feature columns and replace them with the standardized values
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])  # fit and transform in one step
df[features]

In [None]:
# we first need our function to handle outliers 
def outlier_truncation(x, factor=1.5):
    """Return a copy of ``x`` with outliers capped at the IQR fences.

    Parameters
    ----------
    x : pandas.Series
        Values to truncate. The input itself is not modified.
    factor : float, default 1.5
        Multiple of the inter-quartile range that is added to the third
        quartile (upper fence) and subtracted from the first quartile
        (lower fence).

    Returns
    -------
    pandas.Series
        Copy of ``x`` in which values below the lower fence are set to the
        lower fence and values above the upper fence are set to the upper fence.
    """
    first_quartile = x.quantile(0.25)
    third_quartile = x.quantile(0.75)
    spread = third_quartile - first_quartile  # inter-quartile range

    lower = first_quartile - factor * spread
    upper = third_quartile + factor * spread

    # The comparisons use the original values, so the two assignments are independent
    capped = x.copy()
    capped.loc[x < lower] = lower
    capped.loc[x > upper] = upper
    return capped

# Columns to truncate: only columns whose dtype is float32, minus those named in the
# exclusion list (the single string in that list looks like a placeholder - replace it
# with the real names of dummy / badly distributed columns).
num_cols = [x for x in df.columns if df[x].dtype == 'float32' and x not in ['DUMMY OR BAD DISTRIBUTION (>75% = unique']]  
# Apply the truncation column by column (axis=0); args=(3,) passes factor=3 instead of the default 1.5
df[num_cols] = df[num_cols].apply(outlier_truncation, axis=0, args=(3,))  


In [None]:
# BoxCox Transformation
from scipy import stats
bc_fitted_feature, bc_fitted_lambda = stats.boxcox(df['var']+1) # Again, we are adding 1 because all features have 0s (not allowed in BC)
bc_fitted_lambda

In [None]:
# Yeo-Johnson transformation: returns the transformed values and the fitted lambda
from scipy import stats
yj_fitted_feature, yj_fitted_lambda = stats.yeojohnson(df['var'])
yj_fitted_lambda

### Dummy Encoding

In [None]:
# Dummy encode features: 'Intl Plan', 'VMail Plan', and 'Area Code'
to_dummy = ['Intl_Plan', 'VMail_Plan', 'Area_Code']

df = pd.get_dummies(df, columns = to_dummy)

### WOE Encoding

In [None]:
import scorecardpy as sc

bins = sc.woebin(train_df, y="BAD", x=['JOB', 'REASON'])
sc.woebin_plot(bins)

X_train_woe = sc.woebin_ply(X_train_scaled, bins)
X_test_woe = sc.woebin_ply(X_test_scaled, bins)

X_train_woe.head()

In [None]:
import scorecardpy as sc

bins = sc.woebin(train_df, y="BAD", x=['JOB', 'REASON'])
sc.woebin_plot(bins)

X_train_woe = sc.woebin_ply(X_train_scaled, bins)
X_test_woe = sc.woebin_ply(X_test_scaled, bins)

X_train = sc.woebin_ply(X_train, bins)
X_test = sc.woebin_ply(X_test, bins)

### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE().fit_resample(X_train, np.ravel(y_train))

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X_train, np.ravel(y_train))

## EDA

### General

In [None]:
df.describe(include='all')

In [None]:
# Mean of the selected variables per target class.
# Several columns must be selected with a list (double brackets); the tuple-style
# selection ['VAR1', 'VAR2'] on a GroupBy is no longer accepted by pandas >= 2.0.
df.groupby('TARGET')[['VAR1', 'VAR2']].mean()

In [None]:
# Compute imbalance ratio
freq = pd.crosstab(index=df['TARGET'], columns='count')
print(freq)

ir = freq['count'][0]/freq['count'][1]
print('Imbalance ratio: ' ,ir)

In [None]:
#MEAN, leave out normalize for count
reason = pd.crosstab(df.VAR1, df.VAR2, normalize='index')
reason

In [None]:
# Is a variable normally distributed?
from scipy import stats

# print() is needed because a notebook only displays the last expression of a cell
print(stats.normaltest(df['VAR']))
# For a more scientific test of normality you can use `normaltest` from `scipy`. Its documentation is at
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html. It outputs 2 numbers: the first is the sum of
# squares of the z-scores returned by a skewtest and kurtosistest, the second is a 2-sided chi2 probability for the hypothesis test.
# Thus, if the p-value is below 0.05, it is likely that the data is not normally distributed.

#OR

from statsmodels.graphics.gofplots import qqplot

# Q-Q plot of the same variable against a normal distribution
qqplot(df['VAR'], line='s')
plt.show()

### Selection

CAREFUL: Some categorical variables can be stored as numbers (telephone numbers, for example). Make sure to exclude them from the numerical variables, e.g. with `np.setdiff1d`.

In [None]:
# Select Numerical Variables only
num_vars = df.select_dtypes(include=[np.float64, np.float32, np.int32, np.int64]).columns
df[num_vars] = df[num_vars].astype(np.float32)

#num_vars = np.setdiff1d(num_vars, ['CATEGORICALVARIABLE'])

#
df['JOB'] = df['JOB'].astype('category')

In [None]:
# Select categorical data (object/string columns) and convert it to the 'category' dtype.
# The dtype names are given as strings: np.object was removed in NumPy 1.24 and
# np.category never existed, so the previous version raised an AttributeError.
categories = df.select_dtypes(include=['object']).columns
df[categories] = df[categories].astype('category')

In [None]:
#Select Data based on Condition
df.loc[df.BAD == 1, ['LOAN', 'JOB', 'YOJ']]

In [None]:
#Filtering Data:
df.loc[df['BAD'] == 1]
#syntax:
# df.loc[condition]

#Alternatively
df.query('VAR > 10')

### Replacements

In [None]:
# Replace NA with anything, here mode e.g.
# .loc writes directly into df; the chained form df.VAR[mask] = ... may assign to a
# temporary copy (SettingWithCopyWarning) and leave df unchanged.
df.loc[df.VAR.isnull(), 'VAR'] = df.VAR.mode()[0]  # the index [0] ensures that we only extract the value from the result of calling mode()

In [None]:
# Replace missing values in all float32 columns with the column median
for col in df.select_dtypes(include='float32').columns:  # loop over all float32 columns
    if df[col].isna().any():                             # only touch features that have missing values
        m = df[col].median(skipna=True)                  # compute the median of that feature
        # Assign the result back: df[col].fillna(..., inplace=True) works on an
        # intermediate object and does not reliably update df itself.
        df[col] = df[col].fillna(m)

### Graphs

In [None]:
plt.rcParams['figure.figsize'] = 24,8
# width, height

In [None]:
freq = pd.crosstab(index=df['TARGET'], columns='count')
print(freq)

In [None]:
freq = pd.crosstab(index=df['TARGET'], columns='count')

#test1 = freq.index.values #if categorical
test1 =['0', '1']
test2 = freq['count']

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
plt.title('Event Distribution')
ax.bar(test1, test2)

In [None]:
freq = pd.crosstab(index=df['TARGET'], columns='count')

#test1 = freq.index.values #if categorical
test1 =['0', '1']
test2 = freq['count']

sns.barplot(x = test1, y = test2, ci= None)

#https://python-graph-gallery.com/grouped-barplot

In [None]:
# Histograms for all numerical variables.
# include='number' selects every numeric dtype, so this also works after the numeric
# columns were cast to float32 (a 'float64'-only filter would then select nothing).
df.select_dtypes(include='number').hist(bins=30, figsize=(15, 10));

### Correlation Matrix

In [None]:
#Heatmap
# numeric_only=True (pandas >= 1.5) skips non-numeric columns; since pandas 2.0
# df.corr() raises an error on them instead of dropping them silently.
corr = df.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(18, 15))
# fmt='.1f' prints one decimal place ('1f' is a field width and prints six decimals)
sns.heatmap(corr, annot=True, linewidth=.5, fmt='.1f');

In [None]:
#Heatmap with Threshold

corr_threshold = 0.3
f,ax = plt.subplots(figsize=(18, 15))
sns.heatmap(corr[(corr >= corr_threshold) | (corr <= -corr_threshold)],
            annot=True);

In [None]:
#Alternative Correlation Plot View
# numeric_only=True (pandas >= 1.5) skips non-numeric columns; since pandas 2.0
# df.corr() raises an error on them instead of dropping them silently.
target_corr = df.corr(numeric_only=True)[['Target']].sort_values(by='Target', ascending=False)
heatmap = sns.heatmap(target_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Target', fontdict={'fontsize':18}, pad=16);


## Models

In [None]:
# We add a constant column to X. Think of this as the Python way to include an intercept in your model 
from statsmodels.tools.tools import add_constant

# has_constant='raise' raises an error if X already contains a constant column
X = add_constant(X, prepend=True, has_constant='raise')

# Note: the models must then not add their own intercept. scikit-learn estimators add
# one by default, so pass fit_intercept=False when X already has the constant column.

List of known models (regressors and classifiers):

### Regression:
sklearn.linear_model.LinearRegression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

sklearn.linear_model.Lasso
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

sklearn.linear_model.LassoCV
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html

sklearn.linear_model.Ridge
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

sklearn.linear_model.RidgeCV
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html

sklearn.linear_model.ElasticNet
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

sklearn.linear_model.ElasticNetCV
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html

sklearn.tree.DecisionTreeRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

sklearn.ensemble.RandomForestRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

sklearn.ensemble.GradientBoostingRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

### Classification:
sklearn.linear_model.LogisticRegression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

sklearn.linear_model.LogisticRegressionCV
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html

sklearn.linear_model.RidgeClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html

sklearn.tree.DecisionTreeClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

sklearn.ensemble.RandomForestClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

sklearn.ensemble.GradientBoostingClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

### Basic Models

#### Logistic Regression

In [None]:
# Logistic regression tuned with a small grid search.
# Defines clf_lr / y_lr / y_lr_proba, which the evaluation cells further down rely on.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

parameters = {
    "C": [0.1, 1, 10]  # inverse regularization strength; illustrative values only, use a finer and wider grid in practice
    }

clf_lr = GridSearchCV(LogisticRegression(max_iter=10000), parameters, scoring = 'f1', cv=3, n_jobs=-1)
clf_lr.fit(X_train, y_train)
y_lr = clf_lr.predict(X_test)
y_lr_proba = clf_lr.predict_proba(X_test)[:,1]  # probability of the positive class
print(clf_lr.best_params_)

#### Decision Tree

In [None]:
# Decision tree tuned with a small grid search.
# Defines clf_dt / y_dt / y_dt_proba; clf_dt is used by the precision-recall plot further down.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

parameters = {
    "max_depth": [3, 8],          # illustrative values only, use a wider grid in practice
    "min_samples_leaf": [1, 20]
    }

clf_dt = GridSearchCV(DecisionTreeClassifier(), parameters, scoring = 'f1', cv=3, n_jobs=-1)
clf_dt.fit(X_train, y_train)
y_dt = clf_dt.predict(X_test)
y_dt_proba = clf_dt.predict_proba(X_test)[:,1]  # probability of the positive class
print(clf_dt.best_params_)

### Advanced Models

#### ElasticNet

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

parameters = {
    "l1_ratio": [0, 1],
    "C":[0.2, 1]
    }

clf_lrreg = GridSearchCV(LogisticRegression(penalty = 'elasticnet', solver = 'saga', max_iter = 10000), parameters, scoring = 'f1', cv=3, n_jobs=-1)
clf_lrreg.fit(X_train, y_train)
y_lrreg = clf_lrreg.predict(X_test)
y_lrreg_proba = clf_lrreg.predict_proba(X_test)[:,1]
print(clf_lrreg.best_params_)

#### GBM

In [None]:
from sklearn.ensemble import GradientBoostingClassifier  
from sklearn.model_selection import GridSearchCV

parameters = {
    "learning_rate": [1e-1, 1e-2], #obviously this is not the perfect approach, in real life setting I would suggest using small jumps and covering a wide range of learning rates as to avoid overshooting
    "max_depth":[3,8]
    }

clf_gb = GridSearchCV(GradientBoostingClassifier(), parameters, scoring = 'f1', cv=3, n_jobs=-1)
clf_gb.fit(X_train, y_train)
y_gb = clf_gb.predict(X_test)
y_gb_proba = clf_gb.predict_proba(X_test)[:,1]
print(clf_gb.best_params_)

#### XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Setting up the grid of meta-parameters
# NOTE: early_stopping_rounds is deliberately not part of the grid. Early stopping needs
# a validation set (eval_set) passed to fit(); GridSearchCV does not supply one, so the
# fits fail when it is set on the estimator.
xgb_param_grid = {
    'colsample_bytree': np.linspace(0.5, 0.9, 5),  # random subspace
    'n_estimators': [100, 200],  # ensemble size or number of gradient steps
    'max_depth': [5, 10],   # max depth of decision trees
    'learning_rate': [0.1, 0.01]}  # learning rate

gs_xgb = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=xgb_param_grid, scoring='roc_auc', cv=5, verbose=0)
gs_xgb.fit(X_train, y_train.values.ravel())
gs_xgb_proba = gs_xgb.predict_proba(X_test)[:,1]  # probability of the positive class

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier  # import library
from sklearn.model_selection import GridSearchCV
print('Tuning random forest classifier')
rf = RandomForestClassifier(random_state=888, max_samples = 0.5)  # This way, bootstrap sample size will be 50% of the training set

# Define meta-parameter grid of candidate settings
# The following settings are just for illustration
param_grid = {'n_estimators': [100, 200, 500],
              'max_features': [1, 2, 4]
              }

# Set up the grid object specifying the tuning options
gs_rf = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', verbose=1)
gs_rf.fit(X_train, y_train.values.ravel())
gs_rf_proba = gs_rf.predict_proba(X_test)[:,1]

## Evaluation

### Metrics

In [None]:
#clf_lr.fit(X_train, y_train).decision_function(X_test)

#AUC
fpr, tpr, thresholds = metrics.roc_curve(y_test, clf_lr.predict_proba(X_test)[:,1])
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)

In [None]:
# CV Scoring

from sklearn.model_selection import cross_validate, cross_val_score

# Add list of scoring parameters directly to the function (Acc)
# (logit is the model and p_measures the list of metric names; both must be defined beforehand)
score = cross_validate(logit, X, y, scoring=p_measures, cv=10)
score

# Average of every metric over the folds
pd.DataFrame(score).mean()

#OR 
# cross_val_score returns one score per fold for a single metric;
# np.ravel flattens y to 1-D whether it is a Series, DataFrame column or array
lasso_scores = cross_val_score(lasso_sk, X, np.ravel(y), cv=folds)

#### Graphs

#### PrecisionRecallDisplay

In [None]:
# Code to create the chart
from sklearn import metrics
from sklearn.metrics import PrecisionRecallDisplay

display = PrecisionRecallDisplay.from_predictions(y_test, clf_dt.predict_proba(X_test)[:,1], name="Decision Tree")
_ = display.ax_.set_title("2-class Precision-Recall curve")

#### ROC Curve

In [None]:
metrics.RocCurveDisplay.from_predictions(y_test, y_lr_proba, name = 'Logit')

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_lr_proba)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                  estimator_name='ESTIMATOR NAME')
display.plot()  

### Residual Plot

In [None]:
import matplotlib.pyplot as plt

# predict() returns a 1-D array of predictions, so it must not be indexed with [:, 1]
# (that indexing is only valid for the 2-D output of predict_proba)
y_pred = model.predict(X_test)

residuals = y_test - y_pred

# Residuals-vs-fitted plot: predictions on the x-axis, residuals on the y-axis
plt.scatter(y_pred, residuals)
plt.xlabel('Predicted value')
plt.ylabel('Residual')

plt.show()

In [None]:
mean = residuals.mean()
std = residuals.std()

residuals = (residuals - mean)/std

### Manual Thresholds

In [None]:
# Train a model and get probability predictions
from sklearn.linear_model import LogisticRegression

# penalty=None disables regularization (the string 'none' was removed in newer scikit-learn versions);
# np.ravel flattens y_train to 1-D whether it is a Series, DataFrame column or array
clf = LogisticRegression(penalty=None, fit_intercept=True).fit(X_train, np.ravel(y_train))
pred_proba = clf.predict_proba(X_test)[:,1]

# Use the array above to manually determine the cut-off and convert to class predictions
pred_default = np.where(pred_proba >= 0.5, 1, 0) # 0.5 is the default cut-off, equivalent to y_pred from above
pred_th = np.where(pred_proba >= threshold_bayes, 1, 0) # Using the cut-off defined by the cost-minimal threshold function
print(np.mean(pred_default), np.mean(pred_th)) # Shows the percentage of observations that are now predicted with the label 1, default

### Miscellaneous (not sure about these, probably not relevant)

In [None]:
# Logistic (sigmoid) function applied to the decision scores gives the predicted probabilities
# Fit once and reuse the scores; np.exp replaces math.e, which was never imported
scores = clf_lr.fit(X_train, y_train).decision_function(X_test)
1/(1+np.exp(-scores))

scores

In [None]:
class filter_binary_target:
    """Score every feature of a DataFrame against a binary target (filter method).

    Discrete features (bool / category) are scored with the information value
    (IV), numeric features with the Fisher score.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the target column.
    target : str
        Name of the target column. The methods sum the target to count the
        "bad" cases and look up the per-class statistics under the labels 0
        and 1, so the target is expected to be coded as 0/1.
    """

    def __init__(self, df, target):
        # Keep the data that was passed in, so the methods work on it instead of
        # on whatever global variable happens to be called `df`.
        self.df = df
        self.target = target
        self.data_head = df.head()

    def auto_filter_binary_target(self):
        """Return a DataFrame with data type, metric and score for every feature.

        The result is indexed by feature name and has the columns 'Data Type',
        'Metric' and 'Score'. Features that are neither bool/category nor
        numeric keep NaN in all three columns.
        """
        print('Data must be in a clean pandas DataFrame. Categorical variables must be of data type bool or category. Continuous variables must be int64 or float64.')
        data_no_target = self.df.drop(columns=self.target)
        columns = ['Data Type', 'Metric', 'Score']
        result = pd.DataFrame(index=data_no_target.columns, columns=columns)

        for col in data_no_target:
            dtype = data_no_target.dtypes[col]
            # bool must be tested before the numeric check, because pandas
            # treats bool as a numeric dtype
            if pd.api.types.is_bool_dtype(dtype) or isinstance(dtype, pd.CategoricalDtype):
                result.loc[col, 'Data Type'] = "discrete"
                result.loc[col, 'Metric'] = "IV"
                result.loc[col, 'Score'] = self.IV_binary_target(feature=col)
            # any numeric dtype counts as continuous (int64/float64 as before,
            # but also e.g. float32 and int32)
            elif pd.api.types.is_numeric_dtype(dtype):
                result.loc[col, 'Data Type'] = "continuous"
                result.loc[col, 'Metric'] = "Fisher"
                result.loc[col, 'Score'] = self.fisher_binary_target(feature=col)

        return result

    def IV_binary_target(self, feature):
        """Return the information value (IV) of a discrete feature."""
        data = self.df[feature].value_counts().to_frame('Count')
        data['Bad'] = self.df.groupby(feature)[self.target].sum()
        data['Good'] = data['Count'] - data['Bad']

        data["Distribution Bad"] = data["Bad"] / data["Bad"].sum()
        data["Distribution Good"] = data["Good"] / data["Good"].sum()

        data['WOE'] = np.log(data["Distribution Good"] / data["Distribution Bad"])
        # Levels without any good or bad case produce an infinite WOE; set it to 0
        # so those levels contribute nothing to the IV. The result must be assigned
        # back, replace() does not modify the column in place.
        data['WOE'] = data['WOE'].replace([np.inf, -np.inf], 0)

        data["IV"] = data["WOE"] * (data["Distribution Good"] - data["Distribution Bad"])

        return data["IV"].sum()

    def fisher_binary_target(self, feature):
        """Return the Fisher score |mu_0 - mu_1| / sqrt(var_0 + var_1) of a numeric feature."""
        grouped = self.df.groupby(self.target)[feature]
        means = grouped.mean()
        variances = grouped.var()

        # per-class statistics are looked up under the target labels 0 and 1
        num = abs(means[0] - means[1])
        den = (variances[0] + variances[1]) ** 0.5

        return num / den

    def pearson(self, feature):  # since our target is binary, we actually don't need this. However, if you would like to expand this class, you can use this code
        """Return the Pearson correlation between a feature and the target."""
        feature_dev = self.df[feature] - self.df[feature].mean()
        target_dev = self.df[self.target] - self.df[self.target].mean()
        num = (feature_dev * target_dev).sum()
        den = ((feature_dev ** 2).sum() * (target_dev ** 2).sum()) ** .5
        return num / den

In [None]:
# Named feature_filter rather than `filter`, so the Python builtin filter() is not shadowed
feature_filter = filter_binary_target(df=train_df, target="BAD")

feature_filter.auto_filter_binary_target()