#### Imports

In [8]:
import pandas as pd
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#### Data Preprocessing and Cleaning

In [3]:
df = pd.read_csv('application_train_clean.csv')
df = df.drop(['Unnamed: 0','SK_ID_CURR'],axis=1)
print(df.columns[df.dtypes == object])
df = pd.get_dummies(df, columns = df.select_dtypes(include=['object']).columns, drop_first=True, dtype=int)
df.to_csv('cleaned_train.csv', index=False)
df.head()


Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'ORGANIZATION_TYPE'],
      dtype='object')


Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648,...,0,0,0,0,0,0,0,0,0,0
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186,...,0,0,0,0,0,0,0,0,0,0
2,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260,...,0,0,0,0,0,0,0,0,0,0
3,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311,...,0,0,0,0,0,0,0,0,0,0
4,0,0,99000.0,490495.5,27517.5,454500.0,0.035792,-16941,-1588,-4970,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X, y = df.drop('TARGET',axis=1), df['TARGET']

0    1
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

### Cross-Validation Splits

#### Stratified Split

In [40]:
# Stratified Split
def stratified_split(data, test_size=0.1, validation_size = 0.1):

    class_1 = data[data['TARGET'] == 1]
    class_0 = data[data['TARGET'] == 0]

    test_count_1 = int(len(class_1) * test_size)
    validation_count_1 = int(len(class_1) * validation_size)
    test_count_0 = int(len(class_0) * test_size)
    validation_count_0 = int(len(class_0) * validation_size)

    class_1 = class_1.sample(frac=1)
    class_0 = class_0.sample(frac=1)

    # Split each class into test, validation, and train sets
    test_data = pd.concat([class_1.iloc[:test_count_1], class_0.iloc[:test_count_0]])
    validation_data = pd.concat([
        class_1.iloc[test_count_1:test_count_1 + validation_count_1],
        class_0.iloc[test_count_0:test_count_0 + validation_count_0]
    ])
    train_data = pd.concat([
        class_1.iloc[test_count_1 + validation_count_1:],
        class_0.iloc[test_count_0 + validation_count_0:]
    ])

    # Split features and target for each set
    X_train = train_data.drop('TARGET',axis=1)
    X_test = test_data.drop('TARGET',axis=1)
    X_val = validation_data.drop('TARGET',axis=1)
    y_train = train_data['TARGET']
    y_test = test_data['TARGET']
    y_val = validation_data['TARGET']

    return X_train, X_test, X_val, y_train, y_test, y_val

In [42]:
X_train, X_test, X_val, y_train, y_test, y_val = stratified_split(df)
y_val.head

<bound method NDFrame.head of 134495    1
78076     1
166992    1
72513     1
158398    1
         ..
63766     0
135108    0
167545    0
101199    0
155797    0
Name: TARGET, Length: 26341, dtype: int64>

#### Custom Split

In [None]:
# Custom Split

def custom_split(data, valid_prop = 0.1, test_prop = 0.1, random_seed = 1738):

    train_prop = 1 - valid_prop - test_prop

    # define bins for age and income
    age_bins = [0, 40, 60, np.inf]
    age_labels = ['young', 'middle_aged', 'senior']
    income_bins = [0, 30000, 70000, np.inf]
    income_labels = ['low', 'medium', 'high']

    # convert age to years
    data['age'] = data['DAYS_BIRTH']/-365

    # create binned variables
    data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)
    data['income_group'] = pd.cut(data['AMT_INCOME_TOTAL'], bins=income_bins, labels=income_labels)

    # create a key for each group (combination of gender, age, and income)
    data['group_key'] = data['CODE_GENDER_M'].astype(str) + '_' + data['age_group'].astype(str) + '_' + data['income_group'].astype(str)

    # shuffle the data
    data = data.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # split the data based on key
    train_data = pd.DataFrame()
    val_data = pd.DataFrame()
    test_data = pd.DataFrame()

    for key, group in data.group_by('group_key'):
        n = len(group)
        n_train = int(n * train_prop)
        n_val = int(n * valid_prop)

        train_data = pd.concat([train_data, group[:n_train]])
        val_data = pd.concat([val_data, group[n_train:n_train + n_val]])
        test_data = pd.concat([test_data, group[n_train + n_val:]])

    # drop all unnecesary columns
    train_data = train_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])
    val_data = val_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])
    test_data = test_data.drop(columns=['age','age_group', 'income_group', 'strat_key'])

    return train_data, val_data, test_data


0         202500.0
1         270000.0
2          67500.0
3         121500.0
4          99000.0
            ...   
263414    112500.0
263415    112500.0
263416    153000.0
263417    171000.0
263418    157500.0
Name: AMT_INCOME_TOTAL, Length: 263419, dtype: float64

#### Random Split

In [None]:
def random_train_test_split(X, y, test_size=0.2, val_size=0.2, random_seed=42):
    np.random.seed(random_seed)
    
    n = len(X)
    indices = np.arange(n)
    np.random.shuffle(indices)
    
    test_indices = indices[:int(n * test_size)]
    val_indices = indices[int(n * test_size):int(n * (test_size + val_size))]
    train_indices = indices[int(n * (test_size + val_size)):]
    
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    X_val, y_val = X[val_indices], y[val_indices]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

### Metrics

#### Accuracy

In [9]:
# Accuracy
def compute_accuracy(truth, predicted):
    return np.mean(truth == predicted)


0         1
1         0
2         1
3         1
4         1
         ..
263414    0
263415    0
263416    0
263417    0
263418    0
Name: CODE_GENDER_M, Length: 263419, dtype: int64

#### F1 Score

In [None]:
# F1 Score
def calculate_f1(y_true, y_pred):
    TP = FP = TN = FN = 0

    for true, pred in zip(y_true, y_pred):
        if true == 1 and pred == 1:
            TP += 1
        elif true == 0 and pred == 1:
            FP += 1
        elif true == 0 and pred == 0:
            TN += 1
        elif true == 1 and pred == 0:
            FN += 1  ,

    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    if precision + recall == 0:
        return 0

    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

#### ROC-AUC

In [None]:
def calc_roc_auc(y_true, y_prob):
  sorted_indices = np.argsort(y_prob)
  true_sort = y_true[sorted_indices]
  prob_sort = y_prob[sorted_indices]

  TP = np.cumsum(true_sort)
  FP = np.cumsum(1 - true_sort)

  TPR = TP / TP[-1]
  FPR = FP / FP[-1]

  # rocauc
  rocauc_score = np.trapz(FPR, TPR)

  return rocauc_score

### Models

In [None]:
[2024-11-12T22:16:19.393Z] From https://github.com/lamkin1/data403-project2
 * branch            main       -> FETCH_HEAD
hint: You have divergent branches and need to specify how to reconcile them.
hint: You can do so by running one of the following commands sometime before
hint: your next pull:
hint: 
hint:   git config pull.rebase false  # merge
hint:   git config pull.rebase true   # rebase
hint:   git config pull.ff only       # fast-forward only
hint: 
hint: You can replace "git config" with "git config --global" to set a default
hint: preference for all repositories. You can also pass --rebase, --no-rebase,
hint: or --ff-only on the command line to override the configured default per
hint: invocation.
fatal: Need to specify how to reconcile divergent branches.
[2024-11-12T22:16:20.481Z] > git status -z -uall [25ms]
[2024-11-12T22:16:20.504Z] > git symbolic-ref --short HEAD [22ms]
[2024-11-12T22:16:20.530Z] > git for-each-ref --format=%(refname)%00%(upstream:short)%00%(objectname)%00%(upstream:track)%00%(upstream:remotename)%00%(upstream:remoteref) refs/heads/main refs/remotes/main [25ms]
[2024-11-12T22:16:20.557Z] > git remote --verbose [25ms]
[2024-11-12T22:16:20.565Z] > git for-each-ref --sort -committerdate --format %(refname) %(objectname) %(*objectname) [34ms]
[2024-11-12T22:16:20.606Z] > git config --get commit.template [35ms]

#### Logistic Regression W/ Penalization

#### Logistic Regression

In [None]:
# logistic regression w penalty
lr_penalty = LogisticRegression(penalty='l1', C=1.0)
lr_penalty.fit(X, y)
w = lr_penalty.coef_[0]
b = lr_penalty.intercept_[0]
print(f'intercept: {b} \n weights: {w}')

#### SVC

In [None]:
# SVC Model
#SVC
svc_model = SVC(gamma='auto', kernel='linear') #Can add standard scaler if you want to scale, but left it out to interpret results
svc_model.fit(X, y)
weights = svc_model.coef_
b = svc_model.intercept_
print(f'weights: {weights} \n constant: {b}')
print(f'Actual decision boundary: {-b/weights}')
