# boosters.pro/championship/digital_reputation_challenge
# Public leaderboard score: 0.612335
# Private leaderboard score: 0.612814
# Final position: 19th place 🥈

# Importing...

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import sparse, stats
from scipy.linalg import svd
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     cross_validate, train_test_split, cross_val_predict)
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
import implicit
from scipy.sparse import csr_matrix
import umap
from sklearn.neural_network import MLPClassifier

# Some functions

In [None]:
def cross_validation_score_statement(estimator,
                                     X,
                                     y,
                                     scoring,
                                     n_splits=5,
                                     statement=None,
                                     random_state=0):
    """Score ``estimator`` on shuffled stratified folds.

    The estimator is refitted in place on every training fold (it is not
    cloned) and evaluated on column 1 of ``predict_proba`` for the held-out
    fold.

    Args:
        estimator: object exposing ``fit`` and ``predict_proba``.
        X: pandas DataFrame of features (accessed through ``.iloc``).
        y: pandas Series of labels (accessed through ``.iloc``).
        scoring: callable ``scoring(y_true, y_score)`` returning a number.
        n_splits: number of stratified folds.
        statement: optional selector indexed with the test positions of each
            fold; its result is passed to ``.loc`` so that only the selected
            test rows are scored. Presumably a boolean mask array aligned
            with ``X`` -- verify against callers.
        random_state: seed for the fold shuffling.

    Returns:
        ``np.ndarray`` holding one score per fold.
    """
    splitter = StratifiedKFold(n_splits=n_splits,
                               shuffle=True,
                               random_state=random_state)
    folds = list(splitter.split(X, y))
    fold_scores = []

    for train_idx, test_idx in folds:
        estimator.fit(X.iloc[train_idx, :].values, y.iloc[train_idx].values)

        y_eval = y.iloc[test_idx]
        X_eval = X.iloc[test_idx, :]
        if statement is not None:
            # Restrict the evaluation to the selected rows of this test fold.
            y_eval = y_eval.loc[statement[test_idx]]
            X_eval = X_eval.loc[statement[test_idx]]

        proba = estimator.predict_proba(X_eval.values)[:, 1]
        fold_scores.append(scoring(y_eval, proba))

    return np.array(fold_scores)

In [None]:
def get_xgb(X, Y):
    """Grid-search a small XGBoost parameter grid with 3-fold stratified CV.

    Every combination of ``n_estimators``, ``max_depth``, ``min_child_weight``
    and ``learning_rate`` is scored with ROC AUC on ``X`` without its ``id``
    column. The combination and its mean score are printed as the search
    progresses.

    Args:
        X: feature DataFrame containing an ``id`` column.
        Y: label Series for a single target.

    Returns:
        Tuple ``((n_estimators, max_depth, min_child_weight, learning_rate),
        mean_score, fold_scores)`` for the highest mean score; on ties the
        combination evaluated last wins.
    """
    # Same iteration order as nesting the loops in this order by hand.
    grid = [(n_trees, depth, child_weight, rate)
            for n_trees in [100, 200, 250]
            for depth in [2, 3]
            for child_weight in [2, 3, 4, 5]
            for rate in [0.017, 0.009, 0.005, 0.02, 0.1]]

    results = []
    for n_trees, depth, child_weight, rate in grid:
        print(n_trees, depth, child_weight, rate)
        candidate = XGBClassifier(n_jobs=8,
                                  random_state=0,
                                  learning_rate=rate,
                                  min_child_weight=child_weight,
                                  max_depth=depth,
                                  n_estimators=n_trees)
        fold_scores = cross_validation_score_statement(candidate,
                                                       X.drop(columns=['id']),
                                                       Y,
                                                       roc_auc_score,
                                                       n_splits=3,
                                                       statement=None,
                                                       random_state=0)
        print(fold_scores.mean())
        results.append(((n_trees, depth, child_weight, rate), fold_scores.mean(), fold_scores))

    # Stable ascending sort: the last element is the best-scoring combination.
    results.sort(key=lambda entry: entry[1])
    return results[-1]

In [None]:
def get_stack_preds(estim, X, target, n_splits):
    """Return cross-validated probabilities of ``estim`` for stacking.

    Runs ``cross_val_predict`` with ``cv=n_splits`` and 8 parallel jobs, so
    each row is predicted by a model that did not see it during fitting.

    Args:
        estim: classifier exposing ``predict_proba``.
        X: feature matrix.
        target: labels aligned with the rows of ``X``.
        n_splits: value forwarded as ``cv``.

    Returns:
        1-D array with column 1 of the out-of-fold ``predict_proba`` output.
    """
    oof_proba = cross_val_predict(estim,
                                  X,
                                  target,
                                  cv=n_splits,
                                  n_jobs=8,
                                  method='predict_proba')
    return oof_proba[:, 1]

# Datasets

In [None]:
# Directories holding the competition CSV files.
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

# Training tables X1..X3 and the label table Y.
X1, X2, X3 = (pd.read_csv(TRAIN_PATH + table)
              for table in ('X1.csv', 'X2.csv', 'X3.csv'))
Y = pd.read_csv(TRAIN_PATH + 'Y.csv')

# Test tables with the same file names as the training ones.
X1_test, X2_test, X3_test = (pd.read_csv(TEST_PATH + table)
                             for table in ('X1.csv', 'X2.csv', 'X3.csv'))

# Feature extraction

## 20 and 10 factors from ALS

In [None]:
k = 20

# Factorise the (id, A) interaction matrix built from train and test rows
# together, then attach the k latent factors of every id to the X1 features.
model = implicit.als.AlternatingLeastSquares(factors=k, iterations=20, calculate_training_loss=True, regularization=1)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
X2_ = pd.concat([X2, X2_test]).sort_values('id')
# Binary interactions: a 1 at (id, A) for every row of X2_.
res = csr_matrix((np.ones(len(X2_)), (X2_['id'], X2_['A'])))
# NOTE(review): the matrix rows are ids and item_factors is indexed by id
# below, which matches implicit releases whose fit() takes an item-by-user
# matrix -- confirm against the installed implicit version.
model.fit(res)
# Factor columns are labelled 26 .. 26 + k - 1.
factor_columns = list(range(26, 26 + k))
fac = pd.DataFrame(model.item_factors[X1['id']], columns=factor_columns)
fac_test = pd.DataFrame(model.item_factors[X1_test['id']], columns=factor_columns)
fac['id'] = X1['id']
fac_test['id'] = X1_test['id']
# Column '19' is excluded from the model features.
X = X1.merge(fac, on='id').drop(columns=['19'])
X_test = X1_test.merge(fac_test, on='id').drop(columns=['19'])

In [None]:
k = 10

# Same ALS feature extraction as the 20-factor cell, with 10 latent factors;
# the results are stored in X_ / X_test_.
model = implicit.als.AlternatingLeastSquares(factors=k, iterations=20, calculate_training_loss=True, regularization=1)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
X2_ = pd.concat([X2, X2_test]).sort_values('id')
# Binary interactions: a 1 at (id, A) for every row of X2_.
res = csr_matrix((np.ones(len(X2_)), (X2_['id'], X2_['A'])))
# NOTE(review): the matrix rows are ids and item_factors is indexed by id
# below, which matches implicit releases whose fit() takes an item-by-user
# matrix -- confirm against the installed implicit version.
model.fit(res)
# Factor columns are labelled 26 .. 26 + k - 1.
factor_columns = list(range(26, 26 + k))
fac = pd.DataFrame(model.item_factors[X1['id']], columns=factor_columns)
fac_test = pd.DataFrame(model.item_factors[X1_test['id']], columns=factor_columns)
fac['id'] = X1['id']
fac_test['id'] = X1_test['id']
# Column '19' is excluded from the model features.
X_ = X1.merge(fac, on='id').drop(columns=['19'])
X_test_ = X1_test.merge(fac_test, on='id').drop(columns=['19'])

In [None]:
# Standardised copies for the scale-sensitive models. Test data is scaled with
# the training mean/std. The 'id' column is scaled as well, but it is dropped
# before any model sees the data.
X_norm = (X - X.mean()) / X.std()
X_test_norm = (X_test - X.mean()) / X.std()
X_norm_ = (X_ - X_.mean()) / X_.std()
X_test_norm_ = (X_test_ - X_.mean()) / X_.std()

# Base models as (name, estimator, train frame, test frame).
# The first five use X / X_test, the last five use X_ / X_test_.
estims_data = [
    ('xgb_1',
     XGBClassifier(learning_rate=0.02, max_depth=3, min_child_weight=5, n_estimators=100, random_state=0),
     X, X_test),
    ('lr_1',
     LogisticRegression(C=3, class_weight='balanced'),
     X_norm, X_test_norm),
    ('rf_1',
     RandomForestClassifier(n_estimators=300, criterion='entropy', max_depth=3, random_state=0),
     X, X_test),
    ('svm_r',
     SVC(C=3, class_weight='balanced', probability=True, random_state=0),
     X_norm, X_test_norm),
    ('nn_l',
     MLPClassifier((200, 100,), solver='sgd'),
     X_norm, X_test_norm),
    ('xgb_2',
     XGBClassifier(learning_rate=0.02, max_depth=2, min_child_weight=5, n_estimators=250, random_state=0),
     X_, X_test_),
    ('lr_2',
     LogisticRegression(C=3, class_weight='balanced'),
     X_norm_, X_test_norm_),
    ('rf_2',
     RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=3, random_state=0),
     X_, X_test_),
    ('svm_r_2',
     SVC(C=3, class_weight='balanced', probability=True, random_state=0),
     X_norm_, X_test_norm_),
    ('nn_2',
     MLPClassifier((200, 100,), solver='sgd'),
     X_norm_, X_test_norm_),
]

## Stacking for ALS features

In [None]:
# One dict of out-of-fold predictions and one dict of test predictions per
# target (1-5); each is filled by the stacking loops, keyed by model name.
(stacked_preds_1, stacked_preds_2, stacked_preds_3,
 stacked_preds_4, stacked_preds_5) = ({} for _ in range(5))
(test_preds_1, test_preds_2, test_preds_3,
 test_preds_4, test_preds_5) = ({} for _ in range(5))

In [None]:
# Target '1', ALS features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['1']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    stacked_preds_1[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    test_preds_1[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '2', ALS features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['2']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    stacked_preds_2[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    test_preds_2[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '3', ALS features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['3']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    stacked_preds_3[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    test_preds_3[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '4', ALS features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['4']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    stacked_preds_4[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    test_preds_4[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '5', ALS features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['5']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    stacked_preds_5[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    test_preds_5[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Level-2 inputs for target '1': meta-feature column -> base model whose
# predictions fill it. Train columns come from the out-of-fold predictions,
# test columns from the refitted models.
column_sources = {
    'cl_1': 'xgb_1',
    'cl_10': 'xgb_2',
    'cl_3': 'rf_1',
    'cl_5': 'lr_1',
    'cl_8': 'svm_r_2',
    'cl_9': 'nn_2',
}
ds_1 = {column: stacked_preds_1[model_name] for column, model_name in column_sources.items()}
ds_1_test = {column: test_preds_1[model_name] for column, model_name in column_sources.items()}
ds_1_ = pd.DataFrame(ds_1)
ds_1_test_ = pd.DataFrame(ds_1_test)

In [None]:
# Level-2 inputs for target '2': meta-feature column -> base model whose
# predictions fill it. The ds_1 / ds_1_test dicts are reused as scratch space.
column_sources = {
    'cl_1': 'xgb_1',
    'cl_2': 'xgb_2',
    'cl_31': 'rf_1',
    'cl_4': 'lr_1',
    'cl_5': 'svm_r_2',
    'cl_6': 'nn_2',
}
ds_1 = {column: stacked_preds_2[model_name] for column, model_name in column_sources.items()}
ds_1_test = {column: test_preds_2[model_name] for column, model_name in column_sources.items()}
ds_2 = pd.DataFrame(ds_1)
ds_2_test = pd.DataFrame(ds_1_test)

In [None]:
# Level-2 inputs for target '3': meta-feature column -> base model whose
# predictions fill it. The ds_1 / ds_1_test dicts are reused as scratch space.
column_sources = {
    'cl_1': 'xgb_1',
    'cl_2': 'xgb_2',
    'cl_3': 'rf_1',
    'cl_4': 'lr_1',
    'cl_5': 'svm_r_2',
    'cl_6': 'nn_2',
}
ds_1 = {column: stacked_preds_3[model_name] for column, model_name in column_sources.items()}
ds_1_test = {column: test_preds_3[model_name] for column, model_name in column_sources.items()}
ds_3 = pd.DataFrame(ds_1)
ds_3_test = pd.DataFrame(ds_1_test)

In [None]:
# Level-2 inputs for target '4': meta-feature column -> base model whose
# predictions fill it. The ds_1 / ds_1_test dicts are reused as scratch space.
column_sources = {
    'cl_1': 'xgb_1',
    'cl_2': 'xgb_2',
    'cl_3': 'rf_1',
    'cl_4': 'lr_1',
    'cl_5': 'svm_r_2',
    'cl_6': 'nn_2',
}
ds_1 = {column: stacked_preds_4[model_name] for column, model_name in column_sources.items()}
ds_1_test = {column: test_preds_4[model_name] for column, model_name in column_sources.items()}
ds_4 = pd.DataFrame(ds_1)
ds_4_test = pd.DataFrame(ds_1_test)

In [None]:
# Level-2 inputs for target '5': meta-feature column -> base model whose
# predictions fill it. The ds_1 / ds_1_test dicts are reused as scratch space.
column_sources = {
    'cl_1': 'xgb_1',
    'cl_2': 'xgb_2',
    'cl_3': 'rf_1',
    'cl_4': 'lr_1',
    'cl_5': 'svm_r_2',
    'cl_6': 'nn_2',
}
ds_1 = {column: stacked_preds_5[model_name] for column, model_name in column_sources.items()}
ds_1_test = {column: test_preds_5[model_name] for column, model_name in column_sources.items()}
ds_5 = pd.DataFrame(ds_1)
ds_5_test = pd.DataFrame(ds_1_test)

In [None]:
# Attach the row ids so the ALS-based meta-features can later be merged on 'id'.
# Assumes the prediction rows are in the same order as X1 / X1_test -- TODO confirm.
for meta_train in (ds_1_, ds_2, ds_3, ds_4, ds_5):
    meta_train['id'] = X1.id

for meta_test in (ds_1_test_, ds_2_test, ds_3_test, ds_4_test, ds_5_test):
    meta_test['id'] = X1_test.id

# Feature extraction UMAP

In [None]:
# UMAP counterpart of the prediction stores: one out-of-fold dict and one test
# dict per target (1-5), each filled by the UMAP stacking loops.
(ustacked_preds_1, ustacked_preds_2, ustacked_preds_3,
 ustacked_preds_4, ustacked_preds_5) = ({} for _ in range(5))
(utest_preds_1, utest_preds_2, utest_preds_3,
 utest_preds_4, utest_preds_5) = ({} for _ in range(5))

In [None]:
# 10-dimensional UMAP embedding (cosine metric) of the rows of the binary
# (id, A) interaction matrix built from train and test rows together.
umap_ = umap.UMAP(n_components=10, random_state=0, verbose=True, metric='cosine')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
X2_ = pd.concat([X2, X2_test]).sort_values('id')
res = csr_matrix((np.ones(len(X2_)), (X2_['id'], X2_['A'])))
fac_umap = umap_.fit_transform(res)

In [None]:
# Attach the UMAP coordinates of every id to the X1 features.
# k has to match the number of columns of fac_umap.
k = 10
umap_columns = list(range(26, 26 + k))

fac_u = pd.DataFrame(fac_umap[X1['id']], columns=umap_columns)
fac_u['id'] = X1['id']
UX = X1.merge(fac_u, on='id').drop(columns=['19'])

fac_u_test = pd.DataFrame(fac_umap[X1_test['id']], columns=umap_columns)
fac_u_test['id'] = X1_test['id']
UX_test = X1_test.merge(fac_u_test, on='id').drop(columns=['19'])

## Stacking for UMAP features

In [None]:
# NOTE(review): if the notebook is executed top to bottom, estims_data is
# reassigned by later cells before any UMAP stacking loop reads this list --
# confirm whether this variant (the only one with LightGBM) is still needed.
# Test data is standardised with the training mean/std.
UX_norm = (UX - UX.mean()) / UX.std()
UX_test_norm = (UX_test - UX.mean()) / UX.std()

# Base models as (name, estimator, train frame, test frame).
estims_data = [
    ('xgb_1',
     XGBClassifier(learning_rate=0.009, max_depth=2, min_child_weight=5, n_estimators=250, random_state=0),
     UX, UX_test),
    ('lgbm_1',
     lgb.LGBMClassifier(n_estimators=200, class_weight='balanced'),
     UX_norm, UX_test_norm),
    ('lr_1',
     LogisticRegression(C=3, class_weight='balanced'),
     UX_norm, UX_test_norm),
    ('rf_1',
     RandomForestClassifier(n_estimators=300, criterion='entropy', max_depth=3, random_state=0),
     UX, UX_test),
    ('svm_r',
     SVC(C=3, class_weight='balanced', probability=True, random_state=0),
     UX_norm, UX_test_norm),
    ('nn',
     MLPClassifier((200, 100,), solver='sgd'),
     UX_norm, UX_test_norm),
]

In [None]:
# NOTE(review): if the notebook is executed top to bottom, estims_data is
# reassigned by the next cell before any UMAP stacking loop reads this list.
# The MLP is registered here as 'svm_l' and there is no 'nn' entry, while the
# cells that assemble the UMAP meta-features read the 'nn' key.
# Test data is standardised with the training mean/std.
UX_norm = (UX - UX.mean()) / UX.std()
UX_test_norm = (UX_test - UX.mean()) / UX.std()

# Base models as (name, estimator, train frame, test frame).
estims_data = [
    ('xgb_1',
     XGBClassifier(learning_rate=0.02, max_depth=2, min_child_weight=5, n_estimators=200, random_state=0),
     UX, UX_test),
    ('lr_1',
     LogisticRegression(C=3, class_weight='balanced'),
     UX_norm, UX_test_norm),
    ('rf_1',
     RandomForestClassifier(n_estimators=300, criterion='entropy', max_depth=3, random_state=0),
     UX, UX_test),
    ('svm_r',
     SVC(C=3, class_weight='balanced', probability=True, random_state=0),
     UX_norm, UX_test_norm),
    ('svm_l',
     MLPClassifier((200, 100,), solver='sgd'),
     UX_norm, UX_test_norm),
]

In [None]:
# Base models for the UMAP feature set as (name, estimator, train frame,
# test frame). Scale-sensitive models get standardised inputs; test data is
# standardised with the training mean/std.
UX_norm = (UX - UX.mean()) / UX.std()
UX_test_norm = (UX_test - UX.mean()) / UX.std()

estims_data = [
    ('xgb_1',
     XGBClassifier(learning_rate=0.005, max_depth=2, min_child_weight=3, n_estimators=250, random_state=0),
     UX, UX_test),
    ('lr_1',
     LogisticRegression(C=3, class_weight='balanced'),
     UX_norm, UX_test_norm),
    ('rf_1',
     RandomForestClassifier(n_estimators=300, criterion='entropy', max_depth=3, random_state=0),
     UX, UX_test),
    ('svm_r',
     SVC(C=3, class_weight='balanced', probability=True, random_state=0),
     UX_norm, UX_test_norm),
    ('nn',
     MLPClassifier((200, 100,), solver='sgd'),
     UX_norm, UX_test_norm),
]

In [None]:
# Target '1', UMAP features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['1']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    ustacked_preds_1[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    utest_preds_1[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '2', UMAP features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['2']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    ustacked_preds_2[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    utest_preds_2[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '3', UMAP features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['3']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    ustacked_preds_3[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    utest_preds_3[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '4', UMAP features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['4']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    ustacked_preds_4[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    utest_preds_4[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# Target '5', UMAP features: store 10-fold out-of-fold probabilities for the
# meta-learner, then refit each model on all training rows to score the test set.
target = Y['5']
for name, estim, data_tr, data_test in estims_data:
    print(name)
    train_features = data_tr.drop(columns=['id'])
    oof_proba = get_stack_preds(estim, train_features, target, 10)
    ustacked_preds_5[name] = oof_proba
    print(roc_auc_score(target, oof_proba))
    estim.fit(train_features, target)
    test_features = data_test.drop(columns=['id'])
    utest_preds_5[name] = estim.predict_proba(test_features)[:, 1]

In [None]:
# UMAP level-2 inputs for target '1': meta-feature column -> base model whose
# predictions fill it. Train columns come from the out-of-fold predictions,
# test columns from the refitted models.
ucolumn_sources = {
    'ucl_1': 'xgb_1',
    'ucl_3': 'rf_1',
    'ucl_5': 'lr_1',
    'ucl_8': 'svm_r',
    'ucl_9': 'nn',
}
uds_1 = {column: ustacked_preds_1[model_name] for column, model_name in ucolumn_sources.items()}
uds_1_test = {column: utest_preds_1[model_name] for column, model_name in ucolumn_sources.items()}
uds_1_ = pd.DataFrame(uds_1)
uds_1_test_ = pd.DataFrame(uds_1_test)

In [None]:
# UMAP level-2 inputs for target '2': meta-feature column -> base model whose
# predictions fill it. The uds_1 / uds_1_test dicts are reused as scratch space.
ucolumn_sources = {
    'ucl_1': 'xgb_1',
    'ucl_3': 'rf_1',
    'ucl_5': 'lr_1',
    'ucl_8': 'svm_r',
    'ucl_9': 'nn',
}
uds_1 = {column: ustacked_preds_2[model_name] for column, model_name in ucolumn_sources.items()}
uds_1_test = {column: utest_preds_2[model_name] for column, model_name in ucolumn_sources.items()}
uds_2 = pd.DataFrame(uds_1)
uds_2_test = pd.DataFrame(uds_1_test)

In [None]:
# UMAP level-2 inputs for target '3': meta-feature column -> base model whose
# predictions fill it. The uds_1 / uds_1_test dicts are reused as scratch space.
ucolumn_sources = {
    'ucl_1': 'xgb_1',
    'ucl_3': 'rf_1',
    'ucl_5': 'lr_1',
    'ucl_8': 'svm_r',
    'ucl_9': 'nn',
}
uds_1 = {column: ustacked_preds_3[model_name] for column, model_name in ucolumn_sources.items()}
uds_1_test = {column: utest_preds_3[model_name] for column, model_name in ucolumn_sources.items()}
uds_3 = pd.DataFrame(uds_1)
uds_3_test = pd.DataFrame(uds_1_test)

In [None]:
# UMAP level-2 inputs for target '4': meta-feature column -> base model whose
# predictions fill it. The uds_1 / uds_1_test dicts are reused as scratch space.
ucolumn_sources = {
    'ucl_1': 'xgb_1',
    'ucl_3': 'rf_1',
    'ucl_5': 'lr_1',
    'ucl_8': 'svm_r',
    'ucl_9': 'nn',
}
uds_1 = {column: ustacked_preds_4[model_name] for column, model_name in ucolumn_sources.items()}
uds_1_test = {column: utest_preds_4[model_name] for column, model_name in ucolumn_sources.items()}
uds_4 = pd.DataFrame(uds_1)
uds_4_test = pd.DataFrame(uds_1_test)

In [None]:
# UMAP level-2 inputs for target '5': meta-feature column -> base model whose
# predictions fill it. The uds_1 / uds_1_test dicts are reused as scratch space.
ucolumn_sources = {
    'ucl_1': 'xgb_1',
    'ucl_3': 'rf_1',
    'ucl_5': 'lr_1',
    'ucl_8': 'svm_r',
    'ucl_9': 'nn',
}
uds_1 = {column: ustacked_preds_5[model_name] for column, model_name in ucolumn_sources.items()}
uds_1_test = {column: utest_preds_5[model_name] for column, model_name in ucolumn_sources.items()}
uds_5 = pd.DataFrame(uds_1)
uds_5_test = pd.DataFrame(uds_1_test)

## Merging ALS and UMAP extractions

In [None]:
# Meta-learner for target '1': join the ALS and UMAP meta-features on 'id' and
# fit a class-balanced logistic regression on them.
# Assumes ids are unique and Y rows follow the X1 row order -- TODO confirm.
for frame, ids in ((ds_1_, X1.id), (uds_1_, X1.id),
                   (ds_1_test_, X1_test.id), (uds_1_test_, X1_test.id)):
    frame['id'] = ids

lr = LogisticRegression(class_weight='balanced', C=2)
meta_train = ds_1_.merge(uds_1_, on='id').drop(columns=['id'])
lr.fit(meta_train, Y['1'])
meta_test = ds_1_test_.merge(uds_1_test_, on='id').drop(columns=['id'])
Y1 = lr.predict_proba(meta_test)[:, 1]

In [None]:
# Meta-learner for target '2': join the ALS and UMAP meta-features on 'id' and
# fit a class-balanced logistic regression on them.
# Assumes ids are unique and Y rows follow the X1 row order -- TODO confirm.
for frame, ids in ((ds_2, X1.id), (uds_2, X1.id),
                   (ds_2_test, X1_test.id), (uds_2_test, X1_test.id)):
    frame['id'] = ids

lr = LogisticRegression(class_weight='balanced', C=2)
meta_train = ds_2.merge(uds_2, on='id').drop(columns=['id'])
lr.fit(meta_train, Y['2'])
meta_test = ds_2_test.merge(uds_2_test, on='id').drop(columns=['id'])
Y2 = lr.predict_proba(meta_test)[:, 1]

In [None]:
# Meta-learner for target '3': join the ALS and UMAP meta-features on 'id' and
# fit a class-balanced logistic regression on them.
# Assumes ids are unique and Y rows follow the X1 row order -- TODO confirm.
for frame, ids in ((ds_3, X1.id), (uds_3, X1.id),
                   (ds_3_test, X1_test.id), (uds_3_test, X1_test.id)):
    frame['id'] = ids

lr = LogisticRegression(class_weight='balanced', C=2)
meta_train = ds_3.merge(uds_3, on='id').drop(columns=['id'])
lr.fit(meta_train, Y['3'])
meta_test = ds_3_test.merge(uds_3_test, on='id').drop(columns=['id'])
Y3 = lr.predict_proba(meta_test)[:, 1]

In [None]:
# Meta-learner for target '4': join the ALS and UMAP meta-features on 'id' and
# fit a class-balanced logistic regression on them.
# Assumes ids are unique and Y rows follow the X1 row order -- TODO confirm.
for frame, ids in ((ds_4, X1.id), (uds_4, X1.id),
                   (ds_4_test, X1_test.id), (uds_4_test, X1_test.id)):
    frame['id'] = ids

lr = LogisticRegression(class_weight='balanced', C=2)
meta_train = ds_4.merge(uds_4, on='id').drop(columns=['id'])
lr.fit(meta_train, Y['4'])
meta_test = ds_4_test.merge(uds_4_test, on='id').drop(columns=['id'])
Y4 = lr.predict_proba(meta_test)[:, 1]

In [None]:
# Meta-learner for target '5': join the ALS and UMAP meta-features on 'id' and
# fit a class-balanced logistic regression on them.
# Assumes ids are unique and Y rows follow the X1 row order -- TODO confirm.
for frame, ids in ((ds_5, X1.id), (uds_5, X1.id),
                   (ds_5_test, X1_test.id), (uds_5_test, X1_test.id)):
    frame['id'] = ids

lr = LogisticRegression(class_weight='balanced', C=2)
meta_train = ds_5.merge(uds_5, on='id').drop(columns=['id'])
lr.fit(meta_train, Y['5'])
meta_test = ds_5_test.merge(uds_5_test, on='id').drop(columns=['id'])
Y5 = lr.predict_proba(meta_test)[:, 1]

In [None]:
# Submission table: the test ids followed by one probability column per target.
df = pd.DataFrame({
    'id': X1_test.id,
    '1': Y1,
    '2': Y2,
    '3': Y3,
    '4': Y4,
    '5': Y5,
})

In [None]:
# Write the submission without the DataFrame index column.
SUBMISSION_FILE = 'final.csv'
df.to_csv(SUBMISSION_FILE, index=False)