In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#preprocesing
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Removing warnings
import warnings
warnings.filterwarnings('ignore')

# Storing plots

%matplotlib inline

In [3]:
# pip install catboost

# Load train dataset

In [4]:
train_dataset = pd.read_feather('/kaggle/input/amexfeather/train_data.ftr')

In [5]:
train_dataset = train_dataset.set_index('customer_ID', drop=True)

In [None]:
train_dataset.describe()

In [None]:
train_dataset.shape

In [None]:
train_dataset.isnull().sum()

# Remove Features with min 75% null values

In [6]:
min_null_count =  int(((100-25)/100)*train_dataset.shape[0] + 1)
train_dataset = train_dataset.dropna( axis=1, 
                thresh=min_null_count)

In [None]:
train_dataset.shape

In [None]:
# for column in train_dataset.columns:
#     print(column,"----->", train_dataset[column].unique())

Drop unuseful coulums

In [7]:
train_dataset.drop(["S_2"],axis=1,inplace=True)

In [None]:
# train_dataset.drop(["customer_ID"],axis=1,inplace=True)

# Handle Categorical Features

In [8]:
categories=[]
for categorical_column in train_dataset.select_dtypes(include=['category','object']).columns:
    categories.append(categorical_column)
categories

['D_63',
 'D_64',
 'D_68',
 'B_30',
 'B_38',
 'D_114',
 'D_116',
 'D_117',
 'D_120',
 'D_126']

In [None]:
# enc = LabelEncoder()
# for categorical_column in train_dataset.select_dtypes(include=['category','object']).columns:
#     print(train_dataset[categorical_column].head())
#     features[categorical_column]=enc.fit_transform(features[categorical_column]

In [9]:
enc = LabelEncoder()
for categorical_column in train_dataset.select_dtypes(include=['category','object']).columns:
    train_dataset[categorical_column]=enc.fit_transform(train_dataset[categorical_column])

# Handle Null values

In [10]:
for category in categories:
    train_dataset[category]=train_dataset[category].fillna(train_dataset[category].mode())

In [10]:
train_dataset["P_2"].isnull().sum()

45985

In [11]:
train_dataset["P_2"].mean()

nan

In [11]:
for column in train_dataset.columns:
    if(column not in categories):
        print(column)
        train_dataset[column]=train_dataset[column].fillna(train_dataset[column].median())

P_2
D_39
B_1
B_2
R_1
S_3
D_41
B_3
D_44
B_4
D_45
B_5
R_2
D_46
D_47
D_48
B_6
B_7
B_8
D_51
B_9
R_3
D_52
P_3
B_10
S_5
B_11
S_6
D_54
R_4
S_7
B_12
S_8
D_55
B_13
R_5
D_58
B_14
D_59
D_60
D_61
B_15
S_11
D_62
D_65
B_16
B_18
B_19
B_20
S_12
R_6
S_13
B_21
D_69
B_22
D_70
D_71
D_72
S_15
B_23
P_4
D_74
D_75
B_24
R_7
B_25
B_26
D_78
D_79
R_8
S_16
D_80
R_10
R_11
B_27
D_81
S_17
R_12
B_28
R_13
D_83
R_14
R_15
D_84
R_16
S_18
D_86
R_17
R_18
B_31
S_19
R_19
B_32
S_20
R_20
R_21
B_33
D_89
R_22
R_23
D_91
D_92
D_93
D_94
R_24
R_25
D_96
S_22
S_23
S_24
S_25
S_26
D_102
D_103
D_104
D_107
B_36
B_37
R_27
D_109
D_112
B_40
D_113
D_115
D_118
D_119
D_121
D_122
D_123
D_124
D_125
D_127
D_128
D_129
B_41
D_130
D_131
D_133
R_28
D_139
D_140
D_141
D_143
D_144
D_145
target


# Split fatrues and target

In [12]:
X = train_dataset.iloc[:, :-1]
y = train_dataset.iloc[:, -1:]

In [14]:
y.head()

Unnamed: 0_level_0,target
customer_ID,Unnamed: 1_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0


In [13]:
del train_dataset

In [14]:
import gc
gc.collect()

142

Get the average value of rows of each customer

In [15]:
X = X.groupby('customer_ID').mean()

In [None]:
X.head()

# Drop Correlated Features

In [17]:
cor_matrix = X.corr()
col_core = set()

for i in range(len(cor_matrix.columns)):
    for j in range(i):
        if(cor_matrix.iloc[i, j] > 0.9):
            col_name = cor_matrix.columns[i]
            col_core.add(col_name)
col_core

{'B_11',
 'B_15',
 'B_20',
 'B_23',
 'B_33',
 'B_37',
 'D_104',
 'D_119',
 'D_141',
 'D_143',
 'D_74',
 'D_75',
 'S_24',
 'S_7'}

In [18]:
X = X.drop(col_core, axis=1)

In [19]:
y= y.groupby('customer_ID').mean()

In [22]:
y.head()

Unnamed: 0_level_0,target
customer_ID,Unnamed: 1_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.0


In [20]:
y=y.round(0).astype(int)

In [21]:
for category in categories:
    X[category]=X[category].round(0).astype(int)

# Train/Test Split

In [22]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=23)

In [27]:
x_train.isnull().sum()

P_2      0
D_39     0
B_1      0
B_2      0
R_1      0
        ..
R_28     0
D_139    0
D_140    0
D_144    0
D_145    0
Length: 141, dtype: int64

# LogisticRegression

In [23]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

LogisticRegression(max_iter=1000)

# LinearRegression

In [20]:
# model = LinearRegression()
# model.fit(x_train, y_train)

LinearRegression()

# Lightgbm

In [None]:
import lightgbm as lgb

d_train = lgb.Dataset(x_train, label=y_train, categorical_feature = categories)

params = {'objective': 'binary','n_estimators': 1200,'metric': 'binary_logloss','boosting': 'gbdt','num_leaves': 90,'reg_lambda' : 50,'colsample_bytree': 0.19,'learning_rate': 0.03,'min_child_samples': 2400,'max_bins': 511,'seed': 42,'verbose': -1}

# trained model with 100 iterations
model = lgb.train(params, d_train, 100)

# CatBoostClassifier

In [26]:
from catboost import CatBoostClassifier
#build model
model = CatBoostClassifier(cat_features = categories)
# Fit model 
model.fit( x_train, y_train,
               eval_set=(x_test, y_test),
               plot=True)

# Xgboost

In [2]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(x_train, y_train)

In [3]:
model = XGBClassifier(n_estimators=500)
model.fit(x_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(x_test, y_test)],
             verbose=False)

In [None]:
# xgb1 = XGBRegressor()
# parameters = {'nthread':[4], 
#               'objective':['reg:linear'],
#               'learning_rate': [.4, .03, .02,.04,.05], 
#               'max_depth': [3,4,6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [1000]}

# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.04], #so called `eta` value
#               'max_depth': [8],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [1250]}

# parameters={'colsample_bytree': [0.7], 'learning_rate': [0.04], 'max_depth':[8],
# 'min_child_weight':[4], 'n_estimators': [1250], 'nthread': [4], 'objective':
#             ['reg:linear'], 'silent': [1], 'subsample':[ 0.7]}

# parameters={"learning_rate" :[0.1],
#  "n_estimators":[1000],
#  "max_depth":[5],
#  "min_child_weight":[1],
#  "gamma":[0],
#  "subsample":[0.8],
#  "colsample_bytree":[0.8],
#  "objective": ['reg:linear'],
#  "nthread":[4],
#  "scale_pos_weight":[1],
#  "seed":[27]
# }

# xgb_grid = GridSearchCV(xgb1,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# xgb_grid.fit(x_train,
#          y_train)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

In [None]:
# # XGB MODEL PARAMETERS
# xgb_parms = { 
#     'max_depth':4, 
#     'learning_rate':0.05, 
#     'subsample':0.8,
#     'colsample_bytree':0.6, 
#     'eval_metric':'logloss',
#     'objective':'binary:logistic',
#     'tree_method':'gpu_hist',
#     'predictor':'gpu_predictor',
#     'random_state':SEED
# }

# Load Test Data and apply same methods done for test data

In [24]:
test_dataset = pd.read_feather('/kaggle/input/amexfeather/test_data.ftr')

In [25]:
test_dataset = test_dataset.set_index('customer_ID', drop=True)

In [26]:
num_columns = [col for col in X.columns]

In [27]:
test_dataset=test_dataset[num_columns]

In [28]:
enc = LabelEncoder()
for categorical_column in test_dataset.select_dtypes(include=['category','object']).columns:
    test_dataset[categorical_column]=enc.fit_transform(test_dataset[categorical_column])

In [29]:
for category in categories:
    test_dataset[category]=test_dataset[category].fillna(test_dataset[category].mode())

In [30]:
for column in test_dataset.columns:
    if(column not in categories):
#         print(column)
        test_dataset[column]=test_dataset[column].fillna(test_dataset[column].median())

In [31]:
test_dataset = test_dataset.groupby('customer_ID').mean()

In [32]:
for category in categories:
    test_dataset[category]=test_dataset[category].round(0).astype(int)

In [33]:
y_pred = model.predict(test_dataset)

In [None]:
# def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

#     def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         four_pct_cutoff = int(0.04 * df['weight'].sum())
#         df['weight_cumsum'] = df['weight'].cumsum()
#         df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
#         return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
#     def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
#         total_pos = (df['target'] * df['weight']).sum()
#         df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
#         df['lorentz'] = df['cum_pos_found'] / total_pos
#         df['gini'] = (df['lorentz'] - df['random']) * df['weight']
#         return df['gini'].sum()

#     def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         y_true_pred = y_true.rename(columns={'target': 'prediction'})
#         return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

#     g = normalized_weighted_gini(y_true, y_pred)
#     d = top_four_percent_captured(y_true, y_pred)

#     return 0.5 * (g + d)

In [None]:
# print(amex_metric(pd.DataFrame({'target':y_test["target"].values}), pd.DataFrame({'prediction':y_pred})))

In [60]:
def amex_metric(y_true, y_pred):
    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)

In [4]:
print(amex_metric(y_test["target"].values,y_pred))

# Submission

In [34]:
output = pd.DataFrame({'customer_ID': test_dataset.index, 'prediction': y_pred})


In [35]:
cd /kaggle/working/

/kaggle/working


In [36]:
output.to_csv('submission7.csv', index=False)