In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn warnings raised by later cells.



import warnings
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output

%matplotlib inline

# Load train dataset

In [None]:
# Read the raw training table from its Feather file; later cells reduce it
# to a single row per customer.
train_dataset_ = pd.read_feather(
    '/kaggle/input/amexfeather/train_data.ftr'
)

**Keep the latest statement features for each customer**

In [None]:
# For every customer_ID keep only its last row (in the frame's existing row
# order), index the result by customer_ID and sort on that index.
# NOTE(review): the last row is the latest statement only if the rows are
# already ordered chronologically within each customer — confirm.
train_dataset = (
    train_dataset_
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

# Remove features with 25% or more null values

In [None]:
# Despite its name, this is the minimum number of NON-null entries a column
# must have to be kept: strictly more than 75% of the rows.
min_null_count = int(0.75 * len(train_dataset) + 1)

# Drop every column that has fewer non-null values than the threshold.
train_dataset = train_dataset.dropna(axis='columns', thresh=min_null_count)

In [None]:
# Remove the S_2 column from the training frame (mutates the frame in place).
train_dataset.drop(columns=["S_2"], inplace=True)

Drop columns that are not useful

# Handle Categorical Features

In [None]:
# Names of all columns whose dtype is 'category' or 'object'; reused by the
# imputation and model cells further down.
categories = train_dataset.select_dtypes(include=['category','object']).columns.tolist()
categories

In [None]:
# Replace every 'category'/'object' column with integer codes.
# The single encoder object is re-fitted for each column, so after the loop
# `enc` only holds the mapping of the last column processed.
enc = LabelEncoder()
for col in train_dataset.select_dtypes(include=['category', 'object']):
    train_dataset[col] = enc.fit_transform(train_dataset[col])

# Handle Null values

In [None]:
# Fill missing values in each categorical column with that column's most
# frequent value.
#
# Series.mode() returns a Series indexed 0..k-1. Passing that Series straight
# to fillna() aligns it on the index (customer_ID here), so no row ever
# matched and nothing was filled. Use the first mode as a scalar instead.
for category in categories:
    modes = train_dataset[category].mode()
    if modes.empty:
        # Column holds only missing values: there is no mode to fill with.
        continue
    train_dataset[category] = train_dataset[category].fillna(modes.iloc[0])

In [None]:
# Fill missing values in every column that is not listed in `categories`
# with that column's median.
for column in train_dataset.columns:
    if column in categories:
        continue
    column_median = train_dataset[column].median()
    train_dataset[column] = train_dataset[column].fillna(column_median)

# Split features and target

In [None]:
# X: every column except the last one; y: the last column, kept as a
# single-column DataFrame.
# NOTE(review): presumably the last column is the target — confirm.
X, y = train_dataset.iloc[:, :-1], train_dataset.iloc[:, -1:]

# Drop Correlated Features

In [None]:
# Collect the features that are highly correlated with an earlier feature.
#
# The absolute value is used so that strongly negative correlations
# (e.g. -0.95) are flagged as well; comparing the signed value only caught
# positive ones.
cor_matrix = X.corr()
abs_corr = cor_matrix.abs()

# For column i only look at columns j < i (lower triangle). Of each correlated
# pair the later column is flagged, so the earlier one is the one kept.
# NaN correlations compare as False and are therefore never flagged.
col_core = {
    abs_corr.columns[i]
    for i in range(len(abs_corr.columns))
    if (abs_corr.iloc[i, :i] > 0.9).any()
}
col_core

In [None]:
# Remove every column flagged in col_core from the feature matrix.
X = X.drop(columns=list(col_core))

# Train/Test Split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

In [None]:
# Display the training feature matrix as the cell output.
x_train

# Lightgbm

In [None]:
import lightgbm as lgb

# Declare as categorical only the columns still present in x_train: LightGBM
# rejects unknown names, and a categorical column may have been removed by
# the correlation filter.
lgb_categories = [col for col in categories if col in x_train.columns]
d_train = lgb.Dataset(x_train, label=y_train, categorical_feature=lgb_categories)

# The number of boosting rounds is passed to lgb.train() below rather than
# through params. Previously params carried 'n_estimators': 1200 (an alias of
# num_iterations), which overrode the positional 100 passed to train().
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 90,
    'reg_lambda': 50,
    'colsample_bytree': 0.19,
    'learning_rate': 0.03,
    'min_child_samples': 2400,
    'max_bins': 511,
    'seed': 42,
    'verbose': -1,
}

# Train for 1200 rounds: the value that was effectively in use.
model = lgb.train(params, d_train, num_boost_round=1200)

# SVM

In [None]:
from sklearn import svm

# Fit a linear-kernel support vector classifier; this rebinds `model`,
# replacing whatever estimator it referred to before.
model = svm.SVC(kernel='linear')
model.fit(x_train, y_train)

# KNN Classifier

In [None]:
# k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# Build the classifier with 11 neighbours and fit it on the training split;
# this rebinds `model`, replacing the previously fitted estimator.
model = KNeighborsClassifier(n_neighbors=11)
model.fit(x_train, y_train)


# Load test data and apply the same preprocessing used for the train data

In [None]:
# Read the raw test table from its Feather file.
test_dataset_ = pd.read_feather(
    '/kaggle/input/amexfeather/test_data.ftr'
)


In [None]:
# Same reduction as for the training data: keep the last row of every
# customer_ID, index by customer_ID and sort on that index.
test_dataset = (
    test_dataset_
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

In [None]:
# All feature columns the models were trained on (every column of X, not
# only numeric ones).
num_columns = list(X.columns)

In [None]:
# Restrict the test frame to the training feature columns, in the same order.
test_dataset = test_dataset.loc[:, num_columns]

In [None]:
# Replace every 'category'/'object' column of the test frame with integer
# codes, using a fresh encoder fitted on the test values themselves.
# NOTE(review): these codes match the training codes only if each column
# holds the same set of values in both frames — confirm, or reuse encoders
# fitted on the training data.
enc = LabelEncoder()
for col in test_dataset.select_dtypes(include=['category', 'object']):
    test_dataset[col] = enc.fit_transform(test_dataset[col])

In [None]:
# Fill missing values in each categorical column of the test frame with that
# column's most frequent value.
#
# Series.mode() returns a Series indexed 0..k-1. Passing it straight to
# fillna() aligns on the index (customer_ID here), so nothing was ever
# filled. Use the first mode as a scalar instead.
for category in categories:
    if category not in test_dataset.columns:
        # The test frame only keeps the columns of X, so a categorical column
        # removed by the correlation filter is absent here.
        continue
    modes = test_dataset[category].mode()
    if modes.empty:
        # Column holds only missing values: there is no mode to fill with.
        continue
    test_dataset[category] = test_dataset[category].fillna(modes.iloc[0])

In [None]:
# Fill missing values in every test column that is not listed in
# `categories` with that column's median, computed on the test data itself.
for column in test_dataset.columns:
    if column in categories:
        continue
    column_median = test_dataset[column].median()
    test_dataset[column] = test_dataset[column].fillna(column_median)

In [None]:
# Predict on the preprocessed test frame with whichever estimator `model`
# currently refers to (it is reassigned by each of the model cells above).
y_pred = model.predict(test_dataset)

# CatBoost with bagging

In [None]:
# CatBoostClassifier was used here without ever being imported.
from catboost import CatBoostClassifier

# Declare as categorical only the columns still present in x_train: a
# categorical column may have been removed by the correlation filter.
cat_features = [col for col in categories if col in x_train.columns]

# Bagging over random seeds: train 10 classifiers that differ only in their
# seed and collect each one's positive-class probability for the test rows.
predictions = []

for i in range(10):
    classifier = CatBoostClassifier(random_seed=i)
    # The training split is named x_train; X_train was never defined.
    classifier.fit(x_train, y_train, cat_features=cat_features)
    predictions.append(classifier.predict_proba(test_dataset)[:, 1])

In [None]:
# Average the per-model probabilities row by row (axis 0 runs over models).
y_pred = np.asarray(predictions).mean(axis=0)

# Submission

In [None]:
# Submission table: one row per customer (taken from the test frame's index)
# with its predicted value.
output = pd.DataFrame(
    {
        'customer_ID': test_dataset.index,
        'prediction': y_pred,
    }
)

In [None]:
# IPython automagic (equivalent to %cd): switch the working directory so the
# submission file written below lands in /kaggle/working/.
cd /kaggle/working/

In [None]:
# Write the submission to the current working directory, without the
# DataFrame's positional index.
output.to_csv(
    'submission9.csv',
    index=False,
)