In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Removing warnings



import warnings
warnings.filterwarnings('ignore')

# Render plots inline in the notebook output

%matplotlib inline

In [2]:
# Load the full training table (Feather format) from the Kaggle input directory.
train_feather_path = '/kaggle/input/amexfeather/train_data.ftr'
train_dataset_ = pd.read_feather(train_feather_path)

In [3]:
# Reduce the raw table to one row per customer: the last row of each
# customer's group, in the order the rows appear in the file. The result is
# indexed by customer_ID and sorted on that index.
# NOTE(review): this is the "latest statement" only if rows are already in
# chronological order within each customer — confirm against the data source.
train_dataset = (
    train_dataset_
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

In [4]:
# Drop sparse columns: a column survives only if more than 75% of its rows are
# populated. `thresh` is the minimum number of NON-null values required, so
# despite its name `min_null_count` is a non-null count.
min_null_count = int(0.75 * len(train_dataset) + 1)
train_dataset = train_dataset.dropna(axis='columns', thresh=min_null_count)

In [5]:
# Remove the S_2 column so it is not carried into the feature matrix.
train_dataset = train_dataset.drop(columns=["S_2"])

In [6]:
# Names of every column whose dtype is `category` or `object`, in column order.
# This list is reused later to choose between mode and median imputation.
categories = train_dataset.select_dtypes(include=['category','object']).columns.tolist()
categories

['D_63',
 'D_64',
 'D_68',
 'B_30',
 'B_38',
 'D_114',
 'D_116',
 'D_117',
 'D_120',
 'D_126']

In [7]:
# Replace each category/object column with integer codes.
# The same LabelEncoder instance is re-fit for every column, so once the loop
# finishes `enc` only remembers the classes of the last column it processed.
# NOTE(review): missing values have not been imputed at this point, so they are
# handed to the encoder as-is — confirm how the installed scikit-learn treats NaN.
enc = LabelEncoder()
columns_to_encode = list(train_dataset.select_dtypes(include=['category','object']).columns)
for column_name in columns_to_encode:
    encoded_values = enc.fit_transform(train_dataset[column_name])
    train_dataset[column_name] = encoded_values

In [8]:
# Impute missing values in the categorical columns with each column's most
# frequent value.
# `Series.mode()` returns a Series (indexed 0..k-1, one entry per tied mode).
# Passing that Series to `fillna` aligns it on the index, and since this frame
# is indexed by customer_ID no row would ever match — nothing would be filled.
# Taking the first mode as a scalar makes the fill actually apply.
for category in categories:
    column_modes = train_dataset[category].mode()
    # An all-NaN column has no mode; leave it untouched rather than fail.
    if not column_modes.empty:
        train_dataset[category] = train_dataset[category].fillna(column_modes.iloc[0])

In [9]:
# Impute every column that is NOT in `categories` with that column's median.
non_categorical_columns = [name for name in train_dataset.columns if name not in categories]
for column in non_categorical_columns:
    column_median = train_dataset[column].median()
    train_dataset[column] = train_dataset[column].fillna(column_median)

In [10]:
# Positional split: every column except the last becomes the feature matrix X;
# the last column is kept as a single-column DataFrame y.
# NOTE(review): presumably the last column is the prediction target — confirm
# against the dataset's column order.
feature_slice, label_slice = slice(None, -1), slice(-1, None)
X = train_dataset.iloc[:, feature_slice]
y = train_dataset.iloc[:, label_slice]

In [11]:
# Flag redundant features: a column is flagged when its correlation with any
# column that precedes it in X exceeds 0.9.
# Only the strictly-lower triangle of the correlation matrix is inspected, so
# the earlier column of each correlated pair is kept.
# The comparison is signed: strongly negative correlations are not flagged.
cor_matrix = X.corr()

corr_values = cor_matrix.values
below_diagonal = np.tril(np.ones(corr_values.shape, dtype=bool), k=-1)
is_redundant = ((corr_values > 0.9) & below_diagonal).any(axis=1)

col_core = set(cor_matrix.columns[is_redundant])
col_core

{'B_11',
 'B_13',
 'B_15',
 'B_23',
 'B_33',
 'B_37',
 'D_104',
 'D_119',
 'D_141',
 'D_143',
 'D_74',
 'D_75',
 'S_24',
 'S_7'}

In [12]:
# Remove the columns flagged as highly correlated from the feature matrix.
X = X.drop(columns=list(col_core))

In [13]:
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

In [14]:
# Display the training feature frame (the notebook renders a truncated view).
x_train

Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_44,B_4,...,D_129,B_41,D_130,D_131,D_133,R_28,D_139,D_140,D_144,D_145
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9bd44ca1f2f84150e5bd4b763c09cd138eeba13750cf88350578cd8b8a012bd1,0.516113,0.097046,0.138062,0.085510,0.004734,0.119812,0.003475,0.388916,0.377441,0.631836,...,1.001953,0.009758,0.006367,0.000141,0.004658,0.005013,0.006992,0.004738,0.003790,0.006172
30321b4a8b80d3429ce16322d5e46bf3de1d9b3ccecdd40c37578a913d698ba5,0.480225,0.774414,0.009285,0.232788,0.004555,0.161377,0.007545,0.067322,0.128662,0.030167,...,1.000000,0.007587,0.005093,0.000281,0.008339,0.006248,0.000399,0.002218,0.005753,0.008446
ccb34362035900506354729bb17636558cadb1dfed6a617b8dd1e864ecc8d650,0.917969,0.007545,0.006870,1.007812,0.004829,0.150879,0.006092,0.015457,0.006618,0.015053,...,0.005711,0.008308,1.006836,0.008163,0.006931,0.000503,0.007233,0.003401,0.009064,0.007889
972243d1b0a2ea4dcda2427ebd31db9b8ccddb5b78ee673ff2a47058485fd629,0.764648,0.479492,0.017593,1.008789,0.003031,0.025970,0.009186,0.002382,0.132812,0.645996,...,0.009903,0.009026,1.008789,0.998535,0.393311,0.000484,0.006371,0.002411,0.009789,0.003139
44fb840a4e521cc42f809c571bab9d438b7c863d6dc043d65df2b464b7a92d9f,0.516602,0.656738,0.418701,0.030960,0.757812,0.761230,0.009514,0.774902,0.754883,0.711914,...,1.005859,0.007210,1.009766,0.007805,0.001237,0.000468,0.007271,0.009758,0.005482,0.005642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bcced58957ffcc297512f4ee7d51dcf9f9f80e2f34b8d91b38112cf1a7a9b9e4,0.279053,0.036743,0.259766,0.025146,0.000261,0.173950,0.006100,0.730469,0.882812,0.804199,...,1.006836,0.009735,1.000977,0.007809,0.001313,0.001293,0.008934,0.007343,0.004494,0.009529
7c3847b6382892f5c198a908fc4a1342f471ba7e2562f3509a1b64c7225392b6,0.348633,0.388184,0.028809,0.401611,0.008461,2.992188,0.867188,0.094666,0.627930,0.312012,...,1.000977,0.003057,0.008034,0.001480,0.006527,0.006973,0.006828,0.008919,0.002354,0.004658
9722ce1374e54306b3a0e4d0c27453eb788072c49f9a63bb79f394e31d865c84,0.715820,0.265137,0.007965,0.816895,0.002092,0.137451,0.007675,0.008987,0.000676,0.019135,...,0.009964,0.001906,0.009804,0.005119,0.002993,0.001152,0.008934,0.000588,0.002995,0.004997
46f25716efd9c47422d0b7d1b87369a1f94cc0bcefce0a79e55966a1be67ff4e,0.642578,0.008255,0.168335,0.816895,0.009079,0.196655,0.003271,0.005402,0.005688,0.111450,...,0.009285,0.009361,0.004078,0.003099,0.000169,0.001531,0.007263,0.003910,0.007538,0.001724


In [15]:
# import lightgbm as lgb

# d_train = lgb.Dataset(x_train, label=y_train, categorical_feature = categories)

# params = {'objective': 'binary','n_estimators': 1200,'metric': 'binary_logloss','boosting': 'gbdt','num_leaves': 90,'reg_lambda' : 50,'colsample_bytree': 0.19,'learning_rate': 0.03,'min_child_samples': 2400,'max_bins': 511,'seed': 42,'verbose': -1}

# # trained model with 100 iterations
# model = lgb.train(params, d_train, 100)

In [16]:
from sklearn import svm

# Fit a support-vector classifier with a sigmoid kernel on the training split.
# `y_train` is a single-column DataFrame (it was sliced with `iloc[:, -1:]`),
# whereas the classifier expects a 1-D label array. Flattening it explicitly
# avoids relying on scikit-learn's implicit column-vector conversion, whose
# warning is hidden by the notebook's global warnings filter.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples — confirm this is feasible for the
# size of `x_train`.
model = svm.SVC(kernel='sigmoid').fit(x_train, np.ravel(y_train))


In [17]:
# #Import knearest neighbors Classifier model
# from sklearn.neighbors import KNeighborsClassifier

# #Create KNN Classifier
# model = KNeighborsClassifier(n_neighbors=11).fit(x_train, y_train)


In [18]:
# Load the full test table (Feather format) from the Kaggle input directory.
test_feather_path = '/kaggle/input/amexfeather/test_data.ftr'
test_dataset_ = pd.read_feather(test_feather_path)


In [19]:
# Same reduction as applied to the training table: keep the last row of each
# customer's group (in file order), index by customer_ID, and sort on it.
test_dataset = (
    test_dataset_
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

In [20]:
# Feature columns the model was trained on, in training order.
num_columns = list(X.columns)

In [21]:
# Restrict the test frame to exactly the training feature columns, in the same order.
test_dataset = test_dataset.loc[:, num_columns]

In [22]:
# Integer-encode the test categoricals with the SAME value -> code mapping that
# the training features received.
# Fitting a fresh encoder on the test data (as was done before) only yields
# matching codes if the test column happens to contain exactly the same set of
# values as the training column; otherwise the model is silently fed
# mislabelled categories.
# Here the encoder is fit on the raw training rows (latest row per customer,
# i.e. the same rows the training encoder saw) and the test column is only
# transformed. A value never seen in training makes `transform` raise
# ValueError, which surfaces the mismatch instead of hiding it.
categorical_columns = list(test_dataset.select_dtypes(include=['category','object']).columns)

# Raw (un-encoded) training values for just the columns that need encoding.
train_latest_raw = (
    train_dataset_[['customer_ID'] + categorical_columns]
    .groupby('customer_ID')
    .tail(1)
)

enc = LabelEncoder()
for categorical_column in categorical_columns:
    enc.fit(train_latest_raw[categorical_column])
    test_dataset[categorical_column] = enc.transform(test_dataset[categorical_column])

In [23]:
# Impute missing values in the test categorical columns with each column's most
# frequent value.
# `Series.mode()` returns a Series (indexed 0..k-1); passing it to `fillna`
# aligns on the index, and because this frame is indexed by customer_ID no row
# would match, so nothing would be filled. The first mode is used as a scalar.
for category in categories:
    # A categorical column may have been removed from the feature set by the
    # correlation filter; there is then nothing to impute for it.
    if category not in test_dataset.columns:
        continue
    column_modes = test_dataset[category].mode()
    # An all-NaN column has no mode; leave it untouched rather than fail.
    if not column_modes.empty:
        test_dataset[category] = test_dataset[category].fillna(column_modes.iloc[0])

In [24]:
# Impute every test column that is NOT in `categories` with that column's own
# median (computed on the test data, not carried over from training).
non_categorical_columns = [name for name in test_dataset.columns if name not in categories]
for column in non_categorical_columns:
    column_median = test_dataset[column].median()
    test_dataset[column] = test_dataset[column].fillna(column_median)

In [25]:
# Predict one label per test customer with the fitted classifier.
# The rows of `test_dataset` are in customer_ID index order, so `y_pred` is
# positionally aligned with `test_dataset.index`.
y_pred = model.predict(test_dataset)

In [26]:
# Assemble the submission table: one row per test customer, pairing each
# customer_ID (taken from the frame's index) with its predicted value.
output = pd.DataFrame(
    {
        'customer_ID': test_dataset.index,
        'prediction': y_pred,
    }
)


In [27]:
# Change the working directory to /kaggle/working so that the relative path
# used when writing the submission file resolves there.
cd /kaggle/working/

/kaggle/working


In [28]:
# Write the submission to the current working directory; index=False keeps the
# frame's row index out of the file, so only the two named columns are written.
output.to_csv('submission9.csv', index=False)