In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek

from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [23]:
# Load the training and test tables from the local data directory.
# Forward-slash relative paths resolve on Windows and POSIX alike; the
# previous backslash form only worked on Windows.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() appended a stray "None" line to the output.
train.info()
print(train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219129 entries, 0 to 219128
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      219129 non-null  int64  
 1   Time    219129 non-null  float64
 2   V1      219129 non-null  float64
 3   V2      219129 non-null  float64
 4   V3      219129 non-null  float64
 5   V4      219129 non-null  float64
 6   V5      219129 non-null  float64
 7   V6      219129 non-null  float64
 8   V7      219129 non-null  float64
 9   V8      219129 non-null  float64
 10  V9      219129 non-null  float64
 11  V10     219129 non-null  float64
 12  V11     219129 non-null  float64
 13  V12     219129 non-null  float64
 14  V13     219129 non-null  float64
 15  V14     219129 non-null  float64
 16  V15     219129 non-null  float64
 17  V16     219129 non-null  float64
 18  V17     219129 non-null  float64
 19  V18     219129 non-null  float64
 20  V19     219129 non-null  float64
 21  V20     21

In [24]:
# Model input columns: 'Time' and 'Amount' first, then 'V1' through 'V28'.
features = ['Time', 'Amount'] + [f"V{idx}" for idx in range(1, 29)]

In [25]:
# Window bounds; not referenced by any other cell visible in this notebook.
start, end = 0, 3600

# Disabled experiment: shift 'Time' values above 86400 down by 86400.
#train['Time'] = np.where(train['Time'] > 86400, train['Time'] - 86400, train['Time'])
#test['Time'] = np.where(test['Time'] > 86400, test['Time'] - 86400, test['Time'])

In [26]:
def sampler(x, y, type, random_state=0):
    xsmp, ysmp = -1, -1
    match type:
        case 0: xsmp, ysmp = RandomOverSampler(random_state=random_state).fit_resample(x,y)
        case 1: xsmp, ysmp = SMOTE(random_state=random_state).fit_resample(x,y)
        case 2: xsmp, ysmp = ADASYN(random_state=random_state).fit_resample(x,y)
        case 3: xsmp, ysmp = SMOTETomek(random_state=random_state).fit_resample(x,y)
    return xsmp, ysmp


In [27]:
# Training inputs (feature columns) and target (a one-column 'Class' frame).
xtrn = train[features]
ytrn = train[['Class']]
# Test inputs restricted to the same feature columns.
xtst = test[features]

In [28]:
# 1. Normalize Data         # 1. Sample Data
# 2. Sample Data            # 2. Split Data
# 3. Split Data             # 3. Normalize Data
# 4. Model Selection        # 4. Model Selection

In [29]:
# 1. Normalize Data
# Keep the fitted scaler so the test features go through exactly the same
# min/max transform as the training features.
scaler = MinMaxScaler()
xtrn = scaler.fit_transform(xtrn)
# Previously xtst was left unscaled, so the model predicted on raw values
# after being trained on scaled ones. It is rebuilt from `test` (not from the
# current xtst) so re-running this cell does not scale it twice.
xtst = pd.DataFrame(
    scaler.transform(test[features]),
    columns=features,
    index=test.index,
)

# 2. Sample Data
# NOTE(review): oversampling before the split lets synthetic rows derived from
# training points land in the validation set, so validation scores are likely
# optimistic. Order kept as-is because it is the pipeline being compared.
xtrn, ytrn = sampler(xtrn, ytrn, 2)

# 3. Split Data
# Fixed seed so the hold-out split is reproducible across runs.
xtrn, xvld, ytrn, yvld = train_test_split(
    xtrn, ytrn, test_size=0.2, random_state=0
)


In [30]:
# Put the feature names back on the train/validation matrices.
# `columns` must be the flat list: wrapping it as [features] hands pandas a
# nested list, which it turns into a one-level MultiIndex instead of plain
# string column labels.
xtrn = pd.DataFrame(xtrn, columns=features)
xvld = pd.DataFrame(xvld, columns=features)
# xtst = pd.DataFrame(xtst, columns=features)

In [34]:
# 4. Model Selection

# a. Random Forest

# model = RandomForestClassifier()
# model.fit(xtrn, ytrn)
# 



In [None]:
# b. RandomForest + Adaboost

# rf = RandomForestClassifier()
# model = AdaBoostClassifier(estimator=rf)
# model.fit(xtrn, ytrn)
# yvld_ = model.predict(xvld)

# print(classification_report(yvld, yvld_))

# pred = model.predict(xtst)

In [None]:
# c. RandomForest + AdaBoost + KFold

# n_splits = 10
# kf = KFold(n_splits=n_splits)
# rf = RandomForestClassifier()
# model = AdaBoostClassifier(estimator=rf)

# models = []
# fold = 1
# for train_indices, vld_indices in kf.split(xtrn):
#     print(f"Fold {fold}")
#     print("-----------------------------------------------")
#     fold+=1

#     xtrn_, ytrn_ = pd.DataFrame(xtrn).iloc[train_indices], pd.DataFrame(ytrn).iloc[train_indices]
#     xvld_, yvld_ = pd.DataFrame(xtrn).iloc[vld_indices], pd.DataFrame(ytrn).iloc[vld_indices]

#     rf = RandomForestClassifier()
#     model = AdaBoostClassifier(estimator=rf)

#     model.fit(xtrn_, ytrn_)
#     yprd_ = model.predict(xvld_)

#     print(classification_report(yvld_, yprd_))
#     models.append(model)
#     print()

#     predictions = []

# for model in models:
#   yprd = model.predict(xtst[features])
#   predictions.append(yprd)
# summ = np.zeros(shape=predictions[0].shape)

# for p in predictions:
#     summ+=p

# summ = np.where(summ >=5, 1, 0)

In [None]:
# d. Just XGB - Best

# model = XGBClassifier()
# model.fit(xtrn, ytrn)

# yprd = model.predict(xvld)

# print(classification_report(yvld, yprd))

# pred = model.predict(xtst)

In [None]:
#e. XGB + KFold

# models = []
# fold=1
# kf = KFold(n_splits=10)
# for trn_ind, vld_ind in kf.split(xtrn):
#     xtrn_, ytrn_ = pd.DataFrame(xtrn).iloc[trn_ind], pd.DataFrame(ytrn).iloc[trn_ind]
#     xvld_, yvld_ = pd.DataFrame(xtrn).iloc[vld_ind], pd.DataFrame(ytrn).iloc[vld_ind]
#     xgb = XGBClassifier()
#     xgb.fit(xtrn_, ytrn_)

#     yprd_ = xgb.predict(xvld_)

#     print(f"Fold {fold}")
#     fold+=1
#     print(classification_report(yvld_, yprd_))

#     models.append(xgb)

# pred = models[0].predict(xtst)

In [None]:
#f. Perceptron

# model = Perceptron()
# model.fit(xtrn, ytrn)
# yprd = model.predict(xvld)

# print(classification_report(yvld, yprd))

# pred = model.predict(xtst)

In [None]:
#g. Perceptron (single fit on the hold-out split; the KFold variant is not implemented yet)

model = Perceptron()
# ytrn reaches this cell as a one-column frame; flatten it to 1-D, which is
# the shape scikit-learn estimators expect for the target.
model.fit(xtrn, np.ravel(ytrn))
yprd = model.predict(xvld)

# Per-class precision / recall / F1 on the validation split.
print(classification_report(yvld, yprd))

# Predictions for the test features; consumed by the submission cell.
pred = model.predict(xtst)

In [33]:
# Assemble the submission: one row per test id with its predicted class.
outp = pd.DataFrame({'id': test['id'], 'Class': pred})

# Plain relative filename: the old '.\\attempt18.csv' form is only a path on
# Windows; on POSIX it creates a file literally named '.\attempt18.csv'.
outp.to_csv('attempt18.csv', index=False)