In [None]:
!gdown --id dataset_id --output train.csv
!gdown --id  dataset_id test.csv

In [None]:
# Install the third-party packages this notebook imports into the running kernel's environment.
# NOTE(review): versions are unpinned, so the installed versions may differ between runs.
%pip install -U scikit-learn
%pip install xgboost
%pip install wandb

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import zipfile
import os

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV

import wandb

from xgboost.sklearn import XGBClassifier

from scipy.stats import uniform, randint

### Configuration

In [None]:
# Run-wide settings read by the cells below.
config = {
    # Rows per chunk when streaming the training CSV.
    'batch_size' : 10000,
    # Number of chunks to load when 'all_data' is False.
    'batch_number' : 500,
    # Paths of the training and test CSV files.
    'train_file' : "train.csv",
    'test_file' : "test.csv",
    # Column indices of the feature subset to use when 'all_feature' is False.
    'feature_selected' : [4, 5, 7, 8, 9, 11, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23],
    # True: keep every feature column (feature_select returns its inputs unchanged).
    'all_feature' : True,
    # False: load only a random sample of chunks instead of the whole training file.
    'all_data' : False
}

# Seed Python's `random` module, which select_random_numbers draws from.
random.seed(0)

### Utility Functions

In [None]:
def select_random_numbers(lower_limit, upper_limit, n):
  """Draw ``n`` distinct integers from ``[lower_limit, upper_limit]`` (both ends included).

  The draw uses the module-level ``random`` generator, so the result depends
  on its current seed state. The picks are returned as an ascending list.
  """
  candidates = range(lower_limit, upper_limit + 1)
  picks = random.sample(candidates, n)
  picks.sort()
  return picks


def feature_select(x_train, x_test):
    """Restrict the train/test arrays to the configured feature columns.

    Reads the module-level ``config``:
      * ``config['all_feature']`` truthy -> both arrays are returned unchanged.
      * otherwise -> both arrays are column-indexed with
        ``config['feature_selected']`` (a list of column positions).

    Args:
        x_train: 2-D array of training features (indexed as ``x[:, idx]``).
        x_test: 2-D array of test features with the same column layout.

    Returns:
        Tuple ``(x_train, x_test)`` holding the selected columns.
    """
    if config['all_feature']:
        return x_train, x_test

    # The index list lives under 'feature_selected'; 'all_feature' is only the
    # boolean switch and must not be used as an indexer.
    select_idx = config['feature_selected']
    return x_train[:, select_idx], x_test[:, select_idx]



def create_zip_archive(source_folder, zip_filename):
    """Pack every file found under ``source_folder`` into a new zip file.

    The archive is written to ``zip_filename`` (opened in ``'w'`` mode) and each
    entry is stored under its path relative to ``source_folder``.
    """
    with zipfile.ZipFile(zip_filename, mode='w') as archive:
        for current_dir, _subdirs, filenames in os.walk(source_folder):
            full_paths = (os.path.join(current_dir, name) for name in filenames)
            for full_path in full_paths:
                archive.write(full_path, arcname=os.path.relpath(full_path, source_folder))

### Preprocessing Functions

In [None]:
def preprocess_training_data(data, imputer = None):
  """Clean one frame of raw training rows.

  Steps: drop the ``txkey`` column, integer-encode the ID-like columns
  (``chid``, ``cano``, ``mchno``, ``acqic``), fill missing ``stscd`` with 0,
  then deal with the remaining NaNs.

  Args:
    data: DataFrame containing at least ``txkey``, the four ID columns and
      ``stscd``. The caller's frame is not modified (``drop`` returns a new one).
    imputer: Optional object exposing ``fit_transform``. When ``None`` every
      row that still contains a NaN is dropped; otherwise the numeric feature
      columns are imputed in place of dropping rows.

  Returns:
    The processed DataFrame.
  """
  print("Dropping attributes...")
  data = data.drop(["txkey"], axis=1)

  # Encode data
  print("Encoding attributes...")
  label_encoder = LabelEncoder()
  id_type_columns = ["chid", "cano", "mchno", "acqic"]

  # NOTE: the encoder is re-fitted on each column of *this* frame, so the
  # integer codes are only meaningful within a single call.
  for column in id_type_columns:
      label_encoder.fit(data[column])
      data[column] = label_encoder.transform(data[column])

  numeric_features = data.select_dtypes(include=['float64', 'int64']).columns

  # Deal with NaN
  data["stscd"] = data["stscd"].fillna(0)
  if imputer is None:
    data = data.dropna(axis = 0, how = "any")
  else:
    # Keep the target out of the imputation: it must not be rewritten by the
    # imputer, nor be used as a predictor for the imputed feature values.
    feature_columns = [column for column in numeric_features if column != "label"]
    data[feature_columns] = imputer.fit_transform(data[feature_columns])

  return data


def preprocess_testing_data(data, imputer = None):
  """Clean the raw test rows and split off their transaction keys.

  Steps: set the ``txkey`` column aside, integer-encode the ID-like columns
  (``chid``, ``cano``, ``mchno``, ``acqic``), fill missing ``stscd`` with 0,
  and optionally impute the remaining NaNs.

  Args:
    data: DataFrame containing at least ``txkey``, the four ID columns and
      ``stscd``. The caller's frame is not modified (``drop`` returns a new one).
    imputer: Optional object exposing ``fit_transform``. When given it is
      fitted on, and applied to, every remaining column of ``data``.

  Returns:
    Tuple ``(features, test_data_keys)``: the processed DataFrame and the
    original ``txkey`` Series.
  """
  test_data_keys = data["txkey"]
  data = data.drop(["txkey"], axis = 1)

  # Encode data
  print("Encoding attributes...")
  label_encoder = LabelEncoder()
  id_type_columns = ["chid", "cano", "mchno", "acqic"]

  # NOTE: the encoder is re-fitted on each column of *this* frame, so the
  # integer codes are only meaningful within a single call.
  for column in id_type_columns:
      label_encoder.fit(data[column])
      data[column] = label_encoder.transform(data[column])

  # Deal with NaN
  data["stscd"] = data["stscd"].fillna(0)
  if imputer is not None:
    imputed = imputer.fit_transform(data)
    if imputed.shape[1] == data.shape[1]:
      # Keep the column names and row index so the result stays aligned with
      # test_data_keys and remains readable (e.g. in isnull().sum()).
      data = pd.DataFrame(imputed, columns=data.columns, index=data.index)
    else:
      # The imputer changed the column count (e.g. dropped a column), so the
      # original names can no longer be mapped one-to-one.
      data = pd.DataFrame(imputed, index=data.index)

  return data, test_data_keys


### Plot the distribution of the data

In [None]:
def plot_data_distribution(data_before, data_after, columns, sample_size=None):
    """Save and show side-by-side KDE plots of each column before/after preprocessing.

    One figure per column is produced (left: ``data_before``, right:
    ``data_after``) and written to ``src/<column>_comparison.png``. A panel is
    only drawn when that side's column is numeric; otherwise it is left empty
    with just its title.

    Args:
        data_before: DataFrame holding the raw values.
        data_after: DataFrame holding the processed values.
        columns: Iterable of column names to plot, or ``None`` for no
            restriction. Only names present in both frames are plotted.
        sample_size: Optional number of rows to sample from each frame
            (``random_state=42``); capped at each frame's length.
    """
    if sample_size is not None:
        # The processed frame can be shorter than the raw one (rows dropped),
        # so cap n per frame instead of letting sample() raise.
        data_before = data_before.sample(n=min(sample_size, len(data_before)), random_state=42)
        data_after = data_after.sample(n=min(sample_size, len(data_after)), random_state=42)

    print(data_before.shape)
    print(data_after.shape)

    # Use an ordered list: pandas does not accept a set as a column indexer,
    # and a list keeps the plotting order deterministic.
    after_columns = set(data_after.columns)
    common_columns = [name for name in data_before.columns if name in after_columns]
    if columns is not None:
        requested = set(columns)
        common_columns = [name for name in common_columns if name in requested]

    data_before = data_before[common_columns]
    data_after = data_after[common_columns]

    figure_type_number = len(common_columns)

    save_dir = "src"
    os.makedirs(save_dir, exist_ok=True)

    for i, column in enumerate(common_columns):
        print(f"Column Processing: {column}")

        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

        panels = ((axes[0], data_before, "before"), (axes[1], data_after, "after"))
        for ax, frame, tag in panels:
            ax.set_title(f'{column} - {tag}')
            # A KDE is only defined for numeric data.
            if pd.api.types.is_numeric_dtype(frame[column]):
                sns.kdeplot(frame[column], ax=ax)

        print("{} in {} diagrams is printed.".format(i + 1, figure_type_number))

        save_path = os.path.join(save_dir, f'{column}_comparison.png')
        fig.savefig(save_path)

        plt.show()

        # Close this specific figure so long runs do not accumulate open figures.
        plt.close(fig)


### Training Data Processing

In [None]:
# Stream the training CSV in chunks, preprocess each selected chunk, and build:
#   training_data_before - the raw selected rows
#   training_data_after  - the same rows after preprocess_training_data
#   training_data        - an independent copy of training_data_after for modelling
batch_size = config['batch_size']
train_file = config['train_file']

# Column names from the header row (kept as a module-level name in case other cells read it).
original_columns = pd.read_csv(train_file, nrows=1).columns.tolist()

# imputer = IterativeImputer(max_iter=8, random_state=0)
imputer = None

all_data = config['all_data']
batch_number = config['batch_number']

# Highest chunk index eligible for sampling. 868 was previously hard-coded;
# it presumably matches the number of chunks in the full file -- TODO confirm.
max_chunk_index = config.get('max_chunk_index', 868)

selected_chunk_idx = []
if not all_data:
  selected_chunk_idx = select_random_numbers(0, max_chunk_index, batch_number)
wanted_chunks = set(selected_chunk_idx)

chunks_before = []
chunks_after = []
loaded = 0

# No `skiprows` here: skipping `batch_size` lines would throw away the header
# plus the first rows of data and promote a data row to header.
for i, chunk in enumerate(pd.read_csv(train_file, chunksize=batch_size)):

  if not all_data:
    if loaded >= batch_number:
      break
    if i not in wanted_chunks:
      continue

  print("Start Loading batch no.{} ...".format(i))
  loaded += 1
  print("Already Load {} batches !!".format(loaded))

  chunk_before = chunk
  chunk_after = preprocess_training_data(chunk_before, imputer)

  # Collect the pieces and concatenate once after the loop; concatenating
  # inside the loop re-copies all previously loaded rows on every iteration.
  chunks_before.append(chunk_before)
  chunks_after.append(chunk_after)

training_data_before = pd.concat(chunks_before, ignore_index=True) if chunks_before else pd.DataFrame()
training_data_after = pd.concat(chunks_after, ignore_index=True) if chunks_after else pd.DataFrame()

training_data = training_data_after.copy()

In [None]:
# Class balance of the loaded training rows: row count per distinct `label` value.
print(training_data['label'].value_counts())

### Testing Data Processing

In [None]:
# Load the whole test CSV in one read.
test_file  = config['test_file']
testing_data = pd.read_csv(test_file)

# This imputer is passed to preprocess_testing_data, which calls fit_transform on
# the test rows themselves.
# NOTE(review): the training cell uses imputer = None (rows with NaN are dropped),
# so the two splits handle missing values differently -- confirm this is intended.
imputer = IterativeImputer(max_iter=8, random_state=0)

# `testing_data` is rebound to the processed frame; the raw frame is not kept.
testing_data, test_data_keys = preprocess_testing_data(testing_data, imputer)

In [None]:
# Count of missing values per column in the processed test frame.
print(testing_data.isnull().sum())

### Visualize the Data

In [None]:
# Plot each column's distribution before vs. after preprocessing; the figures
# are also written as PNG files under the "src" directory.
plot_data_distribution(training_data_before, training_data_after, training_data.columns)

In [None]:
# Bundle everything under "src" (where the comparison figures are saved) into one zip file.
source_directory = "src"
zip_file_name = "src.zip"

create_zip_archive(source_directory, zip_file_name)
print(f"{zip_file_name} created successfully.")

### Final Data Processing

In [None]:
# Separate the target column from the features.
# The guard keeps this cell re-runnable: once `label` has been split off,
# running the cell again skips the split instead of raising a KeyError.
if "label" in training_data.columns:
  training_data_labels = training_data["label"]
  training_data = training_data.drop(["label"], axis = 1)

# Transform data frames to arrays
print("Transforming data frames to arrays...")
training_data_2Darray = training_data.values
training_data_label_array = training_data_labels.values
test_data_2Darray = testing_data.values
test_data_key_array = test_data_keys.values

# Apply the configured feature subset (a no-op when config['all_feature'] is True).
training_data_2Darray, test_data_2Darray = feature_select(training_data_2Darray, test_data_2Darray)

### Training

In [None]:
# Hyper-parameter search space for the randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = dict(
    learning_rate=uniform(0, 0.5),
    max_depth=randint(3, 12),
    n_estimators=randint(200, 400),
    subsample=uniform(0.2, 0.5),
    colsample_bytree=uniform(0.2, 0.8),
    gamma=uniform(0.0, 10.0),
)

# 30 sampled configurations, 5-fold cross-validation, ranked by F1.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(),
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    random_state=42,
    scoring='f1',
)
random_search.fit(training_data_2Darray, training_data_label_array)

best_params = random_search.best_params_
print(best_params)

# The best configuration found by the search.
best_model = random_search.best_estimator_
feature_importance = best_model.feature_importances_

predicted_data = best_model.predict(test_data_2Darray)

# Build the submission rows: a header row followed by one (txkey, pred) pair per test row.
prediction_pair_data = [["txkey", "pred"]]
prediction_pair_data.extend(
    [str(test_data_key_array[row]), str(int(predicted_data[row]))]
    for row in range(len(test_data_key_array))
)

# header=None because the header is already the first data row.
pd.DataFrame(prediction_pair_data).to_csv("result.csv", header=None, index=False, encoding='utf-8')

In [None]:
# Pair each feature name with the importance reported by the best model and save it as CSV.
feature_names = training_data.columns.tolist()

print(feature_importance)

selected_idx = config['feature_selected']
# The config flag is 'all_feature' (there is no 'select_all' key): when it is
# False the model was trained on the 'feature_selected' subset only, so the
# names must be narrowed to the same columns.
if not config['all_feature']:
  feature_names = [feature_names[i] for i in selected_idx]

feature_df = pd.DataFrame({'FeatureName': feature_names, 'Importance': feature_importance})

csv_file_path = 'feature_importance.csv'
feature_df.to_csv(csv_file_path, index=False)