In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import sys

# Show every column when a DataFrame is displayed. Use the canonical option
# key: the previous spelling ("display.max.columns") only resolved because
# pandas treats the key as a regular expression in which "." matches "_".
pd.set_option("display.max_columns", None)

In [2]:
def reduce_mem_usage(data_df, sparse=False):
    """Downcast the columns of a Pandas DF to smaller dtypes to save memory.

    Adapted from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    * NumPy integer columns are cast to the narrowest signed integer type
      (int8 -> int64) whose range contains the column's min and max. The
      bounds are inclusive, so e.g. a column spanning -128..127 fits int8.
    * NumPy float columns are cast to float16 or float32 when their range
      fits, otherwise to float64.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, existing categoricals, ...) is left
      untouched.

    The converted columns are assigned back onto ``data_df`` itself, so the
    caller's frame is modified in place. Memory usage before and after is
    printed.

    NOTE: the float downcast only checks the value *range*. float16 keeps
    roughly three significant decimal digits, so values may be rounded.

    :param data_df: Pandas DF to shrink (modified in place)
    :type data_df: :class:`pandas.DataFrame`
    :param sparse: when true, the downcast frame is converted to a SciPy CSR
        matrix and that matrix is returned instead of the frame
    :type sparse: bool
    :return: the downcast Pandas DF, or a CSR matrix when ``sparse`` is true
    :rtype: :class:`pandas.DataFrame` or :class:`scipy.sparse.csr_matrix`
    """
    megabyte = 1024**2
    start_mem = data_df.memory_usage().sum()/megabyte
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    for col in data_df.columns:
        col_type = data_df[col].dtype

        if col_type == object:
            data_df[col] = data_df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes are downcast. min()/max() are
        # computed inside the numeric branches only, because they raise on
        # e.g. unordered categorical columns.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = data_df[col].min()
            c_max = data_df[col].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    data_df[col] = data_df[col].astype(candidate)
                    break
            # No break: nothing fits (e.g. uint64 values above the int64
            # maximum, or an empty column), so the column keeps its dtype.
        elif col_type.kind == 'f':
            c_min = data_df[col].min()
            c_max = data_df[col].max()
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    data_df[col] = data_df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf.
                data_df[col] = data_df[col].astype(np.float64)

    if sparse:
        data_df = csr_matrix(data_df)
        # A CSR matrix stores three arrays; their sizes add up to its footprint.
        end_mem = (data_df.data.nbytes + data_df.indptr.nbytes + data_df.indices.nbytes)/megabyte
    else:
        end_mem = data_df.memory_usage().sum()/megabyte

    percent_change = 100*(start_mem-end_mem)/start_mem
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {percent_change:.1f}%')

    return data_df

In [None]:
# Default training set, used when no usable path is given on the command line.
input_path = "./data/training_data.csv"
# Honour a path passed as the first command-line argument. An option-style
# argument (leading "-") is not a path and is ignored, as is a missing
# argument, which previously raised IndexError.
# NOTE(review): inside a Jupyter kernel sys.argv usually holds the kernel
# launcher's own flags rather than user arguments - confirm how this notebook
# is meant to be invoked.
if len(sys.argv) > 1 and not sys.argv[1].startswith("-"):
    input_path = sys.argv[1]
train_df = pd.read_csv(input_path)

In [9]:
# Hold out a validation set (train_test_split's default test size is used).
# A fixed random_state makes the split, and therefore every downstream
# number, reproducible across kernel restarts.
# Note: this cell rebinds train_df, so running it twice splits the
# already-reduced training frame again.
train_df, val_df = train_test_split(train_df, random_state=42)
print(train_df.shape)
print(val_df.shape)

(108174, 2845)
(36059, 2845)


In [10]:
# Features are every column except the identifier and the label; the y_*
# frames keep both of those columns side by side.
x_train = train_df.drop(columns=["TransactionID", "isFraud"])
y_train = train_df[["TransactionID", "isFraud"]]
x_val = val_df.drop(columns=["TransactionID", "isFraud"])
y_val = val_df[["TransactionID", "isFraud"]]

In [11]:
# Downcast the training features and pack them into a SciPy CSR matrix.
x_train = reduce_mem_usage(data_df=x_train, sparse=True)

Memory usage of dataframe is 583.69 MB
Memory usage after optimization is: 342.91 MB
Decreased by 41.3%


In [12]:
# Downcast the validation features and pack them into a SciPy CSR matrix.
x_val = reduce_mem_usage(data_df=x_val, sparse=True)

Memory usage of dataframe is 194.57 MB
Memory usage after optimization is: 114.28 MB
Decreased by 41.3%


In [13]:
# Binary LightGBM classifier using GOSS boosting with 500 estimators.
model = lgb.LGBMClassifier(
    boosting_type="goss",
    n_estimators=500,
    objective="binary",
    silent=False,
)
# Fit on the training features against the isFraud column only; the
# TransactionID column of y_train is not passed to the model.
trained_model = model.fit(X=x_train, y=y_train["isFraud"].values)

In [None]:
# Name of the output file, taken from the second command-line argument.
# NOTE(review): inside a Jupyter kernel sys.argv usually holds the kernel
# launcher's own arguments rather than user arguments - confirm how this
# notebook is meant to be invoked.
model_file_name = sys.argv[2]
# pickle writes bytes, so the target must be opened for binary writing. The
# default mode ("r", text) neither creates the file nor allows writing to it.
with open(f"./data/{model_file_name}", "wb") as file:
    pickle.dump(trained_model, file)