In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
from imblearn.over_sampling import SMOTE

In [14]:
# Read the training parquet, then derive the model inputs: every column of the
# frame except the ones named in `remove_list` (which includes the label).
df = pd.read_parquet('train_test/imputed_train.parquet')

target_column = 'activation'
remove_list = ['activation', 'ind_recommended']
features = df.columns[~df.columns.isin(remove_list)].to_list()

In [23]:
# def get_init_params(params, verbose=1, cat_idxs=None, cat_dims=None):
#     return {
#         "n_d": params.get("n_d",8),
#         "n_a": params.get("n_a", 8),
#         "n_steps": params.get("n_steps", 3),
#         "n_shared": params.get("n_shared", 2),
#         "cat_emb_dim": params.get("cat_emb_dim", 1),
#         "optimizer_params": {"lr": params.get("lr", 2e-2)},
#         "mask_type": params.get("mask_type", "sparsemax"),
#         "lambda_sparse": params.get("lambda_sparse", 1e-3),
#         "optimizer_fn": torch.optim.Adam,
#         "cat_idxs": cat_idxs or [],
#         "cat_dims": cat_dims or [],
#         "verbose": verbose,
#     }

def get_init_params(verbose=1, cat_idxs=None, cat_dims=None):
    """Build the keyword arguments used to construct a ``TabNetClassifier``.

    Parameters
    ----------
    verbose : int, default 1
        Value stored under the ``"verbose"`` key.
    cat_idxs : list of int, optional
        Positions of the categorical columns in the input matrix.
        ``None`` (or an empty list) yields ``[]``.
    cat_dims : list of int, optional
        Number of distinct values of each categorical column, in the same
        order as ``cat_idxs``. ``None`` (or an empty list) yields ``[]``.

    Returns
    -------
    dict
        Fixed architecture/optimizer settings plus the three caller-supplied
        values.
    """
    return {
        "n_d": 8,
        "n_a": 8,
        "n_steps": 3,
        "n_shared": 2,
        "cat_emb_dim": 1,
        "optimizer_params": {"lr": 2e-2},
        "mask_type": "sparsemax",
        "lambda_sparse": 1e-3,
        "optimizer_fn": torch.optim.Adam,
        "cat_idxs": cat_idxs or [],
        "cat_dims": cat_dims or [],
        # Forward the caller's choice; this used to be hard-coded to 1, which
        # made the `verbose` argument a no-op.
        "verbose": verbose,
    }

In [4]:
import torch
# Print whether PyTorch reports a usable CUDA device in this kernel.
print(torch.cuda.is_available())

True


In [10]:
# Re-read the training parquet so `df` holds the frame exactly as stored on disk.
df = pd.read_parquet('train_test/imputed_train.parquet')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import joblib
from sklearn.pipeline import Pipeline


# Define your target and feature list
target_column = 'activation'
remove_list = ['activation', 'ind_recommended']

# Build the feature frame as a separate object instead of overwriting `df`.
# Overwriting made this cell fail on a second run and removed the target column
# that later cells still read via df[target_column].
target_df = df[target_column]
feature_df = df.drop(columns=remove_list)

features = feature_df.columns.to_list()

# Specify categorical and numerical features
categorical_features = ['merchant_profile_01']
numerical_features = [col for col in features if col not in categorical_features]

# Column order of the matrix produced by `preprocessor` below: the scaled
# numeric columns come first, followed by the passthrough remainder in its
# original order.
transformed_columns = numerical_features + [col for col in features if col not in numerical_features]

# Indices of the categorical features, expressed in the transformed column
# order (the order TabNet actually sees), not the raw `features` order.
cat_idxs = [transformed_columns.index(cat_feat) for cat_feat in categorical_features]

# Calculate the number of unique values for each categorical feature
cat_dims = [len(feature_df[col].unique()) for col in categorical_features]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(feature_df[features], target_df, test_size=0.2, random_state=42)

print('Preprocessing data...')
# Standardise the numerical features
num_transformer = Pipeline(
    steps = [
        ('scaler', StandardScaler())
    ]
)

# One-hot pipeline for the categorical features.
# NOTE: this pipeline is defined but not registered in `preprocessor`; the
# categorical column reaches the model through remainder='passthrough'.
cat_transformer = Pipeline(
    steps = [
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Scale the numeric columns and pass every other column through unchanged
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_transformer, numerical_features),
    ],  remainder='passthrough'
)

# Fit on the training split only, then apply the same transform to validation
X_train_scaled = preprocessor.fit_transform(X_train)
X_val_scaled = preprocessor.transform(X_val)
preprocessor_save_path = 'scaler_model/preprocessor1.joblib'
joblib.dump(preprocessor, preprocessor_save_path)

# Oversample the training split only; validation keeps its original class mix.
# NOTE(review): SMOTE interpolates every column, including the passthrough
# `merchant_profile_01` values - consider SMOTENC if that column holds category codes.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Initialize and configure the TabNet model
#init_params = get_init_params(cat_idxs=cat_idxs, cat_dims=cat_dims)
#clf = TabNetClassifier(**init_params)
clf = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=2e-2),
                       scheduler_params={"step_size":50, "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax')

print("Start training...")

# Train the model
clf.fit(
    X_train=X_train_smote, y_train=y_train_smote,
    eval_set=[(X_val_scaled, y_val.values)],
    eval_name=['val'],
    eval_metric=['accuracy'],
    max_epochs=10,  # Adjust as needed
    patience=10,  # For early stopping
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# save_model() appends ".zip" to the path it is given, so pass the stem and keep
# the returned path; passing 'clf_model.zip' produced 'clf_model.zip.zip'.
model_save_path = clf.save_model('ml_models/clf_model')

Preprocessing data...




Start training...
epoch 0  | loss: 0.29513 | val_accuracy: 0.56579 |  0:21:06s


In [2]:
pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [3]:
import tensorflow as tf

# tf.test.is_gpu_available() is deprecated; the physical-device list is the
# supported way to check for GPUs. Print the list, then whether it is non-empty.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)
print(bool(gpu_devices))

[]
False


2024-03-18 11:46:14.007015: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import joblib
# Assuming df, target_column, and features are already defined as per your code snippet

# Split data into features and target
X = df[features].values
y = df[target_column].values

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define your categorical and numerical features again
categorical_features = ['merchant_profile_01']
numerical_features = [col for col in features if col not in categorical_features]

# Create DataFrames for ease of use with ColumnTransformer.
# X was taken from df[features], so its columns are in `features` order. Labelling
# them as numerical_features + categorical_features attached the wrong names
# whenever the categorical column is not the last one, so a different column
# would be one-hot encoded.
X_train_df = pd.DataFrame(X_train, columns=features)
X_test_df = pd.DataFrame(X_test, columns=features)

# Define the preprocessor with one-hot encoding for categorical features.
# Columns are selected by name, so the output order is: scaled numeric columns,
# then the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Fit on the training split only, then apply the same transform to the test split
X_train_transformed = preprocessor.fit_transform(X_train_df)
X_test_transformed = preprocessor.transform(X_test_df)

# Both splits must keep their row counts and share one feature width
assert X_train_transformed.shape[0] == len(y_train)
assert X_test_transformed.shape[0] == len(y_test)
assert X_train_transformed.shape[1] == X_test_transformed.shape[1]

# Save the preprocessor for later use
preprocessor_save_path = 'scaler_model/preprocessor_with_encoding.joblib'
joblib.dump(preprocessor, preprocessor_save_path)

# Oversample the training split only; the test split keeps its original class mix
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_transformed, y_train)

In [19]:
# TabNet with Adam and a StepLR schedule (learning rate x0.9 every 50 steps)
clf = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=2e-2),
                       scheduler_params={"step_size":50, "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax')

# Fit the model on the oversampled training matrix.
# NOTE(review): the recorded run of this cell ended in an out-of-memory
# RuntimeError; the column count of X_train_smote (one-hot width) is the first
# thing to check.
# NOTE(review): no eval_set is passed here - confirm `patience` has any effect.
max_epochs = 30
clf.fit(
    X_train=X_train_smote,
    y_train=y_train_smote,
    max_epochs=max_epochs,
    patience=15,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# save_model() appends ".zip" to the path it is given, so pass the stem and keep
# the returned path; passing 'clf_model.zip' produced 'clf_model.zip.zip'.
model_save_path = clf.save_model('ml_models/clf_model')

# Positive-class probability on the held-out split, thresholded at 0.5
preds_proba = clf.predict_proba(X_test_transformed)[:, 1]
preds = (preds_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, preds)
print(f"Accuracy: {accuracy}")

RuntimeError: [enforce fail at alloc_cpu.cpp:117] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 44152038090000 bytes. Error code 12 (Cannot allocate memory)

In [2]:
# Feature frame to score, read from the test parquet
test_df = pd.read_parquet('train_test/imputed_test.parquet')

# Evaluation file as provided; only its two identifier columns are kept for the
# submission.
original_test_df = pd.read_csv('evaluation.csv')
customer_merchant_info = original_test_df.loc[:, ['customer', 'merchant']]


In [None]:
import joblib

# Load the fitted preprocessor saved by the one-hot preprocessing cell and apply
# it to the test features (same column list as training).
preprocessor = joblib.load('scaler_model/preprocessor_with_encoding.joblib')
X_new_transformed = preprocessor.transform(test_df[features])

# `xgb` and `bst` are never imported or defined in this notebook, so the old
# XGBoost scoring lines raised NameError. Score with the TabNet classifier `clf`
# trained earlier in this notebook, taking the positive-class column exactly as
# the evaluation cell does.
pred_probs = clf.predict_proba(X_new_transformed)[:, 1]

In [None]:
# One score per scored row
predictions = pd.DataFrame({
    'predicted_score': pred_probs
})

# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so a
# row-count mismatch would silently corrupt the submission. Fail loudly instead.
if len(predictions) != len(customer_merchant_info):
    raise ValueError(
        f'Row count mismatch: {len(predictions)} predictions vs '
        f'{len(customer_merchant_info)} customer/merchant rows'
    )

# Concatenate the Customer and Merchant columns with the predictions.
# Both frames are on a fresh 0..n-1 index, so rows pair up positionally.
final_output = pd.concat([customer_merchant_info.reset_index(drop=True), predictions], axis=1)

In [None]:
import os
import re
import pandas as pd

def save_new_submission(df, base_path='submission/', base_filename='final_submission'):
    """
    Saves the DataFrame to a new CSV file, incrementing the file name index to avoid overwrites.

    Files are named ``{base_filename}_{index}.csv``. The new index is one more
    than the highest index already present in ``base_path``, or 1 when no
    matching file exists.

    Parameters:
    - df: DataFrame to save (written with ``to_csv(..., index=False)``).
    - base_path: The directory where the file should be saved; created if missing.
    - base_filename: The base name for the file, without index and extension.
      It is matched literally, so characters such as ``.`` or ``(`` are safe.
    """
    # Create the directory if needed; exist_ok avoids the check-then-create race
    os.makedirs(base_path, exist_ok=True)

    # Match exactly "<base_filename>_<digits>.csv". The base name is escaped so
    # regex metacharacters in it cannot match unrelated files or break the pattern.
    pattern = re.compile(rf'{re.escape(base_filename)}_([0-9]+)\.csv')

    # Evaluate the pattern once per directory entry and keep the numeric suffixes
    matches = (pattern.fullmatch(name) for name in os.listdir(base_path))
    indices = [int(found.group(1)) for found in matches if found]

    # Determine the next index (start with 1 if no files are found)
    next_index = max(indices, default=0) + 1

    # Full path for the new file
    full_path = os.path.join(base_path, f'{base_filename}_{next_index}.csv')

    # Save the DataFrame to the new file
    df.to_csv(full_path, index=False)
    print(f'File saved as: {full_path}')


In [None]:
# Write the assembled submission frame to the next free final_submission_<n>.csv
# under the default 'submission/' directory.
save_new_submission(final_output)