In [13]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

historical_df = pd.read_csv('./historical_transaction.csv')
current_df = pd.read_csv('./current_transaction.csv')
historical_df_without_y = historical_df.drop(columns='y')

historical_df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
0,-0.247824,6.425853,-6.485658,-8.241462,,0.03%,Tuesday,-9.808905,4.753455,2.537819,...,0.592543,14.967527,6.691044,-0.275572,2.518542,4.637431,-14.296510,1.898115,-1.225228,0
1,2.623274,-2.808263,5.321746,-2.437653,0.614041,0.02%,Wednesday,10.710053,1.608882,-2.027541,...,-1.712788,-3.638637,-2.256492,2.703101,1.760729,2.660614,1.182922,-3.203078,0.711018,0
2,-5.460551,5.997701,-4.614194,-6.035128,-1.986438,-0.0%,Tuesday,-3.364649,-2.493473,1.429913,...,2.206956,7.292558,10.095556,1.601155,1.710645,-12.519434,-5.261707,2.032666,-6.445528,0
3,1.566307,-0.080517,0.064754,6.063193,-2.253075,0.0%,Thurday,-6.111982,-1.796377,-3.169510,...,-3.841669,5.189840,10.291069,-3.642396,0.674300,-1.104317,-3.922342,-0.613947,7.275179,0
4,0.750715,15.656095,-6.778600,11.251037,-0.308200,0.0%,Wednesday,2.498892,-0.323229,-2.624680,...,8.933781,-7.295794,-11.246059,-4.876594,3.395415,-2.646844,-5.273267,-5.259536,-3.240523,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-2.664967,-3.427965,-5.143886,0.289251,-3.202695,0.01%,Tuesday,-13.356218,0.244953,-2.524016,...,2.123462,6.139848,-4.323045,-6.453976,6.345870,6.482477,-11.562510,4.216840,5.689256,0
7996,2.174302,-4.912113,-4.803975,-22.081911,,0.0%,Tuesday,-0.266593,-1.743051,5.870177,...,-12.307131,-4.762671,10.574798,-3.954958,0.998331,9.001036,1.171144,-1.001999,9.383899,0
7997,2.247079,-3.377818,-1.567924,-7.445187,-0.677291,-0.01%,Wednesday,6.618001,2.323299,3.724704,...,-8.084817,8.031024,7.282587,2.439789,1.666460,0.566402,3.591239,2.962485,0.980093,0
7998,-1.402385,-20.117766,3.705099,11.634801,0.680733,-0.0%,Tuesday,-4.304220,-1.730289,-12.788627,...,4.920424,-0.954106,6.261179,0.628781,-0.109908,2.086040,12.889705,7.359320,0.124861,0


In [14]:
historical_df_without_y.dtypes[:-1].value_counts()

float64    93
object      6
dtype: int64

In our dataset, there are six columns identified as categorical data types (`object`), which need to be converted into integer values before they can be effectively utilized in our model.

In [15]:
historical_df_without_y.select_dtypes(include='object')

Unnamed: 0,x5,x6,x20,x27,x49,x57
0,0.03%,Tuesday,Aug,Afternoon,False,$1578.34
1,0.02%,Wednesday,Jul,Afternoon,False,$767.76
2,-0.0%,Tuesday,Aug,Evening,False,$931.06
3,0.0%,Thurday,Aug,Afternoon,True,$2124.52
4,0.0%,Wednesday,Jul,Evening,False,$-3055.67
...,...,...,...,...,...,...
7995,0.01%,Tuesday,Jun,Afternoon,False,$-1664.04
7996,0.0%,Tuesday,May,Afternoon,False,$-42.81
7997,-0.01%,Wednesday,Aug,Afternoon,True,$220.59
7998,-0.0%,Tuesday,Aug,Evening,True,$467.47


- **Column `x5`**:
  - The values in this column represent percentages stored as strings with a trailing '%'. 
  - To use these values for numerical analysis, we first remove the '%' sign from each string, and then convert the resulting strings to floating-point numbers.

- **Column `x57`**:
  - This column contains monetary values represented as strings prefixed with a dollar sign ('$').
  - Similar to the transformation for column `x5`, we remove the '$' sign and convert the strings to floats.

In [16]:
historical_df_without_y['x5'] = historical_df_without_y['x5'].str.rstrip('%').astype('float') / 100.0
historical_df_without_y['x57'] = historical_df_without_y['x57'].astype(str).replace('[\$,]', '', regex=True).astype('float')

historical_df_without_y[['x5', 'x57']]

Unnamed: 0,x5,x57
0,0.0003,1578.34
1,0.0002,767.76
2,-0.0000,931.06
3,0.0000,2124.52
4,0.0000,-3055.67
...,...,...
7995,0.0001,-1664.04
7996,0.0000,-42.81
7997,-0.0001,220.59
7998,-0.0000,467.47


To prepare the dataset for model training, we undertake a series of preprocessing steps.

- **Numerical Columns**:
  - We attempt to select columns of type `int64`, assuming these represent numerical data.

- **Categorical Columns**:
  - We explicitly define a list of columns that are known to contain categorical data: `['x6', 'x20', 'x27', 'x49']`.

- **Numerical Pipeline**:
  - **Imputer**: Missing values in numerical columns are replaced with the median of each column.
  - **Scaler**: StandardScaler is applied to normalize the data, ensuring that each feature contributes equally to model training by bringing them to a common scale.

- **Categorical Pipeline**:
  - **Imputer**: For categorical columns, missing values are replaced with the most frequent value in each column.
  - **One-Hot Encoder**: Converts categorical variables into a form that could be provided to ML algorithms. This encoder creates binary columns for each category/level and handles unknown categories by ignoring them.

In [17]:
numerical_cols = historical_df_without_y.select_dtypes(include=['int64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

remaining_categorical_cols = ['x6', 'x20', 'x27', 'x49']
numerical_cols = historical_df_without_y.columns.difference(remaining_categorical_cols + ['y'])
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, remaining_categorical_cols)
        ])

preprocessor.fit(historical_df_without_y)
historical_df_without_y = preprocessor.transform(historical_df_without_y)
feature_names = list(numerical_cols) + \
                list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(remaining_categorical_cols))


historical_df_without_y = pd.DataFrame(historical_df_without_y, columns=feature_names)
historical_df_without_y.head()

Unnamed: 0,x0,x1,x10,x11,x12,x13,x14,x15,x16,x17,...,x20_May,x20_Nov,x20_Oct,x20_Sept,x27_Afternoon,x27_Evening,x27_Morning,x27_Night,x49_False,x49_True
0,-0.075227,0.431298,-0.102254,1.120197,0.260087,-1.199619,-2.450272,-1.659281,1.102589,2.012662,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.626326,-0.229215,0.293587,0.199684,0.331408,0.632379,-0.225701,-0.157557,-0.516379,-0.119575,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.348959,0.400672,-0.934314,-1.947026,-0.016606,0.222856,-1.347693,-0.992515,-1.378931,-2.035756,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.368056,-0.0341,-0.384268,0.119804,-0.373455,-1.815058,-0.902405,-0.288641,-0.613191,-1.453367,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.168766,1.091534,2.082734,-0.515513,2.155874,2.58013,-0.365617,0.84976,-1.017621,-0.146502,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [18]:
historical_df['y'].value_counts(normalize=True)

0    0.945125
1    0.054875
Name: y, dtype: float64

The dataset shows a significant imbalance in the distribution of the target variable `y`, where:

#### Strategy to Mitigate Imbalance
To address this imbalance and improve the model's performance on minority class predictions, we will implement the following strategies:

1. **Oversampling the Minority Class**:
   - We will use **SMOTE** (Synthetic Minority Over-sampling Technique) to artificially increase the number of samples in the minority class (`1`). SMOTE generates synthetic samples from the minority class by interpolating existing samples, thus providing a more balanced dataset.

2. **Undersampling the Majority Class**:
   - Alongside oversampling, we will also apply **RandomUnderSampler** to reduce the number of samples in the majority class (`0`). This method randomly discards samples from the majority class to prevent its overwhelming influence during model training.

3. **Experimenting with Different Ratios**:
   - We will experiment with different oversampling and undersampling ratios to find the optimal balance that maximizes the model's performance, especially in terms of precision and recall for the minority class.
   Ratios that will be tested: (10,90) , (20,80) , (30,70) , (40,60) , (50,50)

#### Models to be Tested

1. Support Vector Machine (SVM)

2. Logistic Regression

3. XGBoost

4. Random Forest

5. K-Nearest Neighbors (KNN):
   - We test multiple values of \( k \) (e.g., 1, 5, 10, 15, 20) to determine the optimal number of neighbors that balances sensitivity and specificity.

#### Evaluation Process

- **Data Splitting**: The dataset is divided into 70% training and 30% testing, ensuring models are tested against unseen data.
- **Model Fitting and Prediction**: Each model is trained on resampled training data and evaluated on the test set.
- **Performance Metrics**: Accuracy, precision, recall, and F1-score are calculated for each model.

In [20]:
models = [
    {'name': 'svc', 'model': SVC()},
    {'name': 'logistic regression', 'model': LogisticRegression(max_iter=1000)},
    {'name': 'xgboost', 'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss')},
    {'name': 'random forest', 'model': RandomForestClassifier()}
] + [{'name': f'knn, k={k}', 'model': KNeighborsClassifier(n_neighbors=k)} for k in range(1, 21, 4)]

ratios = [(i, 100 - i) for i in range(10, 60, 10)]  # Format (minority, majority)

x = historical_df_without_y
y = historical_df['y']

# Split data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

best_overall_model = None
max_overall_accuracy = 0

for ratio in ratios:
    strategy = {0: int(len(y_train) * ratio[1] / 100), 1: int(len(y_train) * ratio[0] / 100)}
    pipeline = ImbPipeline([
        ('oversample', SMOTE(sampling_strategy=strategy[1] / strategy[0])),
        ('undersample', RandomUnderSampler(sampling_strategy=strategy)),
    ])

    # Resample the training data
    X_resampled, y_resampled = pipeline.fit_resample(x_train, y_train)

    for model_info in models:
        model = model_info['model']
        model_name = model_info['name']
        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        if accuracy > max_overall_accuracy:
            max_overall_accuracy = accuracy
            best_overall_model = {
                'model': model,
                'model_name': model_name,
                'sampling_ratio': ratio,
                'accuracy': accuracy,
                'report': classification_report(y_test, y_pred)
            }

# Display the best overall model
if best_overall_model:
    print(f"Best Overall Model: {best_overall_model['model_name']}")
    print(f"Ratio: {best_overall_model['sampling_ratio']}")
    print(f"Accuracy: {best_overall_model['accuracy']}\n")
    print(best_overall_model['report'])

Best Overall Model: xgboost
Ratio: (10, 90)
Accuracy: 0.97

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2266
           1       0.84      0.57      0.68       134

    accuracy                           0.97      2400
   macro avg       0.91      0.78      0.83      2400
weighted avg       0.97      0.97      0.97      2400



Best Model has been evaluated to be xgboost with the (10,90) ratio.

To ensure consistency and accuracy in our predictions, we will apply the same preprocessing steps to the current transaction data as those used on the historical data.

In [21]:
best_ratio = best_overall_model['sampling_ratio']
strategy = {0: int(len(y_train) * best_ratio[1] / 100), 1: int(len(y_train) * best_ratio[0] / 100)}

pipeline = ImbPipeline([
    ('oversample', SMOTE(sampling_strategy=strategy[1] / strategy[0])),
    ('undersample', RandomUnderSampler(sampling_strategy=strategy)),
])

current_df['x5'] = current_df['x5'].str.rstrip('%').astype('float') / 100.0
current_df['x57'] = current_df['x57'].replace('[\$,]', '', regex=True).astype('float')

numerical_cols = current_df.select_dtypes(include=['int64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

remaining_categorical_cols = ['x6', 'x20', 'x27', 'x49']
numerical_cols = current_df.columns.difference(remaining_categorical_cols + ['y'])
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, remaining_categorical_cols)
        ])

preprocessor.fit(current_df)
transformed_current_df = preprocessor.transform(current_df)

feature_names = list(numerical_cols) + \
                list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(remaining_categorical_cols))
transformed_current_df = pd.DataFrame(transformed_current_df, columns=feature_names)
transformed_current_df = preprocessor.transform(current_df)

# Now proceed to predict using the trained model
model = best_overall_model['model'].fit(X_resampled, y_resampled)
y_pred = model.predict(transformed_current_df)

pd.DataFrame(y_pred).value_counts()

0    1896
1     104
dtype: int64

In [23]:
predictions_df = pd.DataFrame(y_pred, columns=['predicted y'])

predictions_df.to_csv('./current_transaction_predictions.csv', index=False)