In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

In [7]:
def load_and_explore_data(csv_file_path, is_train=True):
    """Loads and explores the data from a csv file."""
    df = pd.read_csv(csv_file_path)
    print("X----------------------------------------------------------------------X")
    print("Shape:\n", df.shape)
    print("Info:\n", df.info())
    print("Describe:\n", df.describe())
    print("Missing Values:\n", df.isnull().sum())
    if is_train:
      print("Bad Flag Distribution:\n", df['bad_flag'].value_counts(normalize=True))
    print("X----------------------------------------------------------------------X")
    return df

In [8]:
train_csv_path = 'dataset/problem2/train_data.csv'
test_csv_path = 'dataset/problem2/test_data.csv'

In [9]:
train_df = load_and_explore_data(train_csv_path)
test_df = load_and_explore_data(test_csv_path, is_train=False)

test_df = test_df.drop(columns=['bad_flag'], errors='ignore')

X----------------------------------------------------------------------X
Shape:
 (77444, 1216)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77444 entries, 0 to 77443
Columns: 1216 entries, account_number to onus_attribute_48
dtypes: float64(1189), int64(27)
memory usage: 718.5 MB
Info:
 None
Describe:
        account_number      bad_flag  onus_attribute_1  \
count    77444.000000  77444.000000      5.728900e+04   
mean     48421.346173      0.014230      1.542033e+05   
std      27972.936684      0.118437      1.720019e+05   
min          1.000000      0.000000      2.500000e+04   
25%      24178.750000      0.000000      5.900000e+04   
50%      48486.000000      0.000000      1.000000e+05   
75%      72664.250000      0.000000      1.810000e+05   
max      96805.000000      1.000000      2.800000e+06   

       transaction_attribute_1  transaction_attribute_2  \
count             57289.000000             57289.000000   
mean                  4.229119                 0.001850   


In [10]:
def preprocess_data(df, is_train=True):
    """Preprocesses the data, handles missing values, and scales numerical features."""
    numerical_features = df.select_dtypes(include=np.number).columns.tolist()
    if 'account_number' in numerical_features:
        numerical_features.remove('account_number')
    if is_train:
        numerical_features.remove('bad_flag')

    for col in numerical_features:
        if df[col].isnull().sum() > 0:
            df[col].fillna(df[col].median(), inplace=True)

    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    return df

In [11]:
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df, is_train=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [12]:
X = train_df.drop(['account_number', 'bad_flag'], axis=1)
y = train_df['bad_flag']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
print("Total NaNs before aggressive handling:", X_train.isnull().sum().sum())
print("Total NaNs before aggressive handling:", X_val.isnull().sum().sum())

X_train = X_train.fillna(0) # Fill remaining NaNs with 0
X_val = X_val.fillna(0)

print("Total NaNs after aggressive handling X_train:", X_train.isnull().sum().sum())
print("Total NaNs after aggressive handling X_val:", X_val.isnull().sum().sum())

Total NaNs before aggressive handling: 123910
Total NaNs before aggressive handling: 30978
Total NaNs after aggressive handling X_train: 0
Total NaNs after aggressive handling X_val: 0


In [14]:
print("Total NaNs before aggressive handling test_df:", test_df.drop('account_number', axis=1).isnull().sum().sum())
test_df_no_account = test_df.drop('account_number', axis=1).fillna(0)
print("Total NaNs after aggressive handling test_df:", test_df_no_account.isnull().sum().sum())

Total NaNs before aggressive handling test_df: 38724
Total NaNs after aggressive handling test_df: 0


In [15]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

[WinError 2] The system cannot find the file specified
  File "C:\Users\Lenovo\PyCharmMiscProject\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\Lenovo\pyver\py313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\pyver\py313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^

In [16]:
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train, y_train)
logreg_probs = logreg.predict_proba(X_val)[:, 1]
print("Logistic Regression AUC:", roc_auc_score(y_val, logreg_probs))

Logistic Regression AUC: 0.7572889812394691


In [17]:
# 4. Prediction on Validation Data
test_probs = logreg.predict_proba(test_df_no_account)[:, 1]
submission_df = pd.DataFrame({'account_number': test_df['account_number'], 'predicted_probability': test_probs})
submission_df.to_csv('submission.csv', index=False)