In [28]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("../data/landing/train.csv")
test_data = pd.read_csv("../data/landing/test.csv")

In [29]:
train_data.shape

(10966, 24)

In [30]:
train_data['Age'] = train_data['Age'].apply(lambda x: x * -1 if x < 0 else (x / 1000 if x > 10000 else x))


In [31]:
typical_locations = ['Adelaide', 'Canberra', 'Melbourne', 'Sydney', 'Hobart', 'Darwin', 'Brisbane','Perth']

# Create a new column TypicalLocationFlag
train_data['TypicalLocationFlag'] = train_data['TransactionLocation'].isin(typical_locations).astype(int)

In [32]:
location_mapping = {
    'Adelaide City': 'Adelaide', 'Adl': 'Adelaide', 'adl': 'Adelaide',
    'BNE': 'Brisbane', 'Bne': 'Brisbane', 'brisbane': 'Brisbane',
    'CBR': 'Canberra', 'Cbr': 'Canberra', 'canberra': 'Canberra',
    'DRW': 'Darwin', 'Drw': 'Darwin', 'darwin': 'Darwin',
    'HBT': 'Hobart', 'Hbt': 'Hobart', 'hobart': 'Hobart',
    'MLB': 'Melbourne', 'Mel': 'Melbourne', 'Melb': 'Melbourne', 'Melburn': 'Melbourne',
    'melbourne': 'Melbourne',
    'PTH': 'Perth', 'Pth': 'Perth', 'perth': 'Perth',
    'SYD': 'Sydney', 'Syd': 'Sydney', 'sydney': 'Sydney'
}

train_data['TransactionLocation'] = train_data['TransactionLocation'].replace(location_mapping)

In [33]:
import re
import pandas as pd

def changing_currency(df, column):
    currency_symbols = {"AUD":["AU$", "AUD"], "GBP": ["GBP", "£"], "AED": ["AED"]}
    reverse_mapping = {symbol: currency for currency, symbols in currency_symbols.items() for symbol in symbols}

    numbers = []
    currencies = []

    # Process each currency string
    for item in df[column]:
        # Find all numbers and currencies using regular expressions
        match = re.search(r'([\d,.]+)\s*(\D*)|(\D*)\s*([\d,.]+)', item)

        if match:
            number = match.group(1) or match.group(4)
            currency = match.group(2) or match.group(3)
            numbers.append(float(number))
            currencies.append(str(currency))

    # Create a DataFrame with two rows
    new_df = pd.DataFrame({
        f'{column}_Amount': numbers,
        f'{column}_Currency': currencies,
    })

    new_df[f'{column}_Currency'] = new_df[f'{column}_Currency'].str.replace(' ', '', regex=False)
    new_df[f'{column}_Currency'] = new_df[f'{column}_Currency'].map(reverse_mapping)

    new_df.loc[new_df[f'{column}_Currency'] == 'GBP', f'{column}_Amount'] *= 1.96
    new_df.loc[new_df[f'{column}_Currency'] == 'AED', f'{column}_Amount'] *= 0.41
    new_df = new_df.drop(columns=[f'{column}_Currency'])
    return new_df

# Income
income_changed_df = changing_currency(train_data, "Income")

# Expenditure 
expenditure_changed_df = changing_currency(train_data, "Expenditure")

# Gifts Transaction
gift_transaction_changed_df = changing_currency(train_data, "GiftsTransaction")

# Transaction Amount
transaction_amount_changed_df = changing_currency(train_data, "TransactionAmount")

train_data = pd.concat([train_data, income_changed_df], axis=1)
train_data = pd.concat([train_data, expenditure_changed_df], axis=1)
train_data = pd.concat([train_data, gift_transaction_changed_df], axis=1)
train_data = pd.concat([train_data, transaction_amount_changed_df], axis=1)
train_data = train_data.drop(columns=["GiftsTransaction", "Expenditure", "Income", "TransactionAmount"])

train_data.to_csv('train_data_raw.csv', index=False)



In [34]:
train_data.shape

(10966, 25)

In [35]:
from datetime import datetime

def standardize_time(time_str):
    try:
        # Try parsing in 'hh/mm/ss' format (24-hour)
        time_obj = datetime.strptime(time_str, '%H/%M/%S')
        return time_obj.strftime('%H:%M:%S')
    except ValueError:
        try:
            # Try parsing in 'hh:mm:ss AM/PM' format (12-hour with AM/PM)
            time_obj = datetime.strptime(time_str, '%I:%M:%S %p')
            return time_obj.strftime('%H:%M:%S')
        except ValueError:
            try:
                # Try parsing in 'hh:mm:ss' format (24-hour)
                time_obj = datetime.strptime(time_str, '%H:%M:%S')
                return time_obj.strftime('%H:%M:%S')
            except ValueError:
                return None

def normalize_date_time(df):
    df['TransactionTime'] = df['TransactionTime'].apply(standardize_time)

    df['TransactionTime'] = pd.to_datetime(df['TransactionTime'], format='%H:%M:%S')
    df['Hour'] = df['TransactionTime'].dt.hour
    df['Minute'] = df['TransactionTime'].dt.minute

    df['TransactionDate'] = pd.to_datetime(df['TransactionDate'],format='%Y-%m-%d') 
    df['Year'] = df['TransactionDate'].dt.year
    df['Month'] = df['TransactionDate'].dt.month
    df['Day'] = df['TransactionDate'].dt.day

    df = df.drop(columns=["TransactionTime", "TransactionDate"])

    return df

train_data = normalize_date_time(train_data)
train_data


Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,MerchantID,TransactionType,...,TypicalLocationFlag,Income_Amount,Expenditure_Amount,GiftsTransaction_Amount,TransactionAmount_Amount,Hour,Minute,Year,Month,Day
0,8765,70,37.0,Female,Professional,Bachelor,Widowed,3,M006,Withdrawal,...,1,28884.43,14610.6100,2058.0196,258.1400,12,25,2023,3,12
1,9645,3386,34.0,Male,Student,High School,Married,4,M002,Withdrawal,...,1,54919.07,39169.4900,9740.6316,34.9400,18,27,2023,3,5
2,1145,2971,25.0,Male,Unemployed,Master,Married,2,M008,Purchase,...,1,74728.57,55873.7600,2253.7060,323.8200,17,16,2023,11,10
3,15308,2925,25.0,Male,Professional,High School,Married,3,M001,Purchase,...,1,55712.62,36756.1064,4335.7000,13.3209,0,34,2023,10,7
4,14967,2339,38.0,Male,Professional,High School,Single,4,M001,Withdrawal,...,0,53004.70,17876.4182,4763.4800,467.7075,18,40,2023,9,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10961,11284,3632,25.0,Male,Unemployed,High School,Single,3,M007,Purchase,...,1,64488.68,21813.5300,5379.2788,182.5100,23,2,2023,6,4
10962,11964,3925,18.0,Female,Professional,High School,Married,2,M003,Purchase,...,1,80403.31,63429.0800,374.7716,137.5000,2,34,2023,5,24
10963,5390,4811,22.0,Male,Unemployed,High School,Widowed,3,M002,Purchase,...,0,29048.42,18806.3100,100.0188,112.3700,9,42,2023,12,26
10964,860,1110,29.0,Female,Student,High School,Divorced,4,M004,Purchase,...,1,28654.66,9748.5300,2311.6436,68.0500,5,24,2023,7,16


In [36]:
train_data['Transaction_Income_Ratio'] = train_data['TransactionAmount_Amount'] / train_data['Income_Amount']

# Handle division by zero if needed (e.g., replace infinity with NaN)
train_data['Transaction_Income_Ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Transaction_Income_Ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [37]:
useless_features = ["Gender", "Occupation", "EducationLevel", "MaritalStatus", "NumDependents", "EmailDomain", "MerchantID"]

train_data = train_data.drop(columns=useless_features)

In [38]:
train_data.dtypes

TransactionNumber             int64
UserID                        int64
Age                         float64
TransactionType              object
TransactionLocation          object
DeviceType                   object
Latitude                    float64
Longitude                   float64
Terrorism                      bool
UserTenure                    int64
IsFraud                       int64
TypicalLocationFlag           int64
Income_Amount               float64
Expenditure_Amount          float64
GiftsTransaction_Amount     float64
TransactionAmount_Amount    float64
Hour                          int32
Minute                        int32
Year                          int32
Month                         int32
Day                           int32
Transaction_Income_Ratio    float64
dtype: object

In [39]:
numerical_data = train_data[["TransactionNumber", "UserID", "Age", "Latitude", "Longitude", "UserTenure", "TypicalLocationFlag", \
                            "Income_Amount", "Expenditure_Amount", "GiftsTransaction_Amount", "TransactionAmount_Amount", \
                                "Transaction_Income_Ratio", "Hour", "Minute", "Year", "Month", "Day"]]
obj_data = train_data[["TransactionType", "TransactionLocation", "DeviceType"]]
fraud = train_data[["IsFraud"]]
terrorism = train_data[["Terrorism"]].astype(int)

In [40]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features_numerical = scaler.fit_transform(numerical_data)
scaled_features_numerical = pd.DataFrame(scaled_features_numerical)

In [41]:
encoded_obj = pd.get_dummies(obj_data).astype(int)

In [90]:
from sklearn.model_selection import train_test_split

final_train_data = pd.concat([scaled_features_numerical, encoded_obj], axis=1)
final_train_data = pd.concat([final_train_data, terrorism], axis=1)

X = final_train_data
y = fraud

y_freq = fraud.mode().iloc[0]
OneR_pred = [int(y_freq[0])]*len(y)

X.columns = range(X.shape[1])
y.columns = range(y.shape[1])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_freq = y_train.mode().iloc[0]
OneR_pred = [int(y_freq[0])]*len(y_test)

In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)


  return fit_method(estimator, *args, **kwargs)


In [94]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 0-R
print("############################# One-R ############################")
accuracy = accuracy_score(y_test, OneR_pred)
print(f"Accuracy: {accuracy:.5f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, OneR_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, OneR_pred))

Accuracy: 0.99909

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1368
           1       1.00      1.00      1.00       826

    accuracy                           1.00      2194
   macro avg       1.00      1.00      1.00      2194
weighted avg       1.00      1.00      1.00      2194


Confusion Matrix:
[[1368    0]
 [   2  824]]
############################# One-R ############################
Accuracy: 0.62352

Classification Report:
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      1368
           1       0.00      0.00      0.00       826

    accuracy                           0.62      2194
   macro avg       0.31      0.50      0.38      2194
weighted avg       0.39      0.62      0.48      2194


Confusion Matrix:
[[1368    0]
 [ 826    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print("Cross-Validation Scores for Each Fold:", cv_scores)
print(f"Mean CV Accuracy: {cv_scores.mean():.2f}")

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Cross-Validation Scores for Each Fold: [1.         0.99863201 0.99817601 0.99817601 0.999544  ]
Mean CV Accuracy: 1.00


In [19]:
from sklearn.model_selection import cross_validate

scoring = {'f1': 'f1', 'precision': 'precision', 'recall': 'recall'}

# Perform cross-validation
scores = cross_validate(rf_model, X, y, cv=5, scoring=scoring, return_train_score=False)

# Print results
print(f"F1 Scores: {scores['test_f1']}")
print(f"Precision Scores: {scores['test_precision']}")
print(f"Recall Scores: {scores['test_recall']}")

# Print average scores
print(f"Average F1 Score: {scores['test_f1'].mean()}")
print(f"Average Precision: {scores['test_precision'].mean()}")
print(f"Average Recall: {scores['test_recall'].mean()}")

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


F1 Scores: [1.         0.99811912 0.99749373 0.99749059 0.99937383]
Precision Scores: [1.         1.         0.99874529 1.         1.        ]
Recall Scores: [1.         0.99624531 0.99624531 0.99499374 0.99874844]
Average F1 Score: 0.9984954544455835
Average Precision: 0.9997490589711419
Average Recall: 0.9972465581977472


# Test

In [20]:
test_data['Age'] = test_data['Age'].apply(lambda x: x * -1 if x < 0 else (x / 1000 if x > 10000 else x))
test_data['TypicalLocationFlag'] = test_data['TransactionLocation'].isin(typical_locations).astype(int)
test_data['TransactionLocation'] = test_data['TransactionLocation'].replace(location_mapping)

In [21]:
# Income
income_changed_df = changing_currency(test_data, "Income")

# Expenditure 
expenditure_changed_df = changing_currency(test_data, "Expenditure")

# Gifts Transaction
gift_transaction_changed_df = changing_currency(test_data, "GiftsTransaction")

# Transaction Amount
transaction_amount_changed_df = changing_currency(test_data, "TransactionAmount")

test_data = pd.concat([test_data, income_changed_df], axis=1)
test_data = pd.concat([test_data, expenditure_changed_df], axis=1)
test_data = pd.concat([test_data, gift_transaction_changed_df], axis=1)
test_data = pd.concat([test_data, transaction_amount_changed_df], axis=1)
test_data = test_data.drop(columns=["GiftsTransaction", "Expenditure", "Income", "TransactionAmount"])

In [22]:
test_data = normalize_date_time(test_data)

In [23]:
test_data['Transaction_Income_Ratio'] = test_data['TransactionAmount_Amount'] / train_data['Income_Amount']

# Handle division by zero if needed (e.g., replace infinity with NaN)
test_data['Transaction_Income_Ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Transaction_Income_Ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [24]:
useless_features = ["Gender", "Occupation", "EducationLevel", "MaritalStatus", "NumDependents", "EmailDomain", "MerchantID"]

test_data = test_data.drop(columns=useless_features)

In [25]:
numerical_test_data = test_data[["TransactionNumber", "UserID", "Age", "Latitude", "Longitude", "UserTenure", "TypicalLocationFlag", \
                            "Income_Amount", "Expenditure_Amount", "GiftsTransaction_Amount", "TransactionAmount_Amount", \
                                "Transaction_Income_Ratio", "Hour", "Minute", "Year", "Month", "Day"]]
obj_test_data = test_data[["TransactionType", "TransactionLocation", "DeviceType"]]
terrorism_test = test_data[["Terrorism"]].astype(int)

In [26]:
scaler = StandardScaler()
scaled_features_numerical_test = scaler.fit_transform(numerical_test_data)
scaled_features_numerical_test = pd.DataFrame(scaled_features_numerical_test)

encoded_test_obj = pd.get_dummies(obj_test_data).astype(int)

final_test_data = pd.concat([scaled_features_numerical_test, encoded_test_obj], axis=1)
final_test_data = pd.concat([final_test_data, terrorism_test], axis=1)

transactionID_test = test_data[["TransactionNumber"]]
final_test_data.columns = range(X.shape[1])

y_pred_test = rf_model.predict(final_test_data)


In [27]:
y_pred_test = pd.DataFrame(y_pred_test, columns=['IsFraud'])  
solution = pd.concat([transactionID_test, y_pred_test], axis=1)
solution.to_csv('solution.csv', index=False)