In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Preprocessing cell: turn the transaction table into an all-numeric feature
# matrix (plus the `is_fraud` target) and write it to disk for the modelling cell.
file_path = 'preprocessed_dataset.csv'
data = pd.read_csv(file_path)

# Columns this cell expects to find in the input file.
columns = ['line_number', 'merchant', 'category', 'amt', 'gender', 'city', 'state','city_pop', 'job', 'is_fraud', 'unix_time', 'age']

# Fail early with a readable message instead of a bare KeyError halfway through.
missing_columns = [col for col in columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"{file_path} is missing expected columns: {missing_columns}")

categorical_columns_onehot = ['gender', 'category']
categorical_columns_label = ['merchant', 'city','state', 'job']
# NOTE(review): `line_number` looks like a row identifier rather than a real
# feature -- confirm it should be fed to the model.
numerical_columns = ['line_number', 'amt', 'city_pop','unix_time', 'age']

scaler = StandardScaler()
encoder = OneHotEncoder()

# Keep one fitted encoder per column. Refitting a single shared LabelEncoder
# discards the mapping of every column except the last, which makes it
# impossible to decode the integers or encode new data consistently.
# `label` still ends up bound to the encoder fitted on the last column.
label_encoders = {}
for col in categorical_columns_label:
    label = LabelEncoder()
    data[col] = label.fit_transform(data[col])
    label_encoders[col] = label

# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/test split done in the modelling cell. Fit it on the training split
# only if a scale-sensitive model is ever used.
scaled_data = scaler.fit_transform(data[numerical_columns])
# Reuse data's index so the concat below aligns row-for-row even if `data`
# does not carry a default RangeIndex.
scaled_df = pd.DataFrame(scaled_data, columns=numerical_columns, index=data.index)

encoded_data_onehot = encoder.fit_transform(data[categorical_columns_onehot]).toarray()
encoded_df_onehot = pd.DataFrame(
    encoded_data_onehot,
    columns=encoder.get_feature_names_out(categorical_columns_onehot),
    index=data.index,
)

combined_df = pd.concat([scaled_df, encoded_df_onehot, data[categorical_columns_label], data['is_fraud']], axis=1)

# Index-aligned concat must not add or drop rows.
assert len(combined_df) == len(data), "row count changed while combining features"

print(combined_df.head())
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() would emit a stray "None" line.
combined_df.info()

combined_df.to_csv('preprocessed_classification_data.csv', index=False)

In [None]:
# Modelling cell: train a random forest on the preprocessed features and
# compare its performance on the training and held-out test splits.
data = pd.read_csv('preprocessed_classification_data.csv')

X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

# Stratify on the target so both splits keep the same class ratio; with an
# unstratified split the share of positive cases in the test set is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Accuracy alone can look excellent when one class dominates (presumably the
# case for `is_fraud` -- confirm the class balance). Per-class precision and
# recall show how well each class is actually detected.
print(classification_report(y_test, y_pred_test, zero_division=0))

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(['Train', 'Test'], [train_accuracy, test_accuracy], marker='o', linestyle='--', color='b')
ax.set_title('Model Accuracy')
ax.set_xlabel('Dataset')
ax.set_ylabel('Accuracy')
ax.grid(True)
plt.show()