In [9]:
!pip install --upgrade openpyxl
!pip install --upgrade xlrd



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Step 1: Load Training and Testing Data from CSV Files
train_df = pd.read_csv('/content/fraudTrain.csv')
test_df = pd.read_csv('/content/fraudTest.csv')

# Step 2: Data Exploration and Cleaning
# Check for missing values
print(train_df.isnull().sum())
print(test_df.isnull().sum())

# Handle missing values (impute with the median for numerical features)
# Select numeric columns for imputation
numeric_cols_train = train_df.select_dtypes(include=np.number).columns
numeric_cols_test = test_df.select_dtypes(include=np.number).columns

train_df_numeric = train_df[numeric_cols_train]
test_df_numeric = test_df[numeric_cols_test]

# Replace infinite values with NaN
train_df_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
test_df_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)

# Apply imputation only to numeric columns
imputer = SimpleImputer(strategy='median')
train_df_imputed_numeric = pd.DataFrame(imputer.fit_transform(train_df_numeric), columns=train_df_numeric.columns)
test_df_imputed_numeric = pd.DataFrame(imputer.transform(test_df_numeric), columns=test_df_numeric.columns)

# Concatenate imputed numeric columns back with original DataFrame
train_df_imputed = pd.concat([train_df_imputed_numeric, train_df.drop(columns=numeric_cols_train, errors='ignore')], axis=1)
test_df_imputed = pd.concat([test_df_imputed_numeric, test_df.drop(columns=numeric_cols_test, errors='ignore')], axis=1)

# Step 3: Feature Engineering
# Split the data into features (X) and target (y)
X_train = train_df_imputed.drop('is_fraud', axis=1)  # Assuming 'is_fraud' is the target column
y_train = train_df_imputed['is_fraud']

X_test = test_df_imputed.drop('is_fraud', axis=1)
y_test = test_df_imputed['is_fraud']

# Drop non-numeric columns from X_train and X_test
X_train = X_train.select_dtypes(include=np.number)
X_test = X_test.select_dtypes(include=np.number)

# Step 4: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train Models

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

# Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)
y_pred_dt = dt_model.predict(X_test_scaled)  # Get predictions for Decision Tree
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nDecision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   1
merchant                 1
category                 1
amt                      1
first                    1
last                     1
gender                   1
street                   1
city                     1
state                    1
zip                      1
lat                      1
long                     1
city_pop                 1
job                      1
dob                      1
trans_num                1
unix_time                1
merch_lat                1
merch_long               1
is_fraud                 1
dtype: int64
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)


Random Forest Accuracy: 0.9960447636305398

Random Forest Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    553574
         1.0       0.39      0.05      0.08      2145

    accuracy                           1.00    555719
   macro avg       0.69      0.52      0.54    555719
weighted avg       0.99      1.00      0.99    555719

Logistic Regression Accuracy: 0.9955211176871764

Logistic Regression Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    553574
         1.0       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

Decision Tree Accuracy: 0.8983515049872327

Decision Tree Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.90