<a href="https://colab.research.google.com/github/invarrow/dk-tech/blob/main/creditcard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!kaggle datasets download -d kartik2112/fraud-detection

Dataset URL: https://www.kaggle.com/datasets/kartik2112/fraud-detection
License(s): CC0-1.0
fraud-detection.zip: Skipping, found more recently modified local copy (use --force to force download)


In [9]:

!pip install pandas scikit-learn matplotlib seaborn




In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the data
train_df = pd.read_csv('fraudTrain.csv')
test_df = pd.read_csv('fraudTest.csv')

# Define columns to drop
columns_to_drop = ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant',
                   'first', 'last', 'street', 'city', 'state', 'zip', 'lat',
                   'long', 'job', 'dob', 'trans_num']

# Drop unnecessary columns
train_df.drop(columns_to_drop, axis=1, inplace=True)
test_df.drop(columns_to_drop, axis=1, inplace=True)

# Prepare data for training
X_train = train_df.drop('is_fraud', axis=1)
y_train = train_df['is_fraud']
X_test = test_df.drop('is_fraud', axis=1)
y_test = test_df['is_fraud']

# Define numerical and categorical columns
numerical_features = X_train.select_dtypes(include=['float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Define preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Append classifier to preprocessing pipeline
lr_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', LogisticRegression())])

# Train the model
lr_model.fit(X_train, y_train)

# Predict on test data
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report")
print(classification_report(y_test, y_pred_lr))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



In [11]:
print(train_df.head())
print(test_df.head())

        category     amt gender  city_pop   unix_time  merch_lat  merch_long  \
0       misc_net    4.97      F      3495  1325376018  36.011293  -82.048315   
1    grocery_pos  107.23      F       149  1325376044  49.159047 -118.186462   
2  entertainment  220.11      M      4154  1325376051  43.150704 -112.154481   
3  gas_transport   45.00      M      1939  1325376076  47.034331 -112.561071   
4       misc_pos   41.96      M        99  1325376186  38.674999  -78.632459   

   is_fraud  
0         0  
1         0  
2         0  
3         0  
4         0  
         category    amt gender  city_pop   unix_time  merch_lat  merch_long  \
0   personal_care   2.86      M    333497  1371816865  33.986391  -81.200714   
1   personal_care  29.84      F       302  1371816873  39.450498 -109.960431   
2  health_fitness  41.28      F     34496  1371816893  40.495810  -74.196111   
3        misc_pos  60.05      M     54767  1371816915  28.812398  -80.883061   
4          travel   3.19      M    