# Logistical Regression Model work


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load datasets
credit_card_data = pd.read_csv('credit_card_transactions.csv')
fraud_test_data = pd.read_csv('fraudTest.csv')

# Sample size from the dataset
credit_card_data_sample = credit_card_data.sample(10000, random_state=42)
fraud_test_data_sample = fraud_test_data.sample(10000, random_state=42)

# Identify common columns and merge datasets
common_columns = list(set(credit_card_data_sample.columns).intersection(set(fraud_test_data_sample.columns)))
full_data = pd.concat([credit_card_data_sample[common_columns], fraud_test_data_sample[common_columns]], axis=0)

# Handle missing values
numeric_columns = full_data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = full_data.select_dtypes(include=['object']).columns

full_data[numeric_columns] = full_data[numeric_columns].fillna(full_data[numeric_columns].mean())
for col in categorical_columns:
    full_data[col] = full_data[col].fillna(full_data[col].mode()[0])

# Encode categorical variables
full_data = pd.get_dummies(full_data, columns=categorical_columns, drop_first=True)

full_data.head()

Unnamed: 0.1,zip,merch_long,merch_lat,cc_num,long,city_pop,Unnamed: 0,amt,lat,is_fraud,...,trans_date_trans_time_2020-12-31 20:17:18,trans_date_trans_time_2020-12-31 21:15:10,trans_date_trans_time_2020-12-31 21:16:46,trans_date_trans_time_2020-12-31 21:50:38,trans_date_trans_time_2020-12-31 22:00:05,trans_date_trans_time_2020-12-31 22:14:05,trans_date_trans_time_2020-12-31 22:36:21,trans_date_trans_time_2020-12-31 23:33:31,trans_date_trans_time_2020-12-31 23:48:28,trans_date_trans_time_2020-12-31 23:48:53
1045211,15686,-78.865012,40.420453,577588686219,-79.4545,972,1045211,194.51,40.6153,0,...,False,False,False,False,False,False,False,False,False,False
547406,97476,-123.636337,42.75886,30376238035123,-124.4409,217,547406,52.32,42.825,0,...,False,False,False,False,False,False,False,False,False,False
110142,15449,-78.89819,40.475159,4658490815480264,-79.7853,184,110142,6.53,39.9636,0,...,False,False,False,False,False,False,False,False,False,False
1285953,14425,-76.542384,43.767506,3514897282719543,-77.3083,10717,1285953,7.33,42.958,0,...,False,False,False,False,False,False,False,False,False,False
271705,82221,-104.092324,41.040392,6011381817520024,-104.1974,635,271705,64.29,41.6423,0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Split features and target
X = full_data.drop('is_fraud', axis=1)  # Replace with the actual target column name
y = full_data['is_fraud']

# Split data into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train logistic regression model
logistic_model = LogisticRegression(random_state=42, max_iter=500)
logistic_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = logistic_model.predict(X_test)
y_pred_prob = logistic_model.predict_proba(X_test)[:, 1]

# Evaluation metrics
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)
print("\nROC AUC Score:", roc_auc)

Confusion Matrix:
[[5973    0]
 [  27    0]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5973
           1       0.00      0.00      0.00        27

    accuracy                           1.00      6000
   macro avg       0.50      0.50      0.50      6000
weighted avg       0.99      1.00      0.99      6000


ROC AUC Score: 0.6908681660062875


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from imblearn.over_sampling import SMOTE

# Resample data via SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [4]:
from sklearn.metrics import precision_recall_curve

# Get prediction probabilities
y_pred_prob = logistic_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_prob)

# Adjust threshold based on business requirements or analysis
custom_threshold = 0.3  # Example: lower threshold for more sensitivity
y_pred_custom = (y_pred_prob >= custom_threshold).astype(int)

print(classification_report(y_test, y_pred, zero_division=1))  # Avoids undefined warnings by setting zero_division=1


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5973
           1       1.00      0.00      0.00        27

    accuracy                           1.00      6000
   macro avg       1.00      0.50      0.50      6000
weighted avg       1.00      1.00      0.99      6000

