# Exploratory Data Analysis (EDA)

This notebook explores the Credit Card Fraud Detection dataset.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the credit card CSV into a DataFrame and preview its first rows.
data_path = '../data/creditcard.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Class balance: bar chart of the raw counts per class, followed by each
# class's count divided by the total number of rows.
class_counts = df['Class'].value_counts()
ax = class_counts.plot.bar(title='Class Distribution')
ax.set_xlabel('Class (0=Non-Fraud, 1=Fraud)')
ax.set_ylabel('Count')
plt.show()
n_rows = len(df)
print(class_counts / n_rows)

In [None]:
# Rank every numeric column by the absolute value of its correlation with the
# target column 'Class' (DataFrame.corr defaults to Pearson correlation).
corrs = df.corr(numeric_only=True)['Class'].abs().sort_values(ascending=False)
# Remove the target by label rather than by position: 'Class' correlates 1.0
# with itself, but a feature tied at 1.0 could sort ahead of it, in which case
# a positional slice would hide that feature and show 'Class' instead.
corrs.drop('Class').head(5)

In [None]:
# Baseline model: logistic regression on all non-target columns, evaluated
# with a precision-recall curve on a held-out 30% split.
X = df.drop('Class', axis=1)
y = df['Class']
# Stratify on y so both splits keep the same class proportions; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# Standardize the features before the regularized fit so the penalty treats
# all columns on a common scale. Keeping the scaler inside the pipeline means
# its statistics are learned from X_train only and re-applied to X_test.
# NOTE: clf is a Pipeline; the fitted LogisticRegression itself is clf[-1].
clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=200, solver='liblinear'),
)
clf.fit(X_train, y_train)
# Column 1 of predict_proba is the second entry of clf.classes_, i.e. the
# positive class when the labels are 0/1.
y_score = clf.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_score)
# average_precision_score is a step-wise summary of the PR curve, not the
# trapezoidal area under the plotted line, so the legend reports it as AP.
pr_auc = average_precision_score(y_test, y_score)
plt.plot(recall, precision, label=f'PR curve (AP={pr_auc:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Logistic Regression)')
plt.legend()
plt.show()

In [None]:
# Summary statistics for the DataFrame, transposed so that each column of df
# becomes one row of the output.
df.describe().transpose()