In [None]:
!pip install psycopg2-binary pandas

In [None]:
!pip install SQLAlchemy

In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine
import pandas as pd

# Connection settings come from the environment so that no credential is
# stored in the notebook. Everything except the password falls back to the
# local development defaults.
db_user = os.environ.get('FRAUD_DB_USER', 'postgres')
db_host = os.environ.get('FRAUD_DB_HOST', 'localhost')
db_port = os.environ.get('FRAUD_DB_PORT', '5432')
db_name = os.environ.get('FRAUD_DB_NAME', 'fraud_detection_db')

# The password is never hard-coded: take it from FRAUD_DB_PASSWORD, or prompt
# for it (input is not echoed) when the variable is unset.
db_password = os.environ.get('FRAUD_DB_PASSWORD') or getpass('PostgreSQL password: ')

# quote_plus escapes characters such as '@' that would otherwise be parsed as
# URL delimiters inside the connection string.
engine = create_engine(
    f'postgresql+psycopg2://{db_user}:{quote_plus(db_password)}'
    f'@{db_host}:{db_port}/{db_name}'
)

# Pull the whole transactions table into memory for the analysis cells.
query = "SELECT * FROM transactions;"
df = pd.read_sql(query, engine)
df.head()

In [None]:
!pip install matplotlib
!pip install seaborn

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# --- Structure, missing values and summary statistics ---
print("Dataset Information:\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing Values in Each Column:\n")
print(df.isnull().sum())
print("\nDescriptive Statistics:\n")
print(df.describe())

# --- Class balance: one bar per value of the `class` label ---
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='class', data=df, palette='Set2', ax=ax)
ax.set_title('Distribution of Fraudulent (1) vs Non-Fraudulent (0) Transactions')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

# Share of rows labelled 1, expressed as a percentage of all rows.
fraud_percentage = (df['class'].sum() / len(df)) * 100
print(f"\nPercentage of Fraudulent Transactions: {fraud_percentage:.4f}%")

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score, PrecisionRecallDisplay

# Target is the `class` label; features are every column except `time` and
# the label itself.
y = df['class']
X = df.drop(['time', 'class'], axis=1)

# Hold out 20% of the rows, stratified on the label so both splits keep the
# same class proportions. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 100-tree random forest; fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Keep only the second probability column (the positive class when the labels
# are 0/1) as the score for the precision-recall analysis.
y_prob = rf_model.predict_proba(X_test)[:, 1]


In [None]:
# precision_recall_curve returns (precision, recall, thresholds); only the
# first two are needed for plotting.
precision, recall = precision_recall_curve(y_test, y_prob)[:2]

# Average precision summarises the precision-recall curve as a single number.
auprc_score = average_precision_score(y_test, y_prob)

print(f"AUPRC (Area Under the Precision-Recall Curve): {auprc_score:.4f}")

In [None]:
# Draw on an explicitly created axes. PrecisionRecallDisplay.plot() opens a
# new figure of its own when no `ax` is passed, which leaves a separately
# created plt.figure(...) empty and ignores the requested figure size.
fig, ax = plt.subplots(figsize=(8, 6))
pr_display = PrecisionRecallDisplay(precision=precision, recall=recall)
pr_display.plot(ax=ax)

# Title carries the AUPRC so the figure stands on its own.
ax.set_title(f'Precision-Recall Curve (AUPRC = {auprc_score:.4f})')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
plt.show()

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, PrecisionRecallDisplay

# class_weight='balanced' reweights samples inversely to class frequency;
# max_iter is raised to 1000 iterations.
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Hard labels feed the confusion matrix and report; the second probability
# column is the score for the ranking metrics.
y_pred_log = log_reg.predict(X_test)
y_prob_log = log_reg.predict_proba(X_test)[:, 1]

# Threshold-based metrics.
cm_log = confusion_matrix(y_test, y_pred_log)
report_log = classification_report(y_test, y_pred_log)
# Threshold-free metrics computed from the scores.
roc_auc_log = roc_auc_score(y_test, y_prob_log)
auprc_log = average_precision_score(y_test, y_prob_log)

print("Logistic Regression Evaluation:\n")
print("Confusion Matrix:\n", cm_log)
print("\nClassification Report:\n", report_log)
print("\nROC-AUC Score:", roc_auc_log)
print("AUPRC (Area Under Precision-Recall Curve):", auprc_log)

# from_predictions builds and draws the precision-recall curve in one call.
pr_display_log = PrecisionRecallDisplay.from_predictions(y_test, y_prob_log)
pr_display_log.ax_.set_title('Logistic Regression Precision-Recall Curve')

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM with balanced class weights. probability=True is required
# for predict_proba below.
# NOTE(review): probability=True adds an internal cross-validation pass and a
# kernel SVC scales poorly with the number of rows — confirm the runtime is
# acceptable on the full training set.
svm_model = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=42,
).fit(X_train, y_train)

# Hard labels for the confusion matrix and report; second probability column
# as the score for the ranking metrics.
y_pred_svm = svm_model.predict(X_test)
y_prob_svm = svm_model.predict_proba(X_test)[:, 1]

# Threshold-based metrics.
cm_svm = confusion_matrix(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm)
# Threshold-free metrics computed from the scores.
roc_auc_svm = roc_auc_score(y_test, y_prob_svm)
auprc_svm = average_precision_score(y_test, y_prob_svm)

print("\nSVM Evaluation:\n")
print("Confusion Matrix:\n", cm_svm)
print("\nClassification Report:\n", report_svm)
print("\nROC-AUC Score:", roc_auc_svm)
print("AUPRC (Area Under Precision-Recall Curve):", auprc_svm)

# from_predictions builds and draws the precision-recall curve in one call.
pr_display_svm = PrecisionRecallDisplay.from_predictions(y_test, y_prob_svm)
pr_display_svm.ax_.set_title('SVM Precision-Recall Curve')

In [None]:
from sklearn.metrics import RocCurveDisplay

# Both ROC curves share one axes so the two models can be compared directly.
fig, ax = plt.subplots(figsize=(8, 6))
RocCurveDisplay.from_predictions(y_test, y_prob_log, name='Logistic Regression', ax=ax)
RocCurveDisplay.from_predictions(y_test, y_prob_svm, name='SVM', ax=ax)
ax.set_title('ROC Curves for Logistic Regression and SVM')
plt.show()

In [None]:
# Derive an hour-of-day bucket (0-23) from `time`: whole multiples of 3600,
# wrapped modulo 24.
# NOTE(review): this presumes `time` is measured in seconds — TODO confirm.
df['hour_of_day'] = df['time'].floordiv(3600).mod(24)

# Count the rows labelled as fraud (class == 1) per hour, ordered by hour.
fraud_by_hour = (
    df.loc[df['class'] == 1, 'hour_of_day']
    .value_counts()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=fraud_by_hour.index, y=fraud_by_hour.values, ax=ax)
ax.set_title('Fraudulent Transactions by Hour of Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Fraudulent Transactions')
plt.show()

In [None]:
# numpy is needed for np.inf below and is not imported anywhere earlier in
# the notebook, so import it here to avoid a NameError on a fresh kernel.
import numpy as np

# Bucket each transaction amount into four labelled ranges:
#   [0, 50] -> Small, (50, 200] -> Medium, (200, 1000] -> Large, > 1000 -> Very Large
# include_lowest=True keeps amounts of exactly 0 in 'Small'; without it the
# first bin excludes its left edge and those rows become NaN and silently
# vanish from the counts.
df['amount_category'] = pd.cut(
    df['amount'],
    bins=[0, 50, 200, 1000, np.inf],
    labels=['Small', 'Medium', 'Large', 'Very Large'],
    include_lowest=True,
)

# Number of rows labelled as fraud (class == 1) in each amount bucket.
fraud_amount_dist = df[df['class'] == 1]['amount_category'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=fraud_amount_dist.index, y=fraud_amount_dist.values, ax=ax)
ax.set_title('Fraudulent Transactions by Amount Category')
ax.set_xlabel('Amount Category')
ax.set_ylabel('Number of Fraudulent Transactions')
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Cluster on the original feature columns only. Besides the label and `time`,
# the derived `hour_of_day` and `amount_category` columns added to `df` by the
# exploration cells must be excluded: `amount_category` holds string labels
# that PCA cannot process. errors='ignore' keeps this cell working
# when those exploration cells have not been run.
features_for_clustering = df.drop(
    columns=['class', 'time', 'hour_of_day', 'amount_category'],
    errors='ignore',
)

# Project onto the first two principal components for a 2-D view. The seed
# keeps the projection repeatable when a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
reduced_features = pca.fit_transform(features_for_clustering)

# Two clusters, fitted on the 2-D projection rather than the full feature set.
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(reduced_features)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=reduced_features[:, 0],
    y=reduced_features[:, 1],
    hue=clusters,
    palette='viridis',
    ax=ax,
)
ax.set_title('Clustering of Transactions (Potential Fraud Patterns)')
plt.show()