# Fraud Detection

### Data Collection

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
# Load the dataset from 'creditcard.csv' (path is relative to the working directory).
df = pd.read_csv('creditcard.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its summary directly to stdout and returns None, so it is
# called bare: wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())

# Check for missing values (one count per column).
print(df.isnull().sum())

# Visualize the distribution of the target variable.
sns.countplot(x='Class', data=df)
plt.title('Distribution of the target variable (Class)')
plt.show()

### Feature Engineering

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise 'Amount' and 'Time' in place on df.
# One scaler object is shared and re-fitted per column, so once this cell has run
# it holds the statistics of the last column processed ('Time').
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed later in the notebook.
scaler = StandardScaler()
for column in ('Amount', 'Time'):
    column_2d = df[column].to_numpy().reshape(-1, 1)  # scaler expects a 2-D array
    df[column] = scaler.fit_transform(column_2d)

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the feature matrix from the target column.
X = df.drop('Class', axis=1)
y = df['Class']

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions the
# same in both partitions, which matters when one class is rare (the notebook later
# treats this data as imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaling of 'Time' and 'Amount' on the training rows only and apply the same
# transform to the test rows, so no test-set statistics leak into training.
# Standardisation is an affine map, so re-fitting here gives exactly the train-only
# scaling even if these columns were already standardised on the full frame.
scaled_columns = ['Time', 'Amount']
# Work on explicit copies so the column assignments below do not write into
# (or warn about) slices of X.
X_train, X_test = X_train.copy(), X_test.copy()
split_scaler = StandardScaler().fit(X_train[scaled_columns])
X_train[scaled_columns] = split_scaler.transform(X_train[scaled_columns])
X_test[scaled_columns] = split_scaler.transform(X_test[scaled_columns])

### Choosing Algorithm

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize models.
# random_state pins the two stochastic ensembles so a fresh Restart & Run All
# reproduces the same reports. max_iter is raised above LogisticRegression's default
# of 100 to give the solver room to converge.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Train models on the training partition.
for model in (lr, rf, gb):
    model.fit(X_train, y_train)

# Predict on the held-out partition. The individual prediction arrays keep their
# names so later cells can still refer to them.
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_gb = gb.predict(X_test)

# Evaluate: one classification report per model, in the same order and with the same
# headings as before.
for title, y_pred in (
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
    ("Gradient Boosting", y_pred_gb),
):
    print(f"{title}:\n", classification_report(y_test, y_pred))


### Imbalanced Data

In [None]:
from imblearn.over_sampling import SMOTE 
# Oversample with SMOTE using the training partition only; X_test / y_test are
# left untouched.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Re-fit the existing random forest object on the resampled data (this replaces
# whatever it had learned before) and predict the original test rows.
# fit() returns the estimator itself, so the call can be chained.
y_pred_rf_res = rf.fit(X_train_res, y_train_res).predict(X_test)

smote_report = classification_report(y_test, y_pred_rf_res)
print("Random Forest with SMOTE:\n", smote_report)

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

# Resampling has to happen inside each cross-validation training fold. Searching on
# data that was oversampled up front puts synthetic rows, interpolated from rows of
# the validation folds, into the training folds, which inflates the F1 scores used
# to pick the hyperparameters. An imblearn Pipeline applies SMOTE during fit only,
# so every validation fold is scored on original, un-resampled rows.
smote_rf = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Same search space as before; the 'rf__' prefix routes each parameter to the
# random-forest step of the pipeline (best_params_ reports the prefixed keys).
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'rf__min_samples_split': [2, 5],
}

# n_jobs=-1 runs the candidate fits in parallel; it does not change the results.
grid_search = GridSearchCV(
    estimator=smote_rf,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
)
# Fit on the original training partition; the pipeline does the oversampling per fold.
# best_estimator_ is the pipeline (SMOTE + forest) refit on all of X_train.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

### Model Evaluation 


In [None]:
from sklearn.model_selection import cross_val_score 
 
# Best configuration found by the grid search, already refit by GridSearchCV.
final_model = grid_search.best_estimator_

# Cross-validate on the training partition only. X / y also contain the held-out
# test rows, so scoring on them would mix test data into the estimate.
# cross_val_score fits clones, so final_model itself is not modified here.
scores = cross_val_score(final_model, X_train, y_train, cv=5, scoring='f1')

print("Cross-validation F1 scores:", scores)
print("Mean F1 score:", scores.mean())

# Final check on the rows that were held out of training and tuning.
y_pred_final = final_model.predict(X_test)
print("Final model on held-out test set:\n", classification_report(y_test, y_pred_final))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_final))