In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from utils import evaluate_model

In [None]:
# Load the raw transaction records from the project's data folder.
# A forward slash is used as the path separator: it is accepted on Windows, Linux and macOS,
# whereas the previous backslash form depended on the invalid escape sequence "\c" being
# kept literally and only resolved to a file on Windows.
df = pd.read_csv('data/credit_card_transactions.csv')

In [None]:
# Columns removed before modelling (presumably identifiers, personal details,
# coordinates and timestamps -- inferred from the column names only).
columns_to_drop = [
    'Unnamed: 0', 'trans_date_trans_time', 'cc_num',
    'first', 'last', 'gender', 'street',
    'lat', 'long', 'dob', 'trans_num',
    'merch_lat', 'merch_long', 'unix_time',
    'city', 'state',
]
# Copy first so the raw frame `df` is left untouched, then drop in one chained step.
df_dropped = df.copy().drop(columns=columns_to_drop)

In [None]:
# Where the merchant zip code is missing, fall back to the value in the `zip` column
# of the same row.
df_dropped = df_dropped.assign(
    merch_zipcode=lambda frame: frame['merch_zipcode'].fillna(frame['zip'])
)

In [None]:
# Keep only transactions whose amount is strictly below 10000; the result is an
# independent copy, so `df_dropped` is not modified.
amount_below_cap = df_dropped['amt'] < 10000
df_copy = df_dropped.loc[amount_below_cap].copy()

In [None]:
# One-hot encode `category` as 0/1 integer columns. drop_first=True omits the first
# level so the remaining indicator columns are not perfectly collinear.
dummies = pd.get_dummies(df_copy['category'], drop_first=True, dtype=int)
# Swap the original text column for its indicator columns (appended on the right).
df_copy = pd.concat([df_copy.drop(columns=['category']), dummies], axis=1)

In [None]:
# Standardize the numeric columns (zero mean, unit variance) on a copy of `df_copy`.
# NOTE(review): the scaler is fitted on every row, before the train/test split that
# happens further down, so held-out rows contribute to the scaling statistics --
# confirm whether that is acceptable for the reported metrics.
numeric_cols = ['amt', 'zip', 'city_pop', 'merch_zipcode']
scaler = StandardScaler()
df_scaled = df_copy.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])
df_scaled.head()

In [None]:
# Frequency-encode `merchant`: each merchant name is replaced by the number of rows
# in which it occurs.
merch_freq = df_scaled['merchant'].value_counts()
# Series.map performs the lookup against `merch_freq` in one vectorized pass instead of
# invoking a Python lambda once per row. A missing merchant (which value_counts does not
# count) yields NaN here, where the previous per-row lookup raised KeyError.
df_scaled['merchant_encoded'] = df_scaled['merchant'].map(merch_freq)
# The text column is no longer needed once its encoded counterpart exists.
df_scaled = df_scaled.drop(columns=['merchant'])

In [None]:
# Frequency-encode `job`: each job title is replaced by the number of rows in which
# it occurs.
job_freq = df_scaled['job'].value_counts()
# Series.map performs the lookup against `job_freq` in one vectorized pass instead of
# invoking a Python lambda once per row. A missing job (which value_counts does not
# count) yields NaN here, where the previous per-row lookup raised KeyError.
df_scaled['job_encoded'] = df_scaled['job'].map(job_freq)
# The text column is no longer needed once its encoded counterpart exists.
df_scaled = df_scaled.drop(columns=['job'])

In [None]:
# Preview the first five rows of the fully encoded feature table.
df_scaled.head(n=5)

In [None]:
# Separate the feature matrix from the target label `is_fraud`.
X = df_scaled.drop(columns=['is_fraud'])
y = df_scaled['is_fraud']
# stratify=y keeps the proportion of each `is_fraud` class identical in the train and
# test partitions, so the hold-out metrics are computed on a representative class mix
# (the later SMOTE/SMOTEENN cells suggest the classes are imbalanced).
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Logistic-regression baseline on the original (non-resampled) training split.
# The 'saga' solver shuffles the data while optimizing, so a fixed random_state is
# required for repeatable results; 42 matches the seed used by the other estimators
# in this notebook.
LogReg = LogisticRegression(solver='saga', max_iter=5000, random_state=42)
# evaluate_model comes from the project's utils module; presumably it fits the model
# on the training data and reports metrics on the test data -- confirm there.
evaluate_model(LogReg, X_train, y_train, X_test, y_test)

In [None]:
# Linear classifier optimized with stochastic gradient descent on the logistic loss,
# evaluated on the original (non-resampled) split with a fixed seed.
SGD = SGDClassifier(
    loss='log_loss',
    max_iter=5000,
    tol=1e-3,
    random_state=42,
)
evaluate_model(SGD, X_train, y_train, X_test, y_test)

In [None]:
# Plain SMOTE oversampling of the training split, stored under its own names.
# The next cell assigns the SMOTEENN output to X_train_resampled / y_train_resampled;
# writing the SMOTE output to those same names meant it was overwritten before any
# cell could use it, and made downstream results depend on which cell ran last.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Resample the training split with SMOTEENN (SMOTE oversampling combined with
# Edited-Nearest-Neighbours cleaning). Only the training data is resampled; the
# test split is left as it is.
smoteenn = SMOTEENN(random_state=42)
resampled = smoteenn.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
# Project the features onto 10 principal components learned from the resampled
# training data only; the test split is transformed with the same fitted model.
# random_state is fixed because, with the default svd_solver='auto', scikit-learn may
# select the randomized solver depending on the data shape and library version, which
# would otherwise make the projection differ from run to run.
pca_model = PCA(n_components=10, random_state=42)
pca_model.fit(X_train_resampled)

X_train_pca = pd.DataFrame(pca_model.transform(X_train_resampled))
X_test_pca = pd.DataFrame(pca_model.transform(X_test))
X_train_pca.head()

In [None]:
# Random forest with 100 trees and a fixed seed, trained on the PCA-projected
# resampled data and evaluated on the PCA-projected hold-out split.
RFC = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
evaluate_model(RFC, X_train_pca, y_train_resampled, X_test_pca, y_test)

In [None]:
# Search n_neighbors = 1..20 for a k-nearest-neighbours classifier using 5-fold
# cross-validation scored by accuracy.
# NOTE(review): the folds are cut from the already-resampled training data, so the
# validation folds contain resampled rows too -- confirm this is intended.
param_grid = {'n_neighbors': range(1, 21)}
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Report the winning setting and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

In [None]:
# k-nearest-neighbours classifier evaluated on the resampled training data and the
# untouched test split.
# NOTE(review): n_neighbors is fixed at 5 here rather than read from
# grid_search.best_params_ -- confirm the value is deliberate.
knn = KNeighborsClassifier(
    n_neighbors=5,
)
evaluate_model(knn, X_train_resampled, y_train_resampled, X_test, y_test)