In [2]:
!pip install imbalanced-learn

[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip


In [1]:
import os
from datetime import datetime

import boto3
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import seaborn as sns
import sklearn
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [None]:
# Load the dataset from a CSV file in the current working directory.
csv_path = 'local_file.csv'
df = pd.read_csv(csv_path)

In [None]:
# Rescale `step` by a factor of 24.
# NOTE(review): presumably converts hourly steps into days - confirm against the dataset description.
# This cell mutates `df` in place, so running it twice divides twice.
STEP_DIVISOR = 24
df['step'] = df['step'] / STEP_DIVISOR

In [None]:
# Report the range of the rescaled `step` column.
step_column = df['step']
min_value = step_column.min()
max_value = step_column.max()

print(f'Highest Value: {max_value}')
print(f'Lowest Value: {min_value}')

In [None]:
# Force `step` to a numeric dtype; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
step_numeric = pd.to_numeric(df['step'], errors='coerce')
df['step'] = step_numeric

In [None]:

# Bucket `step` into three left-closed intervals: [0, 12), [12, 24), [24, 36).
# Values outside these intervals are left as NaN by pd.cut.
bin_labels = ['Normal', 'Whole', 'Stay']
bin_edges = [0, 12, 24, 36]
df['step'] = pd.cut(
    df['step'],
    bin_edges,
    right=False,
    labels=bin_labels,
)

In [None]:
# Encode the binned `step` column as integers.
#
# The column is an ordered categorical produced by pd.cut, so its category
# codes already follow the bin order: Normal=0, Whole=1, Stay=2.
# LabelEncoder is intended for target labels and sorts classes alphabetically
# (Normal, Stay, Whole), which would scramble that order, so the categorical
# codes are used instead.
#
# NOTE: rows that fell outside every bin are NaN in the categorical and are
# encoded as -1.
df['step'] = df['step'].cat.codes


In [None]:
# Collapse the 'TRANSFER' and 'DEBIT' transaction types into one 'OTHER' bucket.
type_merges = {'TRANSFER': 'OTHER', 'DEBIT': 'OTHER'}
df['type'] = df['type'].replace(type_merges)

In [None]:
# Frequency-encode `type`: each category is replaced by its absolute row count.
# NOTE(review): the counts are computed on the full frame, before the
# train/test split further down - confirm this is acceptable.
type_counts = df['type'].value_counts(normalize=False)
frequency_map = type_counts.to_dict()
df['type'] = df['type'].map(frequency_map)

In [None]:
# Remove columns that are not used as model inputs.
columns_to_drop = ['nameOrig', 'nameDest', 'isFlaggedFraud']
df = df.drop(columns_to_drop, axis=1)

In [None]:
# Also remove the two "old balance" columns from the feature set.
columns_to_drop = ['oldbalanceDest', 'oldbalanceOrg']
df = df.drop(columns_to_drop, axis='columns')

In [None]:
# Separate the label (`isFraud`) from the remaining feature columns.
y = df['isFraud']
X = df.drop('isFraud', axis=1)

In [None]:
# Preview the first five rows of the processed frame.
df.head(5)

In [None]:
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
#
# The split is stratified on the label so both partitions keep the same
# fraud / non-fraud ratio. The notebook later oversamples with SMOTE, so the
# positive class is presumably rare; without stratification the test set could
# end up with a noticeably different share of positives.
#
# train_test_split and RobustScaler are already imported in the notebook's
# import cell, so they are not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Scale the features with RobustScaler.
#
# The scaler is fitted on the training split only. The test split is then
# transformed with those same training statistics; re-fitting on the test data
# would put the two splits on different scales and leak test-set information
# into preprocessing.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap the scaled splits in DataFrames again so that the feature names from
# `X.columns` are attached to them.
feature_names = X.columns
X_train = pd.DataFrame(data=X_train, columns=feature_names)
X_test = pd.DataFrame(data=X_test, columns=feature_names)

In [None]:
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (seeded for reproducibility).
# Only the training data is resampled; the test split is left untouched.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [33]:
# Convert the resampled training data and the test labels to DataFrames.
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=['isFraud'])
# The test-label frame must be built from y_test (not y_train), and it is the
# object that gets written to disk below.
y_test_df = pd.DataFrame(y_test, columns=['isFraud'])

# Save all four pieces as CSV files without the index column.
X_resampled_df.to_csv('X_train_resampled_svm.csv', index=False)
y_resampled_df.to_csv('y_train_resampled_svm.csv', index=False)
X_test.to_csv('X_test_svm.csv', index=False)
y_test_df.to_csv('y_test_svm.csv', index=False)

In [37]:
# S3 client used for the uploads below.
s3_client = boto3.client('s3')

# Destination bucket and key prefixes (`output_prefix` is used when the
# estimator is configured).
bucket = 'udacity-nano-degree-project'
prefix = 'processed_data_svm'
output_prefix = 'svm-output'

# (local file, S3 key) pairs: training files go under <prefix>/train/,
# test files under <prefix>/test/.
uploads = [
    ('X_train_resampled_svm.csv', f'{prefix}/train/X_train_resampled.csv'),
    ('y_train_resampled_svm.csv', f'{prefix}/train/y_train_resampled.csv'),
    ('X_test_svm.csv', f'{prefix}/test/X_test_svm.csv'),
    ('y_test_svm.csv', f'{prefix}/test/y_test_svm.csv'),
]
for local_path, s3_key in uploads:
    s3_client.upload_file(local_path, bucket, s3_key)

In [38]:
# Describe the uploaded S3 folders as SageMaker training inputs (CSV content).
train_uri = f's3://{bucket}/{prefix}/train/'
test_uri = f's3://{bucket}/{prefix}/test/'

s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_uri, content_type='csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=test_uri, content_type='csv')

In [None]:
# Set your SageMaker session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # Make sure this IAM role has the necessary permissions

# Create a unique, timestamped base name for the training job
# (`datetime` comes from the notebook's import cell).
unique_name = f"SMOTE-SVM-Estimator-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"

# Create the SKLearn estimator. Every keyword argument must be separated by a
# comma; a trailing comma is kept so adding further arguments stays safe.
sklearn_estimator = SKLearn(
    entry_point='gridSearch.py',  # Path to your training script
    role=role,
    instance_type='ml.m5.xlarge',  # Example instance type, choose one based on your needs
    framework_version='0.23-1',  # Match this to your scikit-learn version
    py_version='py3',  # Choose Python version
    output_path=f's3://{bucket}/{output_prefix}',  # Where SageMaker writes the job output
    sagemaker_session=sagemaker_session,
    base_job_name=unique_name,
)

# Launch the training job with the 'train' and 'test' input channels
sklearn_estimator.fit({'train': s3_input_train, 'test': s3_input_test})

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.svm import SVC

# Hyper-parameter candidates the randomized search samples from.
param_dist = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
)
svm_classifier = SVC()

# 5-fold stratified cross-validation, repeated 5 times with a fixed seed.
stratified_kfold = RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=42)

# Sample 10 parameter settings and rank them by recall, using all CPU cores.
# NOTE(review): the search is fitted on X_train / y_train, not on the
# SMOTE-resampled data - confirm that this is intended.
randomized_search = RandomizedSearchCV(
    svm_classifier,
    param_dist,
    scoring='recall',
    cv=stratified_kfold,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
)
randomized_search.fit(X_train, y_train)

# Evaluate the best estimator found on the held-out test split.
best_model = randomized_search.best_estimator_
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
