In [2]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/a3/9e/fbe60a768502af54563dcb59ca7856f5a8833b3ad5ada658922e1ab09b7f/imbalanced_learn-0.11.0-py3-none-any.whl.metadata
  Using cached imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting scikit-learn>=1.0.2 (from imbalanced-learn)
  Obtaining dependency information for scikit-learn>=1.0.2 from https://files.pythonhosted.org/packages/d0/0b/26ad95cf0b747be967b15fb71a06f5ac67aba0fd2f9cd174de6edefc4674/scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Using cached imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
Using cached scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
Installing collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-

In [3]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import os
import boto3
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
df = pd.read_csv('local_file.csv')

In [5]:
df['step'] = df['step'] / 24

In [6]:
max_value = df['step'].max()
min_value = df['step'].min()

print(f'Highest Value: {max_value}')
print(f'Lowest Value: {min_value}')

Highest Value: 30.958333333333332
Lowest Value: 0.041666666666666664


In [7]:
df['step'] = pd.to_numeric(df['step'], errors='coerce')

In [8]:

bin_edges = [0, 12, 24, 36]
bin_labels = ['Normal', 'Whole', 'Stay']
df['step'] = pd.cut(df['step'], bins=bin_edges, labels=bin_labels, right=False)

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['step'] = label_encoder.fit_transform(df['step'])


In [10]:
df['type'] = df['type'].replace(['TRANSFER', 'DEBIT'], 'OTHER')

In [11]:
frequency_map = df['type'].value_counts(normalize=False).to_dict()
df['type'] = df['type'].map(frequency_map)

In [12]:
columns_to_drop = ['nameOrig', 'nameDest', 'isFlaggedFraud']
df= df.drop(columns=columns_to_drop)

In [13]:
columns_to_drop = ['oldbalanceDest', 'oldbalanceOrg']
df = df.drop(columns=columns_to_drop)

In [14]:
X= df.drop(columns=['isFraud'])
y=df['isFraud']

In [15]:
df.head()

Unnamed: 0,step,type,amount,newbalanceOrig,newbalanceDest,isFraud
0,0,2151495,9839.64,160296.36,0.0,0
1,0,2151495,1864.28,19384.72,0.0,0
2,0,574341,181.0,0.0,0.0,1
3,0,2237500,181.0,0.0,0.0,1
4,0,2151495,11668.14,29885.86,0.0,0


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state = 42)

In [17]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [18]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [19]:
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [20]:
# Convert the resampled data back to Pandas DataFrames
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=['isFraud'])
y_test_df = pd.DataFrame(y_train, columns=['isFraud'])

# Save to CSV
X_resampled_df.to_csv('X_train_resampled_nb.csv', index=False)
y_resampled_df.to_csv('y_train_resampled_nb.csv', index=False)
X_test.to_csv('X_test_nb.csv', index=False)
y_test.to_csv('y_test_nb.csv', index=False)

In [21]:
# Initialize a boto3 client
s3_client = boto3.client('s3')

bucket = 'udacity-nano-degree-project'
prefix = 'processed_data_nb'
output_prefix = 'nb-output'

# Upload the files
s3_client.upload_file('X_train_resampled_nb.csv', bucket, f'{prefix}/train/X_train_resampled.csv')
s3_client.upload_file('y_train_resampled_nb.csv', bucket, f'{prefix}/train/y_train_resampled.csv')
s3_client.upload_file('X_test_nb.csv', bucket, f'{prefix}/test/X_test_nb.csv')
s3_client.upload_file('y_test_nb.csv', bucket, f'{prefix}/test/y_test_nb.csv')

In [22]:
# Now your data is in S3 and you can set up the s3_input objects
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/train/', content_type='csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/test/', content_type='csv')

In [29]:
from datetime import datetime

# Set your SageMaker session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # Make sure this IAM role has the necessary permissions

# Create a unique name for the estimator
unique_name = f"NB-Estimator-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"

# Create SKLearn estimator
sklearn_estimator = SKLearn(
    entry_point='navie_bayes.py', # Path to your training script
    role=role,
    instance_type='ml.m5.4xlarge',  # Example instance type, choose one based on your needs
    framework_version='0.23-1',  # Match this to your scikit-learn version
    py_version='py3',  # Choose Python version
    output_path=f's3://{bucket}/{output_prefix}',
    sagemaker_session=sagemaker_session,
    base_job_name=unique_name
)

# Fit estimator
sklearn_estimator.fit({'train': s3_input_train, 'test': s3_input_test})

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
Using provided s3_resource


INFO:sagemaker:Creating training-job with name: NB-Estimator-2023-11-10-03-04-49-2023-11-10-03-04-49-244


ClientError: An error occurred (AccessDeniedException) when calling the CreateTrainingJob operation: User: arn:aws:sts::863397112005:assumed-role/AmazonSageMaker-ExecutionRole-20231109T153131/SageMaker is not authorized to perform: sagemaker:CreateTrainingJob on resource: arn:aws:sagemaker:us-east-1:863397112005:training-job/NB-Estimator-2023-11-10-03-04-49-2023-11-10-03-04-49-244 with an explicit deny in a service control policy

In [None]:
!python --version