In [18]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import os
import boto3
import matplotlib.pyplot as plt
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [4]:
# Location of the raw transaction log in S3 and the name of its local copy.
bucket_name = 'udacity-nano-degree-project'
file_key = 'raw_data/PS_20174392719_1491204439457_log.csv'
local_path = 'local_file.csv'

# Fetch the file only when the local copy is missing: a fresh environment can
# run this cell top-to-bottom, while repeated runs reuse the existing file
# instead of downloading it again.
if not os.path.exists(local_path):
    # Use the SageMaker session to download the file from S3
    sagemaker_session = sagemaker.Session()
    s3_client = sagemaker_session.boto_session.client('s3')
    s3_client.download_file(bucket_name, file_key, local_path)

# Load the data into a pandas DataFrame
df = pd.read_csv(local_path)

In [5]:
# Number of rows for each distinct value of the `type` column.
df.loc[:, 'type'].value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [6]:
# Count the missing entries in the label column.
df['isFraud'].isna().sum()

0

In [7]:
# Frequency-encode `type`: every label is replaced by the absolute number of
# rows that carry that label.
type_frequencies = df['type'].value_counts()
df['type'] = df['type'].map(type_frequencies)

print(df.head())

   step     type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1  2151495   9839.64  C1231006815       170136.0       160296.36   
1     1  2151495   1864.28  C1666544295        21249.0        19384.72   
2     1   532909    181.00  C1305486145          181.0            0.00   
3     1  2237500    181.00   C840083671          181.0            0.00   
4     1  2151495  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [8]:
# Number of rows for each distinct `nameOrig` value.
df.loc[:, 'nameOrig'].value_counts()

C1530544995    3
C545315117     3
C724452879     3
C1784010646    3
C1677795071    3
              ..
C1567523029    1
C644777639     1
C1256645416    1
C1231536757    1
C1971151096    1
Name: nameOrig, Length: 6353307, dtype: int64

In [9]:
# Convert the identifier to an integer by removing its non-digit prefix
# (e.g. 'C1231006815' -> 1231006815).
# Only *leading non-digits* are stripped, instead of unconditionally dropping
# the first character, so re-running this cell on the already converted column
# leaves the values intact rather than cutting off a real digit.
# int64 is requested explicitly so the result does not depend on the
# platform's default integer width.
df['nameOrig'] = (
    df['nameOrig']
    .astype(str)
    .str.replace(r'^\D+', '', regex=True)
    .astype('int64')
)

print(df.head())

   step     type    amount    nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1  2151495   9839.64  1231006815       170136.0       160296.36   
1     1  2151495   1864.28  1666544295        21249.0        19384.72   
2     1   532909    181.00  1305486145          181.0            0.00   
3     1  2237500    181.00   840083671          181.0            0.00   
4     1  2151495  11668.14  2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [10]:
# Convert the identifier to an integer by removing its non-digit prefix
# (e.g. 'M1979787155' -> 1979787155).
# Only *leading non-digits* are stripped, instead of unconditionally dropping
# the first character, so re-running this cell on the already converted column
# leaves the values intact rather than cutting off a real digit.
# int64 is requested explicitly so the result does not depend on the
# platform's default integer width.
# NOTE(review): the head() output shows both 'M...' and 'C...' prefixes in this
# column; that distinction is discarded here. Consider keeping it as a
# separate feature if it turns out to be informative.
df['nameDest'] = (
    df['nameDest']
    .astype(str)
    .str.replace(r'^\D+', '', regex=True)
    .astype('int64')
)

print(df.head())

   step     type    amount    nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1  2151495   9839.64  1231006815       170136.0       160296.36   
1     1  2151495   1864.28  1666544295        21249.0        19384.72   
2     1   532909    181.00  1305486145          181.0            0.00   
3     1  2237500    181.00   840083671          181.0            0.00   
4     1  2151495  11668.14  2048537720        41554.0        29885.86   

     nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  1979787155             0.0             0.0        0               0  
1  2044282225             0.0             0.0        0               0  
2   553264065             0.0             0.0        1               0  
3    38997010         21182.0             0.0        1               0  
4  1230701703             0.0             0.0        0               0  


In [11]:
# Columns that are excluded from the data set before modelling.
columns_to_drop = ['oldbalanceDest', 'oldbalanceOrg', 'isFlaggedFraud']
df = df.drop(labels=columns_to_drop, axis='columns')

In [12]:
# Separate the label from the features: `y` is the `isFraud` column and `X`
# holds every remaining column.
y = df['isFraud']
X = df.drop(labels=['isFraud'], axis='columns')

In [13]:
# Hold out 30% of the rows for testing. The split is stratified on the label so
# that the train and test partitions keep the same class proportions, which
# matters when one class is rare; random_state pins the shuffle so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [14]:
# Fit the scaler on the training split only and reuse the learned statistics
# for the test split. Refitting on X_test would put the two splits on
# different scales and leak test-set statistics into the preprocessing.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Remember the row and column labels of each split so they can be re-attached
# to the scaled values.
train_index, train_columns = X_train.index, X_train.columns
test_index, test_columns = X_test.index, X_test.columns

# Convert the scaled arrays back to DataFrames
X_train_df = pd.DataFrame(data=X_train_scaled, columns=train_columns, index=train_index)
X_test_df = pd.DataFrame(data=X_test_scaled, columns=test_columns, index=test_index)

# Convert the target series to DataFrames, using the row labels of the
# matching feature split
y_train_df = pd.DataFrame(y_train, index=train_index)
y_test_df = pd.DataFrame(y_test, index=test_index)

In [24]:
# Define your bucket and prefix paths
bucket = 'udacity-nano-degree-project'
prefix = 'processed_data'

# Get the S3 client
s3 = boto3.client('s3')

# Each split is written to a local CSV file and uploaded under its own
# sub-prefix: {prefix}/train/ or {prefix}/test/.
splits = {
    'train': {'X_train.csv': X_train_df, 'y_train.csv': y_train_df},
    'test': {'X_test.csv': X_test_df, 'y_test.csv': y_test_df},
}
for channel, frames in splits.items():
    for filename, frame in frames.items():
        frame.to_csv(filename, header=True, index=False)
        s3.upload_file(filename, bucket, f'{prefix}/{channel}/{filename}')

# Point each input channel at its own sub-prefix. Using the bare
# s3://{bucket}/{prefix}/ for both would hand the train AND test files to
# every channel, so the two channels would not be distinguishable.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/train/', content_type='csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=f's3://{bucket}/{prefix}/test/', content_type='csv')


In [25]:
# Session and IAM role under which the training job is launched.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # Make sure this IAM role has the necessary permissions

# All estimator settings gathered in one mapping, then expanded into the
# SKLearn constructor.
estimator_settings = dict(
    entry_point='gridSearch.py',  # training script executed inside the container
    role=role,
    instance_type='ml.m5.xlarge',  # example instance type; pick one that fits the workload
    framework_version='0.23-1',  # should match the scikit-learn version the script targets
    py_version='py3',
    sagemaker_session=sagemaker_session,
)
sklearn_estimator = SKLearn(**estimator_settings)

# Start the training job with one input channel per split.
training_channels = {'train': s3_input_train, 'test': s3_input_test}
sklearn_estimator.fit(training_channels)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-11-09-22-50-10-092


Using provided s3_resource
2023-11-09 22:50:10 Starting - Starting the training job...
2023-11-09 22:50:25 Starting - Preparing the instances for training......
2023-11-09 22:51:29 Downloading - Downloading input data......
2023-11-09 22:52:36 Training - Training image download completed. Training in progress.
2023-11-09 22:52:36 Uploading - Uploading generated training model
2023-11-09 22:52:36 Failed - Training job failed
[34m2023-11-09 22:52:22,136 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-11-09 22:52:22,140 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-11-09 22:52:22,186 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-11-09 22:52:22,365 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-11-09 22:52:22,377 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2023-11-09-22-50-10-092: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 291, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 208, in check_error
    info=extra_info,
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/miniconda3/bin/python gridSearch.py"

ExecuteUserScrip

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.svm import SVC

smote = SMOTE(sampling_strategy='auto', random_state=42)

# No class_weight is passed: SMOTE already balances the classes the SVM is
# trained on, and no `class_weights` value is defined anywhere in this notebook.
svm_classifier = SVC()

# Scaling and oversampling live inside the pipeline so that, during
# cross-validation, they are fitted on the training part of each fold only.
# Oversampling the whole training set up front would place synthetic samples
# in the validation folds and inflate the scores. It also means X_train and
# y_train are no longer overwritten, so this cell can be re-run safely.
model_pipeline = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('smote', smote),
    ('svm', svm_classifier),
])

# The `svm__` prefix routes each parameter to the pipeline step named 'svm'.
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
}
stratified_kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring='f1',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# best_estimator_ is the whole pipeline, so the test features go through the
# same scaler before prediction; the sampler is only applied while fitting.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)