In [None]:
import pandas as pd

# Read the first 1,000,000 rows of dataset v1; drop `nrows` to load the whole file.
dataset = pd.read_csv(
    'combined_csv_v1.csv',
    nrows=1_000_000,
)

In [None]:
# printing the shape of the dataset
print(dataset.shape)

# view first 5 rows
dataset.head(5)

In [None]:
# 70/15/15 train/validation/test split: sample 70% for training, then split the
# remaining rows in half (same seed for both draws).
training_dataset = dataset.sample(frac=0.70, random_state=59)
test_val_dataset = dataset.drop(index=training_dataset.index)

testing_dataset = test_val_dataset.sample(frac=0.50, random_state=59)
validation_dataset = test_val_dataset.drop(index=testing_dataset.index)

# Shapes, one per line: full dataset, train, validation, test.
print(
    dataset.shape,
    training_dataset.shape,
    validation_dataset.shape,
    testing_dataset.shape,
    sep='\n',
)

In [None]:
# Write the train, validation and test splits to CSV with no header row and no index column.
for split_frame, csv_name in (
    (training_dataset, 'training_dataset.csv'),
    (validation_dataset, 'validation_dataset.csv'),
    (testing_dataset, 'testing_dataset.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:

# Open a SageMaker session and upload the three split files to its default bucket,
# each under its own `<prefix>/input/...` key prefix.

import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'airplane-delays'
training_data_path = sess.upload_data(
    path='training_dataset.csv',
    key_prefix=f'{prefix}/input/training',
)
validation_data_path = sess.upload_data(
    path='validation_dataset.csv',
    key_prefix=f'{prefix}/input/validation',
)
testing_data_path = sess.upload_data(
    path='testing_dataset.csv',
    key_prefix=f'{prefix}/input/testing',
)

# Print the returned upload locations, one per line.
print(training_data_path, validation_data_path, testing_data_path, sep='\n')


In [None]:
# Configure a SageMaker Linear Learner estimator as a binary classifier.

import boto3
from sagemaker import image_uris
from sagemaker.estimator import Estimator

region = boto3.Session().region_name
# `get_image_uri` is a deprecated SDK-v1 shim; `image_uris.retrieve` is the SDK-v2
# call (this notebook already depends on v2 through `sagemaker.TrainingInput`).
# version='1' matches the default the old helper used.
container = image_uris.retrieve(framework='linear-learner', region=region, version='1')

role = sagemaker.get_execution_role()

ll_estimator = Estimator(
    container,
    role=role,
    instance_count=1,              # SDK-v2 name for `train_instance_count`
    instance_type='ml.m5.large',   # SDK-v2 name for `train_instance_type`
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

ll_estimator.set_hyperparameters(
    predictor_type='binary_classifier',
    mini_batch_size=1000,
    epochs=3,
)

In [None]:
# Wrap the uploaded train and validation CSVs as the input channels passed to `fit`.
training_data_channel = sagemaker.TrainingInput(
    s3_data=training_data_path,
    content_type='text/csv',
)
validation_data_channel = sagemaker.TrainingInput(
    s3_data=validation_data_path,
    content_type='text/csv',
)

ll_data = dict(train=training_data_channel, validation=validation_data_channel)

In [None]:
ll_estimator.fit(ll_data)

In [None]:
# Batch-transform input: every column of the test split except the first (the target),
# saved without header or index and uploaded under the testing input prefix.
batch_test = testing_dataset.iloc[:, 1:]
batch_test.to_csv('batch-in.csv', header=False, index=False)
batch_test_filepath = sess.upload_data(
    path='batch-in.csv',
    key_prefix=f'{prefix}/input/testing',
)

print(batch_test_filepath)

In [None]:
batch_output = 's3://{}/{}/batch-out/'.format(bucket, prefix)
print(batch_output)

In [None]:
# Create a transformer from the trained estimator, run it over the uploaded test
# features (CSV, split line by line), then wait on the job.
ll_transformer = ll_estimator.transformer(
    instance_count=1,
    instance_type='ml.c5.9xlarge',
    strategy='MultiRecord',
    assemble_with='Line',
    output_path=batch_output,
)

ll_transformer.transform(
    data=batch_test_filepath,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)

ll_transformer.wait()

In [None]:
import io

In [None]:
# Fetch the batch-transform output object from S3 and parse it with a single
# named column, 'target'.
s3 = boto3.client('s3')
obj = s3.get_object(
    Bucket=bucket,
    Key=f'{prefix}/batch-out/batch-in.csv.out',
)
target_predicted = pd.read_csv(io.BytesIO(obj['Body'].read()), names=['target'])


In [None]:
target_predicted.dtypes

In [None]:
target_predicted['target'][0]

In [None]:
# Reduce each parsed output row to an integer class label by taking the last
# character of its index entry.
# NOTE(review): presumably the predicted label ends up at the tail of the index
# because each raw output line holds more fields than the single 'target' column —
# confirm against the actual batch-out file before relying on this.
target_predicted_real = list(map(lambda label: int(label[-1]), target_predicted.index))
target_predicted_real[:5]

In [None]:
testing_dataset.iloc[:,0]

In [None]:
from sklearn.metrics import confusion_matrix

# True labels are the first column of the test split.
test_labels = testing_dataset.iloc[:, 0]
# Confusion matrix of true labels against the parsed predictions.
matrix = confusion_matrix(y_true=test_labels, y_pred=target_predicted_real)
matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(test_labels, target_predicted):
    """Draw the confusion matrix of true vs. predicted labels as an annotated heatmap.

    Args:
        test_labels: true class labels.
        target_predicted: predicted class labels for the same rows.

    A new 7x7 matplotlib figure is created; nothing is returned.
    """
    class_names = ['Not Delayed', 'Delayed']
    counts = confusion_matrix(test_labels, target_predicted)
    plt.figure(figsize=(7, 7))
    sns.heatmap(
        counts,
        square=True,
        annot=True,
        fmt='d',
        cbar=False,
        cmap='mako',
        linewidths=0.5,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

In [None]:
from sklearn import metrics

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), and the heatmap labels class 1
# as "Delayed". The previous formulas indexed from matrix[0, 0], which scored
# "Not Delayed" as the positive class: recall/sensitivity and specificity were
# swapped, and precision/F1 described the negative class.
tn, fp, fn, tp = matrix.ravel()

print("\nTest Accuracy\n", metrics.accuracy_score(test_labels, target_predicted_real))
print("\nRecall\n", tp / (tp + fn))
print("\nPrecision\n", tp / (tp + fp))
print("\nSensitivity\n", tp / (tp + fn))   # synonym for recall
print("\nSpecificity\n", tn / (tn + fp))
print("\nF1 Score\n", tp / (tp + 0.5 * (fp + fn)))

In [None]:
plot_confusion_matrix(test_labels, target_predicted_real)

In [None]:
import pandas as pd

# Read the first 1,000,000 rows of dataset v2; drop `nrows` to load the whole file.
dataset = pd.read_csv(
    'combined_csv_v2.csv',
    nrows=1_000_000,
)

In [None]:
# printing the shape of the dataset
print(dataset.shape)

# view first 5 rows
dataset.head(5)

In [None]:
# 70/15/15 train/validation/test split: sample 70% for training, then split the
# remaining rows in half (same seed for both draws).
training_dataset = dataset.sample(frac=0.70, random_state=59)
test_val_dataset = dataset.drop(index=training_dataset.index)

testing_dataset = test_val_dataset.sample(frac=0.50, random_state=59)
validation_dataset = test_val_dataset.drop(index=testing_dataset.index)

# Shapes, one per line: full dataset, train, validation, test.
print(
    dataset.shape,
    training_dataset.shape,
    validation_dataset.shape,
    testing_dataset.shape,
    sep='\n',
)

In [None]:
# Write the train, validation and test splits to CSV with no header row and no index column.
for split_frame, csv_name in (
    (training_dataset, 'training_dataset.csv'),
    (validation_dataset, 'validation_dataset.csv'),
    (testing_dataset, 'testing_dataset.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:
# Open a SageMaker session and upload the three split files to its default bucket,
# each under its own `<prefix>/input/...` key prefix.

import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'airplane-delays'
training_data_path = sess.upload_data(
    path='training_dataset.csv',
    key_prefix=f'{prefix}/input/training',
)
validation_data_path = sess.upload_data(
    path='validation_dataset.csv',
    key_prefix=f'{prefix}/input/validation',
)
testing_data_path = sess.upload_data(
    path='testing_dataset.csv',
    key_prefix=f'{prefix}/input/testing',
)

# Print the returned upload locations, one per line.
print(training_data_path, validation_data_path, testing_data_path, sep='\n')

In [None]:
# Configure a SageMaker Linear Learner estimator as a binary classifier.

import boto3
from sagemaker import image_uris
from sagemaker.estimator import Estimator

region = boto3.Session().region_name
# `get_image_uri` is a deprecated SDK-v1 shim; `image_uris.retrieve` is the SDK-v2
# call (this notebook already depends on v2 through `sagemaker.TrainingInput`).
# version='1' matches the default the old helper used.
container = image_uris.retrieve(framework='linear-learner', region=region, version='1')

role = sagemaker.get_execution_role()

ll_estimator = Estimator(
    container,
    role=role,
    instance_count=1,              # SDK-v2 name for `train_instance_count`
    instance_type='ml.m5.large',   # SDK-v2 name for `train_instance_type`
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

ll_estimator.set_hyperparameters(
    predictor_type='binary_classifier',
    mini_batch_size=1000,
    epochs=3,
)

In [None]:
# Wrap the uploaded train and validation CSVs as the input channels passed to `fit`.
training_data_channel = sagemaker.TrainingInput(
    s3_data=training_data_path,
    content_type='text/csv',
)
validation_data_channel = sagemaker.TrainingInput(
    s3_data=validation_data_path,
    content_type='text/csv',
)

ll_data = dict(train=training_data_channel, validation=validation_data_channel)

In [None]:
ll_estimator.fit(ll_data)

In [None]:
# Batch-transform input: every column of the test split except the first (the target),
# saved without header or index and uploaded under the testing input prefix.
batch_test = testing_dataset.iloc[:, 1:]
batch_test.to_csv('batch-in.csv', header=False, index=False)
batch_test_filepath = sess.upload_data(
    path='batch-in.csv',
    key_prefix=f'{prefix}/input/testing',
)

print(batch_test_filepath)

In [None]:
batch_output = 's3://{}/{}/batch-out/'.format(bucket, prefix)
print(batch_output)

In [None]:
# Create a transformer from the trained estimator, run it over the uploaded test
# features (CSV, split line by line), then wait on the job.
ll_transformer = ll_estimator.transformer(
    instance_count=1,
    instance_type='ml.c5.9xlarge',
    strategy='MultiRecord',
    assemble_with='Line',
    output_path=batch_output,
)

ll_transformer.transform(
    data=batch_test_filepath,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)

ll_transformer.wait()

In [None]:
import io

In [None]:
# Fetch the batch-transform output object from S3 and parse it with a single
# named column, 'target'.
s3 = boto3.client('s3')
obj = s3.get_object(
    Bucket=bucket,
    Key=f'{prefix}/batch-out/batch-in.csv.out',
)
target_predicted = pd.read_csv(io.BytesIO(obj['Body'].read()), names=['target'])


In [None]:
target_predicted.dtypes


In [None]:
target_predicted['target'][0]

In [None]:
# Reduce each parsed output row to an integer class label by taking the last
# character of its index entry.
# NOTE(review): presumably the predicted label ends up at the tail of the index
# because each raw output line holds more fields than the single 'target' column —
# confirm against the actual batch-out file before relying on this.
target_predicted_real = list(map(lambda label: int(label[-1]), target_predicted.index))
target_predicted_real[:5]

In [None]:
testing_dataset.iloc[:,0]

In [None]:
from sklearn.metrics import confusion_matrix

# True labels are the first column of the test split.
test_labels = testing_dataset.iloc[:, 0]
# Confusion matrix of true labels against the parsed predictions.
matrix = confusion_matrix(y_true=test_labels, y_pred=target_predicted_real)
matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(test_labels, target_predicted):
    """Draw the confusion matrix of true vs. predicted labels as an annotated heatmap.

    Args:
        test_labels: true class labels.
        target_predicted: predicted class labels for the same rows.

    A new 7x7 matplotlib figure is created; nothing is returned.
    """
    class_names = ['Not Delayed', 'Delayed']
    counts = confusion_matrix(test_labels, target_predicted)
    plt.figure(figsize=(7, 7))
    sns.heatmap(
        counts,
        square=True,
        annot=True,
        fmt='d',
        cbar=False,
        cmap='mako',
        linewidths=0.5,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

In [None]:
from sklearn import metrics

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), and the heatmap labels class 1
# as "Delayed". The previous formulas indexed from matrix[0, 0], which scored
# "Not Delayed" as the positive class: recall/sensitivity and specificity were
# swapped, and precision/F1 described the negative class.
tn, fp, fn, tp = matrix.ravel()

print("\nTest Accuracy\n", metrics.accuracy_score(test_labels, target_predicted_real))
print("\nRecall\n", tp / (tp + fn))
print("\nPrecision\n", tp / (tp + fp))
print("\nSensitivity\n", tp / (tp + fn))   # synonym for recall
print("\nSpecificity\n", tn / (tn + fp))
print("\nF1 Score\n", tp / (tp + 0.5 * (fp + fn)))

In [None]:
plot_confusion_matrix(test_labels, target_predicted_real)

Step 3


In [None]:
import pandas as pd

# Read the first 1,000,000 rows of dataset v1; drop `nrows` to load the whole file.
dataset = pd.read_csv(
    'combined_csv_v1.csv',
    nrows=1_000_000,
)

In [None]:
# printing the shape of the dataset
print(dataset.shape)

# view first 5 rows
dataset.head(5)

In [None]:
# 70/15/15 train/validation/test split: sample 70% for training, then split the
# remaining rows in half (same seed for both draws).
training_dataset = dataset.sample(frac=0.70, random_state=59)
test_val_dataset = dataset.drop(index=training_dataset.index)

testing_dataset = test_val_dataset.sample(frac=0.50, random_state=59)
validation_dataset = test_val_dataset.drop(index=testing_dataset.index)

# Shapes, one per line: full dataset, train, validation, test.
print(
    dataset.shape,
    training_dataset.shape,
    validation_dataset.shape,
    testing_dataset.shape,
    sep='\n',
)

In [None]:
# Write the train, validation and test splits to CSV with no header row and no index column.
for split_frame, csv_name in (
    (training_dataset, 'training_dataset.csv'),
    (validation_dataset, 'validation_dataset.csv'),
    (testing_dataset, 'testing_dataset.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:
# Open a SageMaker session and upload the three split files to its default bucket,
# each under its own `<prefix>/input/...` key prefix.

import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'airplane-delays'
training_data_path = sess.upload_data(
    path='training_dataset.csv',
    key_prefix=f'{prefix}/input/training',
)
validation_data_path = sess.upload_data(
    path='validation_dataset.csv',
    key_prefix=f'{prefix}/input/validation',
)
testing_data_path = sess.upload_data(
    path='testing_dataset.csv',
    key_prefix=f'{prefix}/input/testing',
)

# Print the returned upload locations, one per line.
print(training_data_path, validation_data_path, testing_data_path, sep='\n')

In [None]:
# Configure a SageMaker XGBoost (1.0-1) estimator for two-class softmax classification.

import boto3
from sagemaker import image_uris
from sagemaker.estimator import Estimator

region = boto3.Session().region_name
# `get_image_uri` is a deprecated SDK-v1 shim; `image_uris.retrieve` is the SDK-v2
# call (this notebook already depends on v2 through `sagemaker.TrainingInput`).
container = image_uris.retrieve(framework='xgboost', region=region, version='1.0-1')

role = sagemaker.get_execution_role()

xgb_estimator = Estimator(
    container,
    role=role,
    instance_count=1,              # SDK-v2 name for `train_instance_count`
    instance_type='ml.m5.large',   # SDK-v2 name for `train_instance_type`
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

xgb_estimator.set_hyperparameters(
    objective='multi:softmax',
    num_class='2',
    num_round=10,
    early_stopping_rounds=5,
)

In [None]:
# Wrap the uploaded train and validation CSVs as the input channels passed to `fit`.
training_data_channel = sagemaker.TrainingInput(
    s3_data=training_data_path,
    content_type='text/csv',
)
validation_data_channel = sagemaker.TrainingInput(
    s3_data=validation_data_path,
    content_type='text/csv',
)

xgb_data = dict(train=training_data_channel, validation=validation_data_channel)

In [None]:
xgb_estimator.fit(xgb_data)

In [None]:
# Batch-transform input: every column of the test split except the first (the target),
# saved without header or index and uploaded under the testing input prefix.
batch_test = testing_dataset.iloc[:, 1:]
batch_test.to_csv('batch-in.csv', header=False, index=False)
batch_test_filepath = sess.upload_data(
    path='batch-in.csv',
    key_prefix=f'{prefix}/input/testing',
)

print(batch_test_filepath)

In [None]:
batch_output = 's3://{}/{}/batch-out/'.format(bucket, prefix)
print(batch_output)

In [None]:
# Create a transformer from the trained estimator, run it over the uploaded test
# features (CSV, split line by line), then wait on the job.
xgb_transformer = xgb_estimator.transformer(
    instance_count=1,
    instance_type='ml.c5.9xlarge',
    strategy='MultiRecord',
    assemble_with='Line',
    output_path=batch_output,
)

xgb_transformer.transform(
    data=batch_test_filepath,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)

xgb_transformer.wait()

In [None]:
import io

In [None]:
# Fetch the batch-transform output object from S3 and parse it with a single
# named column, 'target'.
s3 = boto3.client('s3')
obj = s3.get_object(
    Bucket=bucket,
    Key=f'{prefix}/batch-out/batch-in.csv.out',
)
target_predicted = pd.read_csv(io.BytesIO(obj['Body'].read()), names=['target'])

In [None]:
from sklearn.metrics import confusion_matrix

# True labels are the first column of the test split.
test_labels = testing_dataset.iloc[:, 0]
# Confusion matrix of true labels against the downloaded predictions.
matrix = confusion_matrix(y_true=test_labels, y_pred=target_predicted)
matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(test_labels, target_predicted):
    """Draw the confusion matrix of true vs. predicted labels as an annotated heatmap.

    Args:
        test_labels: true class labels.
        target_predicted: predicted class labels for the same rows.

    A new 7x7 matplotlib figure is created; nothing is returned.
    """
    class_names = ['Not Delayed', 'Delayed']
    counts = confusion_matrix(test_labels, target_predicted)
    plt.figure(figsize=(7, 7))
    sns.heatmap(
        counts,
        square=True,
        annot=True,
        fmt='d',
        cbar=False,
        cmap='mako',
        linewidths=0.5,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

In [None]:
from sklearn import metrics

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), and the heatmap labels class 1
# as "Delayed". The previous formulas indexed from matrix[0, 0], which scored
# "Not Delayed" as the positive class: recall/sensitivity and specificity were
# swapped, and precision/F1 described the negative class.
tn, fp, fn, tp = matrix.ravel()

print("\nTest Accuracy\n", metrics.accuracy_score(test_labels, target_predicted))
print("\nRecall\n", tp / (tp + fn))
print("\nPrecision\n", tp / (tp + fp))
print("\nSensitivity\n", tp / (tp + fn))   # synonym for recall
print("\nSpecificity\n", tn / (tn + fp))
print("\nF1 Score\n", tp / (tp + 0.5 * (fp + fn)))

In [None]:
plot_confusion_matrix(test_labels, target_predicted)

#result

Dataset 2 -----

In [None]:
import pandas as pd

# Read the first 1,000,000 rows of dataset v2; drop `nrows` to load the whole file.
dataset = pd.read_csv(
    'combined_csv_v2.csv',
    nrows=1_000_000,
)

In [None]:
# printing the shape of the dataset
print(dataset.shape)

# view first 5 rows
dataset.head(5)

In [None]:
# 70/15/15 train/validation/test split: sample 70% for training, then split the
# remaining rows in half (same seed for both draws).
training_dataset = dataset.sample(frac=0.70, random_state=59)
test_val_dataset = dataset.drop(index=training_dataset.index)

testing_dataset = test_val_dataset.sample(frac=0.50, random_state=59)
validation_dataset = test_val_dataset.drop(index=testing_dataset.index)

# Shapes, one per line: full dataset, train, validation, test.
print(
    dataset.shape,
    training_dataset.shape,
    validation_dataset.shape,
    testing_dataset.shape,
    sep='\n',
)

In [None]:
# Write the train, validation and test splits to CSV with no header row and no index column.
for split_frame, csv_name in (
    (training_dataset, 'training_dataset.csv'),
    (validation_dataset, 'validation_dataset.csv'),
    (testing_dataset, 'testing_dataset.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:
# Open a SageMaker session and upload the three split files to its default bucket,
# each under its own `<prefix>/input/...` key prefix.

import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'airplane-delays'
training_data_path = sess.upload_data(
    path='training_dataset.csv',
    key_prefix=f'{prefix}/input/training',
)
validation_data_path = sess.upload_data(
    path='validation_dataset.csv',
    key_prefix=f'{prefix}/input/validation',
)
testing_data_path = sess.upload_data(
    path='testing_dataset.csv',
    key_prefix=f'{prefix}/input/testing',
)

# Print the returned upload locations, one per line.
print(training_data_path, validation_data_path, testing_data_path, sep='\n')


In [None]:
# Configure a SageMaker XGBoost (1.0-1) estimator for two-class softmax classification.

import boto3
from sagemaker import image_uris
from sagemaker.estimator import Estimator

region = boto3.Session().region_name
# `get_image_uri` is a deprecated SDK-v1 shim; `image_uris.retrieve` is the SDK-v2
# call (this notebook already depends on v2 through `sagemaker.TrainingInput`).
container = image_uris.retrieve(framework='xgboost', region=region, version='1.0-1')

role = sagemaker.get_execution_role()

xgb_estimator = Estimator(
    container,
    role=role,
    instance_count=1,              # SDK-v2 name for `train_instance_count`
    instance_type='ml.m5.large',   # SDK-v2 name for `train_instance_type`
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

xgb_estimator.set_hyperparameters(
    objective='multi:softmax',
    num_class='2',
    num_round=10,
    early_stopping_rounds=5,
)

In [None]:
# Wrap the uploaded train and validation CSVs as the input channels passed to `fit`.
training_data_channel = sagemaker.TrainingInput(
    s3_data=training_data_path,
    content_type='text/csv',
)
validation_data_channel = sagemaker.TrainingInput(
    s3_data=validation_data_path,
    content_type='text/csv',
)

xgb_data = dict(train=training_data_channel, validation=validation_data_channel)

In [None]:
xgb_estimator.fit(xgb_data)

In [None]:
# Batch-transform input: every column of the test split except the first (the target),
# saved without header or index and uploaded under the testing input prefix.
batch_test = testing_dataset.iloc[:, 1:]
batch_test.to_csv('batch-in.csv', header=False, index=False)
batch_test_filepath = sess.upload_data(
    path='batch-in.csv',
    key_prefix=f'{prefix}/input/testing',
)

print(batch_test_filepath)

In [None]:
batch_output = 's3://{}/{}/batch-out/'.format(bucket, prefix)
print(batch_output)

In [None]:
# Create a transformer from the trained estimator, run it over the uploaded test
# features (CSV, split line by line), then wait on the job.
xgb_transformer = xgb_estimator.transformer(
    instance_count=1,
    instance_type='ml.c5.9xlarge',
    strategy='MultiRecord',
    assemble_with='Line',
    output_path=batch_output,
)

xgb_transformer.transform(
    data=batch_test_filepath,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)

xgb_transformer.wait()

In [None]:
import io

In [None]:
# Fetch the batch-transform output object from S3 and parse it with a single
# named column, 'target'.
s3 = boto3.client('s3')
obj = s3.get_object(
    Bucket=bucket,
    Key=f'{prefix}/batch-out/batch-in.csv.out',
)
target_predicted = pd.read_csv(io.BytesIO(obj['Body'].read()), names=['target'])

In [None]:
from sklearn.metrics import confusion_matrix

# True labels are the first column of the test split.
test_labels = testing_dataset.iloc[:, 0]
# Confusion matrix of true labels against the downloaded predictions.
matrix = confusion_matrix(y_true=test_labels, y_pred=target_predicted)
matrix


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(test_labels, target_predicted):
    """Draw the confusion matrix of true vs. predicted labels as an annotated heatmap.

    Args:
        test_labels: true class labels.
        target_predicted: predicted class labels for the same rows.

    A new 7x7 matplotlib figure is created; nothing is returned.
    """
    class_names = ['Not Delayed', 'Delayed']
    counts = confusion_matrix(test_labels, target_predicted)
    plt.figure(figsize=(7, 7))
    sns.heatmap(
        counts,
        square=True,
        annot=True,
        fmt='d',
        cbar=False,
        cmap='mako',
        linewidths=0.5,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

In [None]:
from sklearn import metrics

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), and the heatmap labels class 1
# as "Delayed". The previous formulas indexed from matrix[0, 0], which scored
# "Not Delayed" as the positive class: recall/sensitivity and specificity were
# swapped, and precision/F1 described the negative class.
tn, fp, fn, tp = matrix.ravel()

print("\nTest Accuracy\n", metrics.accuracy_score(test_labels, target_predicted))
print("\nRecall\n", tp / (tp + fn))
print("\nPrecision\n", tp / (tp + fp))
print("\nSensitivity\n", tp / (tp + fn))   # synonym for recall
print("\nSpecificity\n", tn / (tn + fp))
print("\nF1 Score\n", tp / (tp + 0.5 * (fp + fn)))

In [None]:
plot_confusion_matrix(test_labels, target_predicted)

#result

Conclusion