# [AIG130 – Lab 5: AutoML](https://github.com/jcp-tech/Seneca_Class_Notes/tree/master/Semester%201/AIG130%20-%20Cloud%20Computing%20for%20Machine%20Learning/Lab%205)

## Initialising Libraries and Functions

In [1]:
!pip install numpy pandas matplotlib seaborn scikit-learn kagglehub ucimlrepo google-cloud-aiplatform

^C




In [2]:
## Import required libraries
# import matplotlib.pyplot as plt
from dotenv import load_dotenv # Used for Environment Variables
# import scipy.stats as stats
# import seaborn as sns
import pandas as pd
# import numpy as np
import sys, os
# import math

## Preprocessing
from sklearn.model_selection import train_test_split

## Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

## Load the dataset & Ignore warnings
from ucimlrepo import fetch_ucirepo # , list_available_datasets
import kagglehub
import warnings

## GCP Libraries
from google.cloud import aiplatform # AutoML from Vertex AI
from google.cloud import storage # Buckets in GCS


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings(action="ignore")

In [None]:
# Load the environment variables from the .env file. Execution stops when the
# file cannot be loaded or when a required variable is still unset afterwards.
try:
    env_loaded = load_dotenv()
except Exception as e:
    print(f"Error loading .env file: {e}")
    sys.exit("Stopping execution as environment variables are required")

if not env_loaded:
    sys.exit("Stopping execution as environment variables are required")

# NOTE(review): API_KEY is the only variable validated here, while the cells
# below read PROJECT_ID, LOCATION and BUCKET_NAME - confirm which are required.
if os.getenv("API_KEY") is None:
    sys.exit("Stopping execution as required environment variables are not set")

# Reached only when every check above passed, so the message is never printed
# on a failure path.
print("Environment variables loaded successfully!")

In [5]:
# Local service-account key file; it is only used when it exists on this machine.
service_account_file = r"C:\Users\JonathanChackoPattas\OneDrive - Maritime Support Solutions\Desktop\Class Notes\Seneca\Semester 1\AIG130 - Cloud Computing for Machine Learning\Lab 5\ignore\serviceAccountKey.json"

def set_gc_credentials(service_account_file):
    """Point GOOGLE_APPLICATION_CREDENTIALS at a service-account key file.

    Resolution order:
      1. ``service_account_file`` when that file exists;
      2. the value of the ``GC_CREDS`` environment variable when it is non-empty;
      3. otherwise the environment is left untouched and a notice is printed,
         so a credential that is already configured is not overwritten with a
         path that does not exist.

    Args:
        service_account_file: Path to a service-account JSON key file.
    """
    if os.path.isfile(service_account_file):
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_file
        return

    fallback = os.environ.get('GC_CREDS')
    if fallback:
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = fallback
        return

    print(
        f"Service account key not found at {service_account_file}; "
        "leaving GOOGLE_APPLICATION_CREDENTIALS unchanged"
    )

set_gc_credentials(service_account_file)

In [6]:
# GCP settings read from the process environment; each is None when unset.
PROJECT_ID = os.environ.get("PROJECT_ID") # @param {type:"string"}
LOCATION = os.environ.get("LOCATION") # @param {type:"string"}
BUCKET_NAME = os.environ.get("BUCKET_NAME") # @param {type:"string"}

In [7]:
# Cloud Storage client, created without explicit arguments.
# NOTE(review): presumably authenticates through GOOGLE_APPLICATION_CREDENTIALS - confirm.
storage_client = storage.Client()
# Handle to the bucket named by BUCKET_NAME.
bucket = storage_client.bucket(BUCKET_NAME)
# Set the project and location for subsequent Vertex AI SDK calls.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [8]:
def gcp_process_data(df, target, bucket, proccess_type ="classification"):
    """Train, deploy and query a Vertex AI AutoML tabular model, then clean up.

    The frame is split 80/20. The training part is written to
    ``<proccess_type>_data.csv`` in the working directory, uploaded to
    ``bucket`` and registered as a Vertex AI tabular dataset. An AutoML
    tabular training job is run on that dataset, the resulting model is
    deployed to an endpoint, and the held-out part (with ``target`` removed)
    is sent to the endpoint for online prediction.

    The endpoint, model and training job are deleted before returning, and
    also when deployment or prediction fails. The uploaded CSV and the
    Vertex AI dataset are left in place.

    Args:
        df: pandas DataFrame holding the feature columns and ``target``.
        target: Name of the column to predict.
        bucket: Cloud Storage bucket object the training CSV is uploaded to.
        proccess_type: Passed to AutoML as ``optimization_prediction_type``
            (e.g. "classification" or "regression"); also used to name the
            CSV file and the dataset.

    Returns:
        The object returned by ``endpoint.predict`` for the held-out rows.
    """
    DATASET_NAME = f"{proccess_type}-dataset"
    csv_name = f"{proccess_type}_data.csv"

    # Split into Train and Test
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

    # Save the training rows as CSV and upload them to the bucket
    train_data.to_csv(csv_name, index=False)
    blob = bucket.blob(csv_name)
    blob.upload_from_filename(csv_name)

    # GCS Path
    gcs_uri = f"gs://{bucket.name}/{csv_name}"

    # Create Vertex AI Dataset
    dataset = aiplatform.TabularDataset.create(
        display_name=DATASET_NAME,
        gcs_source=[gcs_uri]
    )

    job = aiplatform.AutoMLTabularTrainingJob(
        display_name="automl-tabular-model",
        optimization_prediction_type=proccess_type,
    )

    model = None
    endpoint = None
    try:
        model = job.run(
            dataset=dataset,
            target_column=target,
            model_display_name="adopted-prediction-model",
        )

        # Deploy the Model
        endpoint = model.deploy(
            machine_type="n1-standard-4",
        )

        # Non-mutating drop: test_data is a slice produced by the split, so
        # dropping in place would modify a derived frame.
        features = test_data.drop(columns=[target])

        # NOTE(review): every held-out row goes out in a single request; a
        # large frame may exceed the online prediction size limit - confirm.
        predictions = endpoint.predict(instances=features.to_dict(orient="records"))
    finally:
        # Tear down in dependency order so nothing billable is left running,
        # even when deploy or predict raised.
        if endpoint is not None:
            # force=True undeploys the model first; an endpoint with deployed
            # models cannot be deleted otherwise.
            endpoint.delete(force=True)
        if model is not None:
            # The model can only be deleted once it is no longer deployed.
            model.delete()
            # When training itself failed, model is None and the job is kept
            # so the failed run can be inspected.
            job.delete()

    return predictions

--------------------------------------------------

# Classification Task
**Dataset**: Phishing Websites dataset

**Source**: [UCI ML Repository](https://archive.ics.uci.edu/dataset/327/phishing+websites)

**Objective**: To classify websites as either legitimate or phishing based on various URL and website features. This classification model will help improve cybersecurity by automatically identifying potentially malicious websites that attempt to steal sensitive information from users.

## Classification Dataset

In [10]:
# Fetch the Phishing Websites dataset by id, falling back to a lookup by name.
try:
    phishing_websites = fetch_ucirepo(id=327)
except Exception as e:
    print(f"Error loading dataset: {e}")
    phishing_websites = fetch_ucirepo(name='Phishing Websites') # fetch dataset by name

# Built outside the try statement: if both fetches fail, the real error
# propagates instead of a NameError on `phishing_websites`.
X = pd.DataFrame(phishing_websites.data.features) # Features
y = pd.DataFrame(phishing_websites.data.targets) # Target variable
df_classification = pd.concat([X, y], axis=1) # concatenate features and target variable
df_classification

Unnamed: 0,having_ip_address,url_length,shortining_service,having_at_symbol,double_slash_redirecting,prefix_suffix,having_sub_domain,sslfinal_state,domain_registration_length,favicon,...,popupwindow,iframe,age_of_domain,dnsrecord,web_traffic,page_rank,google_index,links_pointing_to_page,statistical_report,result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,1,-1,1,-1,1,1,1,1,-1,-1,...,-1,-1,1,1,-1,-1,1,1,1,1
11051,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,1,1,1,1,1,1,-1,1,-1
11052,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,1,-1,1,0,1,-1
11053,-1,-1,1,1,1,-1,-1,-1,1,-1,...,-1,1,1,1,1,-1,1,1,1,-1


## Model Implementation & Evaluation

In [None]:
# Run the AutoML pipeline on the phishing data, predicting the 'result' column.
prediction_classification = gcp_process_data(
    df=df_classification,
    target='result',
    bucket=bucket,
    proccess_type="classification",
)
prediction_classification

![Training CSV uploaded to the GCS bucket](Images%20for%20PRESENTATION/UploadedToBucket.png)

----------------------------------------------------------------------

![Vertex AI training page](Images%20for%20PRESENTATION/VertexTrainingPage.png)

----------------------------------------------------------------------

![AutoML tabular training on the classification data](Images%20for%20PRESENTATION/TabluarTrainingClassificationData.png)

--------------------------------------------------

# Regression Task
**Dataset**: Flight Price Prediction dataset

**Source**: [Kaggle](https://www.kaggle.com/datasets/shubhambathwal/flight-price-prediction)

**Objective**: To predict the price of airline tickets based on various features such as airline, flight details, source and destination cities, departure and arrival times, number of stops, class, duration, and days left before the flight. This regression model will help travelers anticipate flight costs and potentially make more economical travel decisions.

## Regression Dataset

In [12]:
# Load the flight-price data through the kagglehub pandas adapter, falling back
# to downloading the dataset and reading the CSV directly.
try:
    df_regression = kagglehub.load_dataset(
        kagglehub.KaggleDatasetAdapter.PANDAS,
        "shubhambathwal/flight-price-prediction",
        "Clean_Dataset.csv",
    )
except Exception as e:
    print(f"Error loading dataset: {e}")
    path = kagglehub.dataset_download("shubhambathwal/flight-price-prediction") # Download latest version
    df_regression = pd.read_csv(os.path.join(path, "Clean_Dataset.csv"))

# Built outside the try statement: if both loaders fail, the real error
# propagates instead of a NameError on `df_regression`.
# Drop the unnecessary index column first so it does not leak into X;
# errors='ignore' keeps the cell re-runnable once the column is gone.
df_regression = df_regression.drop(columns=['Unnamed: 0'], errors='ignore')
X = df_regression.drop(columns=['price']) # Features
y = df_regression['price'] # Target variable
df_regression



Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585


## Model Implementation & Evaluation

In [None]:
# Run the AutoML pipeline on the flight-price data, predicting the 'price' column.
prediction_regression = gcp_process_data(
    df=df_regression,
    target='price',
    bucket=bucket,
    proccess_type="regression",
)
prediction_regression

![Training CSV uploaded to the GCS bucket](Images%20for%20PRESENTATION/UploadedToBucket.png)

----------------------------------------------------------------------

![Vertex AI training page](Images%20for%20PRESENTATION/VertexTrainingPage.png)

----------------------------------------------------------------------

![AutoML tabular training on the regression data](Images%20for%20PRESENTATION/TabluarTrainingRegressionData.png)

--------------------------------------------------

# References

1. [UCI Machine Learning Repository: Phishing Websites Dataset](https://archive.ics.uci.edu/dataset/327/phishing+websites)
2. [Kaggle: Flight Price Prediction Dataset](https://www.kaggle.com/datasets/shubhambathwal/flight-price-prediction)
3. [Random Forest Algorithm: Theory and Applications](https://towardsdatascience.com/understanding-random-forest-58381e0602d2)
4. [IPYNB Reference](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/automl-tabular-classification.ipynb)
5. [AIG100 – Project 2: Regression and Classification Methods](https://github.com/jcp-tech/Seneca_Class_Notes/tree/master/Semester%201/AIG100%20-%20Machine%20Learing/Project%202)

# THE END!