# AIG130 – Lab 5: AutoML

## Initialising Libraries and Functions

In [None]:
!pip install numpy pandas matplotlib seaborn scikit-learn kagglehub ucimlrepo google-cloud-aiplatform

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
from dotenv import load_dotenv # Used for Environment Variables
# import scipy.stats as stats
import seaborn as sns
import pandas as pd
import numpy as np
import sys, os
# import math

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the dataset & Ignore warnings
from ucimlrepo import fetch_ucirepo # , list_available_datasets
import kagglehub
import warnings

from google.cloud.aiplatform.v1.schema.trainingjob.definition_v1.types import AutoMlTablesInputs
from google.cloud import aiplatform
from google.cloud import storage


In [None]:
# Suppress all Python warnings for the rest of the session.
# Note: this also hides deprecation notices raised by the plotting and ML libraries.
warnings.filterwarnings('ignore')

In [None]:
# Load the environment variables from the .env file and stop the notebook early
# when the file cannot be loaded or the required API_KEY variable is missing.
try:
    dotenv_loaded = load_dotenv()  # False when no .env file / no variables were found
except Exception as e:
    print(f"Error loading .env file: {e}")
    sys.exit("Stopping execution as environment variables are required")

if not dotenv_loaded:
    sys.exit("Stopping execution as environment variables are required")
if os.getenv("API_KEY") is None:
    sys.exit("Stopping execution as required environment variables are not set")

# Only reached when every check above passed, so the message is never printed
# on a failure path (a `finally` clause would run even while exiting).
print("Environment variables loaded successfully!")

In [None]:
# Location of the Google Cloud service-account key.
# The original local path stays the first choice; when that file is absent
# (e.g. on another machine) the GC_CREDS environment variable is used if set.
# Previously used alternative: r'I:\Work\MSS-Automation\Connectors\serviceAccountKey.json'
_default_service_account_file = r"C:\Users\JonathanChackoPattas\OneDrive - Maritime Support Solutions\Desktop\Class Notes\Seneca\Semester 1\AIG130 - Cloud Computing for Machine Learning\Lab 5\ignore\serviceAccountKey.json"
service_account_file = (
    _default_service_account_file
    if os.path.exists(_default_service_account_file)
    else (os.getenv("GC_CREDS") or _default_service_account_file)
)

def set_gc_credentials(service_account_file):
    """Point the Google Cloud client libraries at a service-account key file.

    Sets the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable of the
    current process to ``service_account_file``.

    Args:
        service_account_file: Path (str or path-like) to the service-account
            JSON key.

    Raises:
        FileNotFoundError: If ``service_account_file`` is empty or does not
            point to an existing file, so a wrong path is reported here
            instead of by the first Google Cloud client call.
    """
    if not service_account_file or not os.path.isfile(service_account_file):
        raise FileNotFoundError(
            f"Service account key file not found: {service_account_file!r}"
        )
    # os.environ only accepts strings; fspath also allows pathlib.Path inputs.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.fspath(service_account_file)

# Export the key path via GOOGLE_APPLICATION_CREDENTIALS before any Google Cloud client is created below.
set_gc_credentials(service_account_file)

In [43]:
# Google Cloud settings, taken from the process environment (each is None when unset).
PROJECT_ID = os.environ.get("PROJECT_ID")  # @param {type:"string"}
LOCATION = os.environ.get("LOCATION")  # @param {type:"string"}
BUCKET_NAME = os.environ.get("BUCKET_NAME")  # @param {type:"string"}

In [None]:
# Initialise the Vertex AI SDK with the configured project and location.
aiplatform.init(project=PROJECT_ID, location=LOCATION)
# Create a Cloud Storage client and a handle to the configured bucket.
# Nothing is uploaded here; the CSV upload happens inside gcp_process_data.
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

In [None]:
def gcp_process_data(df, target, bucket, proccess_type="classification"):
    """Train, deploy and query a Vertex AI AutoML tabular model for ``df``.

    The function splits ``df`` 80/20, writes the training part to a local CSV,
    uploads that CSV to ``bucket``, creates a Vertex AI tabular dataset from
    it, runs an AutoML tabular training job, deploys the resulting model to an
    endpoint and requests online predictions for the held-out rows.

    Args:
        df: pandas DataFrame holding the feature columns and the target column.
        target: Name of the column to predict.
        bucket: Cloud Storage bucket object that receives the training CSV.
        proccess_type: Passed to AutoML as ``optimization_prediction_type``
            ("classification" or "regression") and used to name the CSV file
            and the dataset.

    Returns:
        The object returned by ``endpoint.predict`` for the held-out rows.
        (Previously the predictions were only printed and ``None`` was
        returned, so callers that stored the result received nothing.)
    """
    dataset_name = f"{proccess_type}-dataset"
    csv_name = f"{proccess_type}_data.csv"

    # Hold out 20% of the rows; only the training part is uploaded to Vertex AI.
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

    # Write the training rows locally, then upload the file to Cloud Storage.
    train_data.to_csv(csv_name, index=False)
    bucket.blob(csv_name).upload_from_filename(csv_name)
    gcs_uri = f"gs://{bucket.name}/{csv_name}"

    # Register the uploaded CSV as a Vertex AI tabular dataset.
    dataset = aiplatform.TabularDataset.create(
        display_name=dataset_name,
        gcs_source=[gcs_uri],
    )

    job = aiplatform.AutoMLTabularTrainingJob(
        display_name="automl-tabular-model",
        optimization_prediction_type=proccess_type,
    )

    # Uses the service's default train/validation/test split of the uploaded data.
    model = job.run(
        dataset=dataset,
        target_column=target,
        model_display_name="adopted-prediction-model",
    )

    # Deploy the model so it can serve online predictions.
    endpoint = model.deploy(
        machine_type="n1-standard-4",
    )

    # Build the request from a copy without the target column; the caller's
    # frame and the split slice are left untouched (no in-place drop).
    test_features = test_data.drop(columns=[target])
    predictions = endpoint.predict(instances=test_features.to_dict(orient="records"))

    # NOTE: the training job, model and endpoint are intentionally left in
    # place. Call endpoint.delete(), model.delete() and job.delete() manually
    # once the results are no longer needed.
    return predictions

--------------------------------------------------

# Regression Task
**Dataset**: Flight Price Prediction dataset

**Source**: [Kaggle](https://www.kaggle.com/datasets/shubhambathwal/flight-price-prediction)

**Objective**: To predict the price of airline tickets based on various features such as airline, flight details, source and destination cities, departure and arrival times, number of stops, class, duration, and days left before the flight. This regression model will help travelers anticipate flight costs and potentially make more economical travel decisions.

## Regression Dataset

In [None]:
# Load the flight-price dataset directly into pandas; if that fails, download
# the dataset files and read the CSV from the local copy instead.
try:
    df_regression = kagglehub.load_dataset(
        kagglehub.KaggleDatasetAdapter.PANDAS,
        "shubhambathwal/flight-price-prediction",
        "Clean_Dataset.csv",
    )
except Exception as e:
    print(f"Error loading dataset: {e}")
    path = kagglehub.dataset_download("shubhambathwal/flight-price-prediction") # Download latest version
    df_regression = pd.read_csv(os.path.join(path, "Clean_Dataset.csv"))

# Post-processing runs only after a successful load. Inside `finally` it would
# also run when both attempts failed and hide the real error behind a NameError.
df_regression = df_regression.drop(columns=['Unnamed: 0']) # Drop the unnecessary index column
X = df_regression.drop(columns=['price']) # Features (without the stray index column)
y = df_regression['price'] # Target variable
df_regression

## Understanding the Data

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns as well as numeric ones
df_regression.describe(include='all')

In [None]:
# Size of the dataset as a (rows, columns) tuple
df_regression.shape # (rows, columns)

In [None]:
# Prints a per-column summary: column name, non-null count and data type
df_regression.info() # df_regression.dtypes

In [None]:
# Total number of missing cells across the whole frame (the author observed none)
print("Missing values in the dataset:")
int(df_regression.isna().to_numpy().sum())

In [None]:
# Distribution of the target variable `price`, with a KDE curve overlaid
plt.figure(figsize=(8, 6)) # (20, 15)
sns.histplot(df_regression['price'], kde=True)
plt.title('Price Distribution of Flight Tickets')  # the dataset holds flight prices, not house prices
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Bar charts of the most frequent values of every categorical column, two charts per row
categorical_cols = df_regression.select_dtypes(include='object').columns
n_cols = len(categorical_cols)
n_rows = (n_cols + 1) // 2  # Number of rows needed (2 charts per row, rounded up)
max_categories = 8  # Only the most frequent categories are drawn to keep the x-axis readable
colors = sns.color_palette("Set2", max_categories)
fig, axes = plt.subplots(n_rows, 2, figsize=(15, 4*n_rows), squeeze=False)
for ax, col in zip(axes.ravel(), categorical_cols):
    data = df_regression[col].value_counts().head(max_categories)
    # One colour per bar; slicing keeps the colour list the same length as the bars
    ax.bar(data.index.astype(str), data.values, color=colors[:len(data)])
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', rotation=45)
# Hide the unused grid cell when the number of categorical columns is odd
for ax in axes.ravel()[n_cols:]:
    ax.set_visible(False)
fig.tight_layout()  # computed once, after all charts exist
plt.show()

In [None]:
# Lower-triangle scatter-plot matrix of the numerical columns
numerical_cols = df_regression.select_dtypes(include='number').columns
plt.figure(figsize=(12, 12))
n = len(numerical_cols)
colors = sns.color_palette("husl", n*n)
cidx = 0  # index of the next palette colour; advances once per drawn panel
for i in range(n):
    for j in range(i+1):  # only panels on or below the diagonal
        plt.subplot(n, n, i*n + j + 1)
        plt.scatter(
            df_regression[numerical_cols[j]], df_regression[numerical_cols[i]],
            # `color=` sets one colour for all points; an RGB tuple passed via
            # `c=` is ambiguous with per-point value mapping.
            alpha=0.5, color=colors[cidx], s=10
        )
        cidx += 1
        if i == n-1:  # label the x-axis on the bottom row only
            plt.xlabel(numerical_cols[j])
        if j == 0:  # label the y-axis on the first column only
            plt.ylabel(numerical_cols[i])
plt.suptitle('Scatter Plot Matrix of Numerical Features')
plt.tight_layout()
plt.show()

## Model Implementation & Evaluation

In [None]:
# Run the AutoML pipeline on the flight-price data with `price` as the target column
prediction_regression = gcp_process_data(df_regression, 'price', bucket, proccess_type="regression")
prediction_regression

--------------------------------------------------

# Classification Task
**Dataset**: Phishing Websites dataset

**Source**: [UCI ML Repository](https://archive.ics.uci.edu/dataset/327/phishing+websites)

**Objective**: To classify websites as either legitimate or phishing based on various URL and website features. This classification model will help improve cybersecurity by automatically identifying potentially malicious websites that attempt to steal sensitive information from users.

## Classification Dataset

In [None]:
# Fetch the Phishing Websites dataset by id; fall back to a lookup by name if that fails.
try:
    phishing_websites = fetch_ucirepo(id=327)
except Exception as e:
    print(f"Error loading dataset: {e}")
    phishing_websites = fetch_ucirepo(name='Phishing Websites') # fetch dataset by name

# Built only after a successful fetch. Inside `finally` this would also run
# when both fetches failed and hide the real error behind a NameError.
X = pd.DataFrame(phishing_websites.data.features) # Features
y = pd.DataFrame(phishing_websites.data.targets) # Target variable
df_classification = pd.concat([X, y], axis=1) # Features and target side by side
df_classification

## Understanding the Data

In [None]:
# Summary statistics for every column of the classification dataset
df_classification.describe(include='all')

In [None]:
# Size of the dataset as a (rows, columns) tuple
df_classification.shape

In [None]:
# Prints a per-column summary: column name, non-null count and data type
df_classification.info() # df_classification.dtypes

In [None]:
# Total number of missing cells across the whole frame (the author observed none)
print("Missing values in the dataset:")
int(df_classification.isna().to_numpy().sum())

In [None]:
# Class balance of the target column `result`
class_dist = df_classification['result'].value_counts()
# Labels and colours are looked up from the class value itself. value_counts()
# orders classes by frequency, so a fixed positional label list could silently
# swap the names. Mapping assumed here: -1 = Phishing, 1 = Legitimate, as in
# the legend of the per-feature count plots - TODO confirm against the dataset docs.
class_names = {1: 'Legitimate', -1: 'Phishing'}
class_colors = {1: 'lightgreen', -1: 'lightcoral'}
plt.figure(figsize=(8, 6))
plt.pie(class_dist,
    labels=[class_names.get(value, str(value)) for value in class_dist.index],
    autopct=lambda pct: f'{pct:.1f}%',  # percentages are computed by matplotlib
    colors=[class_colors.get(value, 'lightgray') for value in class_dist.index])
plt.title('Class Distribution in Phishing Websites Dataset')
plt.show()

In [None]:
# Count plot of every feature, split by the target class `result`
features_to_plot = [ftp for ftp in df_classification.columns if ftp != 'result']
n_cols = 2 if len(features_to_plot) % 2 == 0 else 3
# Ceiling division so the grid always has room for every feature; the previous
# (len + 1) // n_cols under-allocated rows when three columns are used.
n_rows = -(-len(features_to_plot) // n_cols)
plt.figure(figsize=(15, 4*n_rows))
for idx, feature in enumerate(features_to_plot, 1):
    plt.subplot(n_rows, n_cols, idx)
    sns.countplot(data=df_classification, x=feature, hue='result')
    plt.title(f'{feature} Distribution by Class')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    # Legend entries follow the hue order of `result` - presumably -1 then 1; verify if the encoding changes
    plt.legend(title='Result', labels=['Phishing', 'Legitimate'])
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between all columns
correlation_matrix = df_classification.corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(correlation_matrix))
# Display options for the heatmap, collected in one place
heatmap_options = {
    "mask": mask,
    "annot": True,               # write the correlation value into each cell
    "fmt": '.2f',                # two decimal places
    "cmap": 'coolwarm',
    "center": 0,
    "square": True,
    "linewidths": 0.5,
    "cbar_kws": {"shrink": .5},
    "annot_kws": {"size": 8},
}
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', pad=20, size=16)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Model Implementation & Evaluation

In [None]:
# Run the AutoML pipeline on the phishing data with `result` as the target column
prediction_classification = gcp_process_data(df_classification, 'result', bucket, proccess_type="classification")
prediction_classification

--------------------------------------------------

# References

1. [Scikit-learn Column Transformer Documentation](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html)
2. [UCI Machine Learning Repository: Phishing Websites Dataset](https://archive.ics.uci.edu/dataset/327/phishing+websites)
3. [Kaggle: Flight Price Prediction Dataset](https://www.kaggle.com/datasets/shubhambathwal/flight-price-prediction)
4. [Scikit-learn Documentation: Regression Metrics](https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics)
5. [Scikit-learn Documentation: Classification Metrics](https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics)
6. [Random Forest Algorithm: Theory and Applications](https://towardsdatascience.com/understanding-random-forest-58381e0602d2)
7. [Vertex AI Samples: AutoML Tabular Classification Notebook](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/automl/automl-tabular-classification.ipynb)
8. [AIG100 – Project 2: Regression and Classification Methods](https://github.com/jcp-tech/Seneca_Class_Notes/tree/master/Semester%201/AIG100%20-%20Machine%20Learing/Project%202)

# THE END!