# Storage Solutions for Big Data - CA1


The assessment CA 1 by **Yulianna Tsaruk** \
Programme Title: Higher Diploma in Science in AI Applications \
Module Title: Storage Solutions for Big Data




## Code contents:
1. **[Exploratory Data Analysis & Processing](./1_processing.ipynb)**
2. **Training model and Usage Example (this file)**

## Introduction

For this project I'm using HDFS (Hadoop Distributed File System) as the primary storage system and Apache Spark for processing, accessed through PySpark - the Python interface for Apache Spark.

In this file I will load the database (the dataset previously processed in _file 1_), train a model, and use it for prediction on data provided by the user through widgets.

## Training ML model with Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator


import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time
import datetime

%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")



In [2]:
# HDFS folder that holds the processed dataset; the parquet directory name
# "db" is appended to it by plain string concatenation when the data is loaded
dataset_path_hdfs = '/user1/dataset/' # must end with /

In [3]:
# Build (or reuse) the Spark session.
# NOTE: the captured output below shows an existing session being reused, in
# which case only runtime SQL configurations take effect.
spark_conf = {
    # hardware-related configs, remove them if not needed for your machine
    "spark.driver.memory": "8g",
    "spark.executor.memory": "6g",
    "spark.dynamicAllocation.enabled": "true",
    "spark.network.timeout": "600s",
    "spark.executor.heartbeatInterval": "120s",
    # to output more
    "spark.sql.debug.maxToStringFields": 100,
}
session_builder = SparkSession.builder.appName("ML training")
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()


# Load the processed dataset written by the first notebook
df = spark.read.parquet(dataset_path_hdfs + "db")

# Distinct values that populate the dropdown widgets
room_types_pdf = df.select('room_type').distinct().toPandas()
property_types = room_types_pdf['room_type'].to_list()

neighbourhoods_pdf = df.select('neighbourhood_cleansed').distinct().toPandas()
neighbourhoods = neighbourhoods_pdf.sort_values('neighbourhood_cleansed')['neighbourhood_cleansed'].to_list()

24/04/27 17:18:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [4]:
# Column names grouped by how ML.make_pipeline preprocesses them
columns = {
    # each is wrapped into a one-element vector and standard-scaled
    "continuous": ["price", "date_unix"],
    # each is converted to a numeric index with StringIndexer
    "categorical": ["room_type", "neighbourhood_cleansed"],
    # passed to the final VectorAssembler unchanged
    "boolean": ["instant_bookable", "host_identity_verified", "weekends", "holiday"],
    # label column for the classifier and the evaluator
    "target": "available"
}

In [5]:
# Define transformers and pipeline
class ML():
    """Random-forest classification pipeline built from Spark ML stages.

    Typical use: ``make_pipeline`` -> ``train`` -> ``pred_eval`` and/or
    ``find_and_plot_FI``. After training, the fitted PipelineModel is
    available as ``self.model``.
    """

    def __init__(self):
        # Final VectorAssembler; kept so feature names can be recovered later
        self.assembler_last = None
        # Unfitted Pipeline, set by make_pipeline
        self.pipeline = None
        # Fitted PipelineModel, set by train
        self.model = None
        # BinaryClassificationEvaluator bound to the target column
        self.Evaluator = None

    def make_pipeline(self, columns: dict, maxBins=100):
        """Build the preprocessing + classifier pipeline.

        Args:
            columns: mapping with the keys "continuous", "categorical" and
                "boolean" (lists of column names) and "target" (label column
                name).
            maxBins: forwarded to RandomForestClassifier; Spark requires it
                to be at least the number of categories of any categorical
                feature.
        """
        continuous = columns["continuous"]
        categorical = columns["categorical"]

        # Categorical: string -> numeric index
        indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_indexed")
                    for col in categorical]
        # Continuous: StandardScaler works on vector columns, so each
        # continuous column is first wrapped into a one-element vector
        vector_assemblers = [VectorAssembler(inputCols=[col], outputCol=f"{col}_vec")
                             for col in continuous]
        # Scaler: zero mean, unit standard deviation
        scalers = [StandardScaler(inputCol=f"{col}_vec", outputCol=f"scaled_{col}",
                                  withStd=True, withMean=True)
                   for col in continuous]
        # Assemble everything into one feature vector
        self.assembler_last = VectorAssembler(
            inputCols=[f"scaled_{col}" for col in continuous]
            + columns["boolean"]
            + [f"{col}_indexed" for col in categorical],
            outputCol="all_features")
        # Define random forest classifier (seeded for reproducibility)
        classifier = RandomForestClassifier(labelCol=columns["target"],
                                            featuresCol="all_features",
                                            maxBins=maxBins,
                                            seed=42)
        # Define Evaluator for later
        self.Evaluator = BinaryClassificationEvaluator(labelCol=columns["target"])
        # Define pipeline
        self.pipeline = Pipeline(
            stages=indexers + vector_assemblers + scalers + [self.assembler_last, classifier])

    def train(self, data):
        """Fit every pipeline stage on ``data`` and store the fitted model."""
        self.model = self.pipeline.fit(data)

    def pred_eval(self, test_data):
        """Predict on ``test_data`` and print accuracy and area under ROC.

        BinaryClassificationEvaluator only provides ranking metrics (its
        default is areaUnderROC), so accuracy is computed separately with a
        MulticlassClassificationEvaluator on the same label column.
        """
        # Local import keeps this class self-contained
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        # Make predictions
        predictions = self.model.transform(test_data)
        # Evaluate the model
        accuracy_evaluator = MulticlassClassificationEvaluator(
            labelCol=self.Evaluator.getLabelCol(),
            predictionCol="prediction",
            metricName="accuracy")
        accuracy = accuracy_evaluator.evaluate(predictions)
        auc = self.Evaluator.evaluate(predictions, {self.Evaluator.metricName: 'areaUnderROC'})
        print(f"Accuracy: {accuracy*100:.2f}%, AUC: {auc:.2f}")

    def find_and_plot_FI(self):
        """Plot the classifier's feature importances and return them.

        Returns:
            pandas DataFrame with the columns 'feature' and 'importance'
            (in percent), sorted by descending importance.
        """
        # Pair each assembler input column with the importance at the same
        # position in the classifier's importance vector
        feature_names = self.assembler_last.getInputCols()
        importances = self.model.stages[-1].featureImportances.toArray()
        imp = pd.DataFrame(list(zip(feature_names, importances)),
                           columns=['feature', 'importance'])
        imp = imp.sort_values('importance', ascending=False).reset_index(drop=True)
        imp['importance'] = imp['importance'] * 100  # fraction -> percent
        # Plot
        plt.figure(figsize=(8, 6))
        ax = sns.barplot(x="importance", y="feature", hue="feature", data=imp, legend=False)
        plt.title('Feature importance of model', fontweight="bold", fontsize=14)
        # Annotate the value next to the end of each bar
        for container in ax.containers:
            for rect in container.get_children():
                width = rect.get_width()
                x_loc = width + 0.1
                y_loc = rect.get_y() + rect.get_height() / 2
                label = f"{width:.1f}%"  # Format value with one decimal place
                ax.text(x_loc, y_loc, label, ha='left', va='center', fontsize=12)
        plt.show()
        return imp


### First Model

In [6]:
# Create a pipeline wrapper and build its stages from the column spec;
# maxBins is forwarded to the random forest classifier
pl = ML()
pl.make_pipeline(columns, maxBins=50)

In [7]:
# Split the data into training (70%) and testing (30%) sets, seeded for reproducibility
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)
# NOTE(review): no cache()/persist() call on df is visible in this notebook,
# so this is presumably a no-op whose return value is echoed below — confirm
df.unpersist() # clean memory

DataFrame[neighbourhood_cleansed: string, room_type: string, host_identity_verified: boolean, instant_bookable: boolean, available: int, price: float, date_unix: bigint, weekends: boolean, holiday: boolean]

In [None]:
# Train the model: fits every pipeline stage on the training split
pl.train(train_data)

24/04/27 17:20:34 WARN MemoryStore: Not enough space to cache rdd_78_0 in memory! (computed 99.9 MiB so far)
24/04/27 17:20:34 WARN BlockManager: Persisting block rdd_78_0 to disk instead.
24/04/27 17:20:41 WARN MemoryStore: Not enough space to cache rdd_78_1 in memory! (computed 236.8 MiB so far)
24/04/27 17:20:41 WARN BlockManager: Persisting block rdd_78_1 to disk instead.
24/04/27 17:20:50 WARN MemoryStore: Not enough space to cache rdd_78_0 in memory! (computed 149.9 MiB so far)
24/04/27 17:20:51 WARN MemoryStore: Not enough space to cache rdd_78_1 in memory! (computed 236.8 MiB so far)
24/04/27 17:20:57 WARN MemoryStore: Not enough space to cache rdd_78_0 in memory! (computed 149.9 MiB so far)
24/04/27 17:20:57 WARN MemoryStore: Not enough space to cache rdd_78_1 in memory! (computed 236.8 MiB so far)
24/04/27 17:21:03 WARN MemoryStore: Not enough space to cache rdd_78_0 in memory! (computed 149.9 MiB so far)
24/04/27 17:21:04 WARN MemoryStore: Not enough space to cache rdd_78_1 

In [None]:
# Evaluate on the held-out split; the metrics are printed, nothing is returned
pl.pred_eval(test_data)

In [None]:
# Plot feature importance; the returned DataFrame is echoed as the cell output
pl.find_and_plot_FI()

### Second model - Retrain
Let's reduce dimensionality by getting rid of features with importance less than 2% and re-train the model. This will require fewer preprocessing steps.

In [None]:
# Features being removed before the second model is trained
drop_col = ['weekends','holiday']

# Apply the same column removal to both splits
train_data, test_data = (split.drop(*drop_col) for split in (train_data, test_data))

In [None]:
# Free memory: drop the reference to the first pipeline wrapper and its fitted model
del pl

In [None]:
# Column spec for the second model: the "boolean" group no longer lists
# the dropped 'weekends' and 'holiday' columns
columns = dict(
    continuous=["price", "date_unix"],
    categorical=["room_type", "neighbourhood_cleansed"],
    boolean=["instant_bookable", "host_identity_verified"],
    target="available",
)

# Build, fit and evaluate a fresh pipeline on the reduced splits
pl = ML()
pl.make_pipeline(columns, maxBins=55)
pl.train(train_data)
pl.pred_eval(test_data)
pl.find_and_plot_FI()

Metrics are slightly improved, and _room_type_ feature gained more importance.

In [None]:
# Keep a module-level reference to the fitted pipeline model;
# make_prediction() reads this global
model = pl.model

## Usage Example of Model trained on Spark

In [None]:
# --- Input widgets; their values are read when the Submit button is clicked ---

# Date picker, pre-filled with today's date
date_picker = widgets.DatePicker(
    description='Pick a Date',
    disabled=False,
    value=datetime.date.today())
# Property type dropdown, populated from the distinct room_type values in the dataset
property_type_dropdown = widgets.Dropdown(
    options=property_types,
    description='Property Type:',
    disabled=False,)
# Neighborhood dropdown, populated from the sorted distinct neighbourhood values
neighborhood_dropdown = widgets.Dropdown(
    options=neighbourhoods,
    description='Neighborhood:',
    disabled=False,)
# Price per night
price_input = widgets.FloatText(
    description='Price per night (in Yen):',
    value=10000.0,
    step=1.0,
    continuous_update=False)
# Instant bookable; 'Yes'/'No' is converted to a boolean on submit
instant_bookable_toggle = widgets.ToggleButtons(
    options=['Yes', 'No'],
    description='Instant Bookable:',
    disabled=False)
# Verified host; 'Yes'/'No' is converted to a boolean on submit
verified_host_toggle = widgets.ToggleButtons(
    options=['Yes', 'No'],
    description='Verified Host:',
    disabled=False)
# Progress bar shown while a prediction is running
progress = widgets.IntProgress(
    value=0,
    min=0,
    max=10,
    description='Predicting...\n',
    bar_style='info',
    orientation='horizontal')

# Hidden until make_prediction() starts
progress.layout.visibility = 'hidden'
# Output area that receives the prediction result
output_w = widgets.Output()

New_DF = None # global variable to store new data from user

# to display all widgets
def display_widgets():
    """Render the six input widgets in form order."""
    input_widgets = (
        date_picker,
        property_type_dropdown,
        neighborhood_dropdown,
        price_input,
        instant_bookable_toggle,
        verified_host_toggle,
    )
    display(*input_widgets)

# to build the HTML summary of the submitted values
def _input_summary_html():
    """Return an HTML list echoing the values currently held by the input widgets."""
    return f"""
        <p style="line-height: 0">Data used:</p>
        <ul>
        <li>Date: <strong>{date_picker.value}</strong></li>
        <li>Price per night: <strong>{price_input.value} ¥</strong></li>
        <li>Property type: <strong>{property_type_dropdown.value}</strong></li>
        <li>Neighbourhood: <strong>{neighborhood_dropdown.value}</strong></li>
        <li>Instant bookable: <strong>{instant_bookable_toggle.value}</strong></li>
        <li>Verified host: <strong>{verified_host_toggle.value}</strong></li>
        </ul>
        """


# to predict and output result
def make_prediction():
    """Run the trained model on the submitted row and show the result.

    Reads the globals ``New_DF`` (single-row pandas DataFrame), ``spark`` and
    ``model``; writes the result into ``output_w`` and drives ``progress``.
    """
    output_w.clear_output(wait=True)
    if New_DF is None:
        # Nothing has been submitted yet, so there is no row to score
        with output_w:
            print("No input data submitted yet.")
        return

    progress.layout.visibility = 'visible'
    progress.value = 0

    try:
        with output_w:
            progress.value = 1
            # define spark DF
            user_sdf = spark.createDataFrame(New_DF)
            # Make predictions on new data
            predictions = model.transform(user_sdf)
            progress.value = 4
            # Fetch both columns of the single result row in one Spark job
            row = predictions.select("prediction", "probability").first()
            result = row["prediction"]
            # Probability of the predicted class
            proba = row["probability"][int(result)]
            progress.value = 5
            input_data = _input_summary_html()
            # NOTE(review): the label column is named "available", so a
            # positive prediction presumably means "available" rather than
            # "occupied" — confirm the encoding used in the processing notebook
            if result > 0:
                string = f'<h3>Your property will be occupied with {proba*100:.1f}% confidence.</h3>'
            else:
                string = f'<h3>Your property will be available with {(proba*100):.1f}% confidence.</h3>'
            progress.value = 8
            progress.layout.visibility = 'hidden'
            # Display the result
            display(widgets.HTML(value=string + input_data))
    finally:
        # Never leave the progress bar on screen, even when prediction failed
        progress.layout.visibility = 'hidden'


def on_submit_clicked(b):
    """Handle a click on the Submit button.

    Collects the widget values into the global single-row DataFrame
    ``New_DF``, closes the form and triggers the prediction.

    Args:
        b: the button instance that was clicked; it is closed with the form.
    """
    global New_DF
    picked_date = date_picker.value
    if picked_date is None:
        # The DatePicker value is None when the field is cleared; keep the
        # form open so the user can correct the input
        output_w.clear_output(wait=True)
        with output_w:
            print("Please pick a date before submitting.")
        return

    # change time to unix timestamp
    # NOTE(review): time.mktime interprets the date in the local timezone —
    # confirm this matches how date_unix was derived in the processing notebook
    unix_timestamp = int(time.mktime(picked_date.timetuple()))
    data = {
        'date_unix': [unix_timestamp],
        'price': [price_input.value],
        'room_type': [property_type_dropdown.value],
        'neighbourhood_cleansed': [neighborhood_dropdown.value],
        'instant_bookable': [instant_bookable_toggle.value == 'Yes'],
        'host_identity_verified': [verified_host_toggle.value == 'Yes']
    }
    New_DF = pd.DataFrame(data)
    # hide widgets
    form_widgets = (
        date_picker,
        property_type_dropdown,
        price_input,
        neighborhood_dropdown,
        instant_bookable_toggle,
        verified_host_toggle,
        b,
    )
    for widget in form_widgets:
        widget.close()
    make_prediction()

# Show the input form, then the Submit button, the (initially hidden)
# progress bar and the output area that receives the prediction
display_widgets()
submit_button = widgets.Button(description="Submit")
# on_submit_clicked receives the clicked button as its argument
submit_button.on_click(on_submit_clicked)
display(submit_button, progress, output_w)

In [None]:
# Terminate spark session; `spark` can no longer be used for predictions after this
spark.stop()