# Mathematical and Logical Foundations of Informatics INFO-H 611 (26951) - Fall 2023
## Extra Credit Assignment - Notebook

- Hasaranga Jayathilake

# Setting up spark!

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayesExample").getOrCreate()
sc = spark.sparkContext
sc

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [19]:
import pyspark.sql
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import Row
from pyspark.sql.types import *
import json
from pyspark.sql.functions import col, sum, lit
from math import sqrt
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import LDA
import warnings
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.clustering import *
from pyspark.ml.linalg import Vectors
from pyspark.mllib.tree import DecisionTree
import pandas as pd
from sklearn.model_selection import train_test_split
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.sql import functions as F

 # Load and Prepare Data

## Loading Data

In [3]:
df = spark.read.format("csv").option("header","true").load("/content/drive/MyDrive/Project/ABL_Longline Sablefish Survey_ID_17218.csv")
df.createOrReplaceTempView("ABL_Survey")

q1 = spark.sql("SELECT * FROM ABL_Survey LIMIT 5")
q1.show(truncate=False)

+-------------+-------------+--------------+------------+----------+---------------------+-----+-----------+-------------+----+-----------------------+-------------+-------------+-----------+-----------+--------------+------------+---------------+-------------+---------+------------------+--------------+------------+--------+-------+--------+------+-----------+------------+---------------------------------+--------------------+-----------+-----------------------+----------------+----+------------------+-------------------+-----+----------+-----------------+--------+--------------+-----------------------------------------------+--------------------------------------------------+-------------------+----------------+----------+---------------+---------+---------------+-------------------+
|cruise_number|vessel_number|station_number|species_code|catch_freq|common_name          |hachi|vessel_name|country      |year|haul_date              |start_hemi_ns|start_hemi_ew|end_hemi_ns|end_hemi_ew|

# Dropping columns with almost 50% missing values

In [4]:
# Dropping columns having more than 50% of null values
columns_to_drop = ['gear_temperature','rpn_filter','mammal_sighting','tdr_depth','depredated_freq','non_depredated_freq']  # Replace with your actual column names
df = df.drop(*columns_to_drop)
df = df.na.drop()

# Changing datatype of numerical columns

In [10]:
df = df.withColumn("distance_fished", col("distance_fished").cast("float"))
df = df.withColumn("catch_freq", col("catch_freq").cast("float"))
df = df.withColumn("hachi", col("hachi").cast("float"))
df = df.withColumn("year", col("year").cast("float"))
df = df.withColumn("soak_time", col("soak_time").cast("float"))
df = df.withColumn("starting_depth", col("starting_depth").cast("float"))
df = df.withColumn("ending_depth", col("ending_depth").cast("float"))
df = df.withColumn("surface_temperature", col("surface_temperature").cast("float"))

In [27]:
# Define features
feature_columns = ["distance_fished", "hachi", "year", "soak_time", "starting_depth",
                   "ending_depth", "surface_temperature", "catch_freq"]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Define Naive Bayes classifier
nb = NaiveBayes(labelCol="catch_freq", featuresCol="features")

# Create a pipeline
pipeline = Pipeline(stages=[assembler, nb])

# Split the data into training and testing sets (adjust the ratio as needed)
(training_data, testing_data) = df.randomSplit([0.8, 0.2], seed=1234)


# Train and Evaluate the Model

In [12]:
# Define the label columns
label_columns = ["distance_fished", "hachi", "year", "soak_time", "starting_depth", "ending_depth", "surface_temperature", "catch_freq"]

# Combine the label columns into a single column named "label" in the testing_data
testing_data = testing_data.withColumn("label", col(label_columns[0]).cast("double"))
for label_col in label_columns[1:]:
    testing_data = testing_data.withColumn("label", testing_data["label"] + col(label_col).cast("double"))

# Make predictions on the testing set
predictions = model.transform(testing_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy}")


Accuracy: 1.9622078763024155e-05


In [13]:
testing_data.select("label").distinct().show()


+------------------+
|             label|
+------------------+
| 2923.699999809265|
|3296.7000002861023|
|3731.1000003814697|
|3796.4000005722046|
| 3940.300000190735|
|            3901.0|
|            3597.0|
| 3253.300000190735|
| 2928.000000476837|
|3108.7000007629395|
|3068.7000007629395|
|            2862.0|
|            2734.0|
|3284.9000005722046|
| 3604.000000476837|
| 3103.699999809265|
| 4529.700000286102|
|3468.5999999046326|
|3725.2000007629395|
|3857.4000005722046|
+------------------+
only showing top 20 rows



In [14]:
predictions.select("prediction").distinct().show()


+----------+
|prediction|
+----------+
|     596.0|
|    5983.0|
|     692.0|
|    3901.0|
|    5858.0|
|    6433.0|
|    2862.0|
|    2815.0|
|    6506.0|
|    3739.0|
|    5136.0|
|    1765.0|
|    5607.0|
|    2638.0|
|     160.0|
|    5545.0|
|    3882.0|
|    4214.0|
|    6138.0|
|    3108.0|
+----------+
only showing top 20 rows

