# Project Title

## Table of Contents
1. [Set-Up](#1)
2. [EDA](#2)
3. [Section 3](#3)
4. [Section 4](#4)
5. [Conclusion](#5)

__Introduction__  
This project focuses on ...

__Datasets Used__  
- [Dataset 1](URL)

# 1. Set-Up <a id = '1'></a>

In [25]:
import tarfile
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger
from scipy.io import arff
from pyspark.sql import SparkSession

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas/Spark; consider narrowing it to specific categories.
warnings.simplefilter("ignore")

# Let wide frames display up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100

In [26]:
# Create (or reuse) the notebook's Spark session, running in local mode.

# The same JVM flag is handed to both the driver and the executors.
# NOTE(review): presumably needed for Arrow/Netty on newer JVMs; confirm.
_netty_reflection_flag = "-Dio.netty.tryReflectionSetAccessible=true"

# Settings are applied to the builder in the order listed here.
_spark_settings = {
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    # Arrow-backed pandas <-> Spark conversion, with fallback enabled.
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.sql.execution.arrow.pyspark.fallback.enabled": "true",
    "spark.driver.extraJavaOptions": _netty_reflection_flag,
    "spark.executor.extraJavaOptions": _netty_reflection_flag,
}

_session_builder = SparkSession.builder.appName("AnomalyDetection").master("local[*]")
for _setting_name, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_name, _setting_value)

spark = _session_builder.getOrCreate()

# Keep Spark's console logging at WARN and above to reduce verbosity.
spark.sparkContext.setLogLevel("WARN")

# 2. EDA <a id = '2'></a> 

In [28]:
# Read the bank-marketing ARFF file; loadarff returns (records, metadata).
# NOTE(review): the file name is spelled "ful" (single "l"). The recorded
# output shows the load succeeding, so it presumably matches the file on
# disk; confirm before renaming anything.
data, meta = arff.loadarff("../data/bank-additional-ful-nominal.arff")
df = pd.DataFrame(data)

# Decode bytes values to str, column by column, leaving other values as-is.
# Series.map is used instead of DataFrame.applymap because applymap is
# deprecated since pandas 2.1; Series.map works on old and new pandas.
df = df.apply(
    lambda col: col.map(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)
)

# Examine metadata (attribute names, types and nominal ranges)
print(meta)

# Convert to a Spark DataFrame and expose it to Spark SQL as the BANK view
bank_df = spark.createDataFrame(df)
bank_df.createOrReplaceTempView("BANK")

# Sanity check: inferred schema plus the first five rows
bank_df.printSchema()
spark.sql("SELECT * FROM BANK LIMIT 5").show()

Dataset: bank-additional-full-weka.filters.unsupervised.attribute.RemoveType-Tnumeric
	job's type is nominal, range is ('housemaid', 'services', 'admin.', 'blue-collar', 'technician', 'retired', 'management', 'unemployed', 'self-employed', 'unknown', 'entrepreneur', 'student')
	marital's type is nominal, range is ('married', 'single', 'divorced', 'unknown')
	education's type is nominal, range is ('basic.4y', 'high.school', 'basic.6y', 'basic.9y', 'professional.course', 'unknown', 'university.degree', 'illiterate')
	default's type is nominal, range is ('no', 'unknown', 'yes')
	housing's type is nominal, range is ('no', 'yes', 'unknown')
	loan's type is nominal, range is ('no', 'yes', 'unknown')
	contact's type is nominal, range is ('telephone', 'cellular')
	month's type is nominal, range is ('may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'mar', 'apr', 'sep')
	day_of_week's type is nominal, range is ('mon', 'tue', 'wed', 'thu', 'fri')
	poutcome's type is nominal, range is ('nonexistent

In [29]:
# Read the census-income ARFF file; loadarff returns (records, metadata).
data, meta = arff.loadarff("../data/census-income-full-nominal.arff")
df = pd.DataFrame(data)

# Decode bytes values to str, column by column, leaving other values as-is.
# Series.map is used instead of DataFrame.applymap because applymap is
# deprecated since pandas 2.1; Series.map works on old and new pandas.
df = df.apply(
    lambda col: col.map(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)
)

# Examine metadata (attribute names, types and nominal ranges)
print(meta)

# Convert to a Spark DataFrame and expose it to Spark SQL as the CENSUS view
census_df = spark.createDataFrame(df)
census_df.createOrReplaceTempView("CENSUS")

# Sanity check: inferred schema plus the first five rows
census_df.printSchema()
spark.sql("SELECT * FROM CENSUS LIMIT 5").show()

Dataset: 'census-income-full-weka.filters.unsupervised.attribute.Remove-R43-45-weka.filters.unsupervised.attribute.Remove-R25-weka.filters.unsupervised.attribute.Normalize-S1.0-T0.0-weka.filters.unsupervised.attribute.Remove-R1,6,17-19,30,39-weka.filters.unsupervised.instance.Randomize-S42'
	att2's type is nominal, range is ('Not-in-universe', 'Private', 'Local-government', 'Self-employed-not-incorporated', 'Federal-government', 'Self-employed-incorporated', 'State-government', 'Never-worked', 'Without-pay')
	att3's type is nominal, range is ('4', '0', '40', '34', '43', '37', '24', '39', '12', '35', '45', '3', '19', '29', '32', '48', '33', '23', '44', '36', '31', '30', '41', '5', '11', '9', '42', '6', '18', '50', '2', '1', '26', '47', '16', '14', '22', '17', '7', '8', '25', '46', '27', '15', '13', '49', '38', '21', '28', '20', '51', '10')
	att4's type is nominal, range is ('34', '0', '10', '3', '40', '26', '37', '31', '12', '36', '41', '22', '2', '35', '25', '23', '42', '8', '19', '29'

25/12/28 17:55:46 WARN TaskSetManager: Stage 16 contains a task of very large size (5459 KiB). The maximum recommended task size is 1000 KiB.


In [30]:
# Read the covertype ARFF file; loadarff returns (records, metadata).
data, meta = arff.loadarff("../data/covertype_nominal_4vs123567.arff")
df = pd.DataFrame(data)

# Decode bytes values to str, column by column, leaving other values as-is.
# Series.map is used instead of DataFrame.applymap because applymap is
# deprecated since pandas 2.1; Series.map works on old and new pandas.
df = df.apply(
    lambda col: col.map(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)
)

# Examine metadata (attribute names, types and nominal ranges)
print(meta)

# Convert to a Spark DataFrame and expose it to Spark SQL as the COVERTYPE view
covertype_df = spark.createDataFrame(df)
covertype_df.createOrReplaceTempView("COVERTYPE")

# Sanity check: inferred schema plus the first five rows
covertype_df.printSchema()
spark.sql("SELECT * FROM COVERTYPE LIMIT 5").show()

Dataset: 'CoverType_nominal'
	Dim_11's type is nominal, range is ('1', '0')
	Dim_12's type is nominal, range is ('1', '0')
	Dim_13's type is nominal, range is ('1', '0')
	Dim_14's type is nominal, range is ('1', '0')
	Dim_15's type is nominal, range is ('1', '0')
	Dim_16's type is nominal, range is ('1', '0')
	Dim_17's type is nominal, range is ('1', '0')
	Dim_18's type is nominal, range is ('1', '0')
	Dim_19's type is nominal, range is ('1', '0')
	Dim_20's type is nominal, range is ('1', '0')
	Dim_21's type is nominal, range is ('1', '0')
	Dim_22's type is nominal, range is ('1', '0')
	Dim_23's type is nominal, range is ('1', '0')
	Dim_24's type is nominal, range is ('1', '0')
	Dim_25's type is nominal, range is ('1', '0')
	Dim_26's type is nominal, range is ('1', '0')
	Dim_27's type is nominal, range is ('1', '0')
	Dim_28's type is nominal, range is ('1', '0')
	Dim_29's type is nominal, range is ('1', '0')
	Dim_30's type is nominal, range is ('1', '0')
	Dim_31's type is nominal, rang

25/12/28 17:56:52 WARN TaskSetManager: Stage 17 contains a task of very large size (2209 KiB). The maximum recommended task size is 1000 KiB.
