In [1]:
import polars as pl
import plotly.express as px

### Loading data

In [2]:
# Use forward slashes so the relative path works on every OS and contains no
# backslash escape sequences ("\s" is an invalid escape in a normal string).
# The path is resolved against the kernel's current working directory.
data = pl.read_csv("data/supervised_dataset.csv")
print(data.shape)
data.head()

FileNotFoundError: The system cannot find the path specified. (os error 3): data\supervised_dataset.csv

### Data Profiling

In [None]:
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max).
data.describe()

statistic,Unnamed: 1_level_0,_id,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,num_sessions,num_users,num_unique_apis,source,classification
str,f64,str,f64,f64,f64,f64,str,f64,f64,f64,str,str
"""count""",1699.0,"""1699""",1695.0,1695.0,1699.0,1699.0,"""1699""",1699.0,1699.0,1699.0,"""1699""","""1699"""
"""null_count""",0.0,"""0""",4.0,4.0,0.0,0.0,"""0""",0.0,0.0,0.0,"""0""","""0"""
"""mean""",849.0,,1.501123,0.173226,61.648982,6028.340789,,564.726898,406.263685,67.246616,,
"""std""",490.60337,,21.697558,0.283641,205.803273,46650.419622,,1179.9312,960.71858,82.189214,,
"""min""",0.0,"""00041830-3168-…",3e-06,0.0012,0.0,1.0,"""datacenter""",2.0,1.0,0.0,"""E""","""normal"""
"""25%""",425.0,,0.000708,0.009205,10.0,63.0,,5.0,1.0,14.0,,
"""50%""",849.0,,0.002574,0.018717,17.095238,195.0,,164.0,141.0,37.0,,
"""75%""",1274.0,,0.024822,0.230769,41.446352,3714.0,,447.0,309.0,90.0,,
"""max""",1698.0,"""ffbf4937-68e6-…",852.92925,1.0,3303.0,1352948.0,"""default""",9299.0,8447.0,524.0,"""F""","""outlier"""


### Missing data

In [None]:
# Show the rows whose inter_api_access_duration(sec) value is null.
data.filter(pl.col("inter_api_access_duration(sec)").is_null())

Unnamed: 0_level_0,_id,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,num_sessions,num_users,num_unique_apis,source,classification
i64,str,f64,f64,f64,i64,str,f64,f64,f64,str,str
1556,"""8e8b99bb-7b6d-…",,,0.0,3,"""datacenter""",2.0,1.0,0.0,"""E""","""outlier"""
1567,"""bedfd600-80ef-…",,,0.0,3,"""datacenter""",4.0,1.0,0.0,"""E""","""outlier"""
1619,"""60a25ad0-add8-…",,,0.0,3,"""datacenter""",2.0,1.0,0.0,"""E""","""outlier"""
1647,"""70b6a9dd-e4c6-…",,,0.0,3,"""datacenter""",4.0,1.0,0.0,"""E""","""outlier"""


**Observation**
* The data is quite clean: only 4 rows have missing values
* The dataset contains an `_id` column, which is a row identifier rather than a feature
* In those 4 rows `sequence_length(count)` is 0, which means no API calls were made

**Action**
* These 4 rows with missing values need to be dropped

### Checking for outliers in the dataset

In [None]:
# The values are passed as a plain list, so Plotly would label the axis "x";
# map that placeholder to the real feature name so the figure stands alone.
fig = px.box(
    x=data["inter_api_access_duration(sec)"].to_list(),
    title="Boxplot of Inter API Access Duration",
    labels={"x": "inter_api_access_duration(sec)"},
)
fig.show()

In [None]:
# Inspect the extreme points: rows with an inter-API access duration above 100 s.
# NOTE(review): the 100 s cut-off is presumably read off the boxplot - confirm.
data.filter(pl.col("inter_api_access_duration(sec)").gt(100))

Unnamed: 0_level_0,_id,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,num_sessions,num_users,num_unique_apis,source,classification
i64,str,f64,f64,f64,i64,str,f64,f64,f64,str,str
1693,"""d8ac0f74-473a-…",159.783857,0.357143,14.0,134219,"""datacenter""",2.0,1.0,5.0,"""F""","""outlier"""
1695,"""44356d09-52e9-…",852.92925,0.5,2.0,102352,"""datacenter""",2.0,1.0,1.0,"""F""","""outlier"""


In [None]:
# The values are passed as a plain list, so Plotly would label the axis "x";
# map that placeholder to the real feature name so the figure stands alone.
fig = px.box(
    x=data["sequence_length(count)"].to_list(),
    title="Boxplot of API Sequence Length",
    labels={"x": "sequence_length(count)"},
)
fig.show()

In [None]:
# Inspect the extreme points: rows with a sequence length above 2000 calls.
# NOTE(review): the 2000 cut-off is presumably read off the boxplot - confirm.
data.filter(pl.col("sequence_length(count)").gt(2000))

Unnamed: 0_level_0,_id,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,num_sessions,num_users,num_unique_apis,source,classification
i64,str,f64,f64,f64,i64,str,f64,f64,f64,str,str
1244,"""2db89946-8e5c-…",0.000315,0.013624,3303.0,63,"""default""",106.0,1.0,45.0,"""E""","""outlier"""
1524,"""95f59845-64ea-…",0.001432,0.019249,2078.0,179,"""default""",87.0,1.0,40.0,"""E""","""outlier"""
1596,"""6267c784-baa1-…",0.003393,0.035161,2389.0,487,"""datacenter""",183.0,1.0,84.0,"""E""","""outlier"""
1602,"""0274e020-cf32-…",0.003323,0.035745,2350.0,469,"""datacenter""",176.0,1.0,84.0,"""E""","""outlier"""
1626,"""e3af9271-2a84-…",0.005391,0.038462,2106.0,682,"""datacenter""",154.0,1.0,81.0,"""E""","""outlier"""
1633,"""83d6afd0-ea25-…",0.003779,0.038164,2201.0,500,"""datacenter""",167.0,1.0,84.0,"""E""","""outlier"""
1666,"""d735b3d8-2d08-…",0.003342,0.036395,2308.0,463,"""datacenter""",170.0,1.0,84.0,"""E""","""outlier"""


**Observation**
* There are two features in the dataset showing some extreme data points, mostly from datacenter
* They are anomalies and are classified as "outlier", so they appear to be valid data points

**Action**
* These outliers will not be dropped

### Data Pre-processing Pipeline
Based on the analysis above, the pre-processing pipeline takes the following steps:
* Remove the `_id` column
* Remove rows that contain null values
* Create a new boolean variable (`is_anomaly`)

In [None]:
def data_cleaning(data: pl.DataFrame, output_path: str) -> None:
    """Clean the raw dataset and write the result to a Parquet file.

    Steps, in order:
      1. Drop the ``_id`` column.
      2. Keep only rows whose ``inter_api_access_duration(sec)`` is not null
         (this is the only column checked for nulls).
      3. Add a boolean ``is_anomaly`` column that is true where
         ``classification`` equals ``"outlier"``.

    Args:
        data: Frame containing at least the ``_id``,
            ``inter_api_access_duration(sec)`` and ``classification`` columns.
        output_path: Destination path of the Parquet file.

    Returns:
        None. The cleaned frame is only written to ``output_path``.
    """
    has_duration = pl.col("inter_api_access_duration(sec)").is_not_null()
    anomaly_flag = (pl.col("classification") == "outlier").alias("is_anomaly")

    cleaned = data.drop("_id").filter(has_duration).with_columns(anomaly_flag)
    cleaned.write_parquet(output_path)

In [None]:
# Forward slashes keep the output path portable and free of backslash escapes.
data_cleaning(data, "data/supervised_dataset_clean.parquet")