# Question 3: Outlier Detection

CS 5304 - Data Science in the Wild, Assignment 2

**Author**: Yufan Zhang (yz2894)


In [162]:
import pandas as pd
import numpy as np

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import DBSCAN
from itertools import combinations


In [163]:
# Read the programming-books dataset from disk
df = pd.read_csv('data/prog_book.csv')

# 'Reviews' is loaded as strings that may contain ',' characters;
# strip the commas and cast the column to integers in one chained step
df['Reviews'] = df['Reviews'].str.replace(',', '').astype(int)

df.head()

Unnamed: 0,Rating,Reviews,Book_title,Description,Number_Of_Pages,Type,Price
0,4.17,3829,The Elements of Style,This style manual offers practical advice on i...,105,Hardcover,9.323529
1,4.01,1406,"The Information: A History, a Theory, a Flood","James Gleick, the author of the best sellers C...",527,Hardcover,11.0
2,3.33,0,Responsive Web Design Overview For Beginners,In Responsive Web Design Overview For Beginner...,50,Kindle Edition,11.267647
3,3.97,1658,Ghost in the Wires: My Adventures as the World...,If they were a hall of fame or shame for compu...,393,Hardcover,12.873529
4,4.06,1325,How Google Works,Both Eric Schmidt and Jonathan Rosenberg came ...,305,Kindle Edition,13.164706


## Task 1: Univariate Outlier Detection

In [164]:
# Numeric columns inspected for univariate outliers
features = ["Rating", "Reviews", "Number_Of_Pages", "Price"]

# A single row of subplots, one column per feature
fig = make_subplots(rows=1, cols=4)

# Draw one box plot per feature; plotly subplot columns are 1-based
for position, feature in enumerate(features, start=1):
    box_trace = go.Box(y=df[feature], name=feature)
    fig.add_trace(box_trace, row=1, col=position)

fig.update_layout(height=600, width=1000, title_text="Box Plots for Each Feature")
fig.write_html("plots/task1.html")
fig.show()

## Task 2: Multivariate Outlier Detection

In [165]:
# Work on a copy so `df` keeps the original textual 'Type' values
df_encoded = df.copy()

# Replace each distinct 'Type' string with an integer code
type_encoder = LabelEncoder()
df_encoded['Type'] = type_encoder.fit_transform(df['Type'])

df_encoded.head()

Unnamed: 0,Rating,Reviews,Book_title,Description,Number_Of_Pages,Type,Price
0,4.17,3829,The Elements of Style,This style manual offers practical advice on i...,105,1,9.323529
1,4.01,1406,"The Information: A History, a Theory, a Flood","James Gleick, the author of the best sellers C...",527,1,11.0
2,3.33,0,Responsive Web Design Overview For Beginners,In Responsive Web Design Overview For Beginner...,50,2,11.267647
3,3.97,1658,Ghost in the Wires: My Adventures as the World...,If they were a hall of fame or shame for compu...,393,1,12.873529
4,4.06,1325,How Google Works,Both Eric Schmidt and Jonathan Rosenberg came ...,305,2,13.164706


In [170]:
# Columns used for the multivariate analysis ('Type' comes from the encoded frame)
features = ['Price', 'Number_Of_Pages', 'Rating', 'Reviews', 'Type']
X = df_encoded[features]

# Fit a StandardScaler on the selected columns and transform them,
# then wrap the resulting array back into a DataFrame with the same column names
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=features)

X_scaled_df

Unnamed: 0,Price,Number_Of_Pages,Rating,Reviews,Type
0,-1.268555,-1.211057,0.353476,6.611187,-1.245116
1,-1.221524,0.169913,-0.197845,2.214547,-1.245116
2,-1.214015,-1.391042,-2.540962,-0.336702,-0.294619
3,-1.168964,-0.268594,-0.335676,2.671812,-1.245116
4,-1.160795,-0.556569,-0.025557,2.067569,-0.294619
...,...,...,...,...,...
266,4.167898,0.906213,-1.059285,-0.336702,-1.245116
267,4.420054,1.848677,-0.439048,-0.296782,0.655878
268,4.652572,1.377445,1.456120,-0.271378,-2.195613
269,4.652572,8.812435,2.420933,-0.329444,-1.245116


### Task 2.1: Bivariate Outlier Detection

To view the interactive plot, please visit the following link: [yufanbruce.com/dsw/posts/a2](https://yufanbruce.com/dsw/posts/a2).

In [168]:
# All the bivariate combinations of the features
bivariate_combinations = list(combinations(features, 2))

# Outliers for each bivariate combination, keyed by the (feature1, feature2)
# tuple. Each value holds the rows DBSCAN labelled as noise, taken from `df`
# so they are reported in original units, matching the trivariate analysis.
outliers_bivariate = {}

# Two-column grid with as many rows as the number of combinations requires
n_cols = 2
n_rows = (len(bivariate_combinations) + n_cols - 1) // n_cols

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[f"{x[0]} vs {x[1]}" for x in bivariate_combinations],
    vertical_spacing=0.05,
)

for i, combination in enumerate(bivariate_combinations):
    feature1, feature2 = combination
    X_subset = X_scaled_df[[feature1, feature2]]

    # Apply DBSCAN to the scaled values of this feature pair
    dbscan = DBSCAN(eps=0.4, min_samples=7).fit(X_subset)
    labels = dbscan.labels_

    # Points labelled -1 are the ones DBSCAN treats as noise, i.e. the outliers.
    # X_scaled_df was built row-for-row from df, so positional indices found
    # here can be applied directly to df. The dictionary entry is written
    # exactly once (previously a second assignment overwrote it with the
    # scaled values).
    outlier_indices = np.where(labels == -1)[0]
    outliers_bivariate[combination] = df.iloc[outlier_indices][[feature1, feature2]]

    # Plot: subplot coordinates are 1-based, filled left-to-right, top-to-bottom
    row = i // n_cols + 1
    col = i % n_cols + 1
    fig.add_trace(
        go.Scatter(
            x=X_subset[feature1],
            y=X_subset[feature2],
            mode="markers",
            # Colour by cluster label; the per-trace colour bar is disabled
            marker=dict(color=labels, colorscale="Viridis", showscale=False),
            text=labels,
            showlegend=False,
        ),
        row=row,
        col=col,
    )
    fig.update_xaxes(title_text=feature1, row=row, col=col)
    fig.update_yaxes(title_text=feature2, row=row, col=col)


fig.update_layout(height=2000, width=1200, title_text="Bivariate DBSCAN Analysis")

fig.write_html("plots/task2_1.html")
fig.show()

### Task 2.2: Trivariate Outlier Detection

To view the interactive plot, please visit the following link: [yufanbruce.com/dsw/posts/a2](https://yufanbruce.com/dsw/posts/a2).

In [169]:
# Every unordered triple of the selected features
trivariate_combinations = list(combinations(features, 3))

# Maps each (feature1, feature2, feature3) tuple to the rows of `df`
# that DBSCAN labelled as noise for that combination
outliers_trivariate = {}

# One 3D scene per combination, stacked vertically in a single column
total_plots = len(trivariate_combinations)
cols = 1
rows = total_plots

scene_specs = [[{"type": "scatter3d"}] for _ in range(total_plots)]
scene_titles = [f"{a}, {b}, {c}" for a, b, c in trivariate_combinations]

fig = make_subplots(
    rows=rows,
    cols=cols,
    specs=scene_specs,
    subplot_titles=scene_titles,
    vertical_spacing=0.04,
)

for i, combination in enumerate(trivariate_combinations):
    feature1, feature2, feature3 = combination
    X_subset = X_scaled_df[[feature1, feature2, feature3]]
    plot_row = i + 1  # subplot rows are 1-based

    # Run DBSCAN on the scaled values of this feature triple
    dbscan = DBSCAN(eps=0.7, min_samples=12).fit(X_subset)
    labels = dbscan.labels_

    # Label -1 marks noise points; keep the matching rows of the original frame
    outlier_indices = np.flatnonzero(labels == -1)
    outliers_trivariate[combination] = df.iloc[outlier_indices][
        [feature1, feature2, feature3]
    ]

    # 3D scatter of the scaled points, coloured by cluster label
    marker_style = dict(
        size=3,
        color=labels,
        colorscale="Viridis",
        opacity=0.8,
    )
    scatter = go.Scatter3d(
        x=X_subset[feature1],
        y=X_subset[feature2],
        z=X_subset[feature3],
        mode="markers",
        marker=marker_style,
        showlegend=False,
    )
    fig.add_trace(scatter, row=plot_row, col=1)

    # Label the three axes of this scene with the feature names
    axis_titles = dict(
        xaxis_title=feature1, yaxis_title=feature2, zaxis_title=feature3
    )
    fig.update_scenes(axis_titles, row=plot_row, col=1)

fig.update_layout(height=600 * rows, width=600, title_text="Trivariate DBSCAN Analysis")
fig.update_traces(marker_showscale=False)

# Save the figure as HTML, then display it
fig.write_html("plots/task2_2.html")
fig.show()