# Label Distribution

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd

from pathlib import Path
from dotenv import load_dotenv

# Load settings from the .env file into the process environment.
load_dotenv()
if os.getenv("DATA_PATH") is None:
    # Path(None) would only raise an opaque TypeError; name the missing setting.
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; define it in the "
        "environment or in the .env file."
    )
DATA_PATH = Path(os.environ["DATA_PATH"])

# only for .ipynb because relative imports don't work:
# make the parent directory of DATA_PATH the working directory before
# importing the project's `src` package.
root_path = DATA_PATH.parent
os.chdir(root_path)

import src.database.db_connector as db

In [None]:
# This CSV path is only referenced by the disabled `pd.read_csv` line below.
label_distribution_by_count = DATA_PATH / "labels" / "label_distribution_by_count.csv"
# df = pd.read_csv(label_distribution_by_count)

# Count joined screenshot rows per label name, keep labels with more than
# 100 rows, and return them ordered from most to least frequent.
# NOTE(review): `l.type` is selected but not listed in GROUP BY; this relies
# on every label name having exactly one type (and on the database accepting
# such a query) -- confirm against the schema.
query = """
    select l.name, l.type, Count(*)
    from screenshots as s
    inner join websites as w on w.url=s.page_url
    inner join website_labels as wl on w.id=wl.website_id
    inner join labels as l on wl.label_id=l.id
    group by l.name
    having count(*) > 100
    order by count(*) desc;
"""

# database name for results
db_name = "clustering_db"
cnx = db.connect_to_database(db_name)
cursor = db.get_connection_cursor(cnx)

# Result columns: "name", "type" and "Count(*)".
df = pd.read_sql(sql=query, con=cnx)
print(df.head(15))

In [None]:
# map type to colors
# Sorted distinct label types; position i is paired with colors[i].
types = list(df["type"].sort_values().unique())
print(types)

colors = ["b", "c", "r", "y"]

if len(types) > len(colors):
    # Fail with a clear message instead of an IndexError during the lookup.
    raise ValueError(
        f"Found {len(types)} label types but only {len(colors)} colors are "
        "defined; extend `colors`."
    )

# One dict lookup per row instead of a linear `types.index` search.
color_by_type = dict(zip(types, colors))
# Bar color for every row of df, in row order.
type_colors = [color_by_type[t] for t in df["type"]]

In [None]:
# Horizontal bar chart: one bar per label, coloured by the label's type.
plt.figure(figsize=(40, 40))
label_counts = df["Count(*)"]
plt.barh(df["name"], label_counts, color=type_colors)

# legend: one colour swatch per entry of `colors`, labelled with `types`
handles = [plt.Rectangle((0, 0), 1, 1, color=swatch) for swatch in colors]
plt.legend(handles, types)

# write each count next to the end of its bar
for row, count in enumerate(label_counts):
    plt.text(count, row, str(count))

# flip the y-axis so the first row of df is drawn at the top
plt.gca().invert_yaxis()
plt.xticks(rotation=80)

# save plot as file (mkdir with exist_ok=True is a no-op for an existing dir)
target_dir = Path(DATA_PATH, "plots", "clustering", "labels")
target_dir.mkdir(parents=True, exist_ok=True)

plt.savefig(target_dir / "original_distribution.jpeg")
plt.show()

In [None]:
# Compact vertical bar chart of the same counts, without per-label tick text.
fig = plt.figure(figsize=(16, 9))

plt.bar(df["name"], df["Count(*)"], color=type_colors)

# legend: one colour swatch per entry of `colors`, labelled with `types`
handles = [plt.Rectangle((0, 0), 1, 1, color=swatch) for swatch in colors]
plt.legend(handles, types, prop={'size': 20})

# hide x-ticks (an empty tick list removes the label names from the axis)
plt.xticks([], fontsize=24)
plt.yticks(fontsize=24)
plt.xlabel("Labels", fontsize=24)

# save plot as file (mkdir with exist_ok=True is a no-op for an existing dir)
target_dir = Path(DATA_PATH, "plots", "clustering", "labels")
target_dir.mkdir(parents=True, exist_ok=True)

fig.tight_layout()
# NOTE(review): the file name contains the misspelling "horziontal"; it is
# kept unchanged because other material may reference the existing file.
plt.savefig(target_dir / "original_distribution_horziontal.jpeg", bbox_inches='tight')
plt.show()

In [None]:
# calculate statistics
# Number of labels per type.
type_counts = df["type"].value_counts()
print(type_counts)
print(type_counts.shape)

# Aggregate only the count column. Grouping the whole frame would also try to
# aggregate the string column "name" (a TypeError on pandas >= 2.0) and any
# column added to df later. The double brackets keep the results as
# one-column DataFrames, so the printed shapes stay the same.
counts_by_type = df.groupby("type")[["Count(*)"]]

print(counts_by_type.median().shape)
print(counts_by_type.median().squeeze().shape)
print("\nMean")
print(counts_by_type.mean())
print("\nSTD")
print(counts_by_type.std())
print("\nVariance")
print(counts_by_type.var())

In [None]:
# Horizontal bar chart of the mean label count per type, with the standard
# deviation of the counts as error bars.
plt.figure(figsize=(8, 6))

# Aggregate only the count column as a Series. Grouping the whole frame would
# also try to aggregate the string column "name" (a TypeError on
# pandas >= 2.0), and `.squeeze()` on the result collapses to a scalar when
# there is a single type.
counts_by_type = df.groupby("type")["Count(*)"]
mean_counts = counts_by_type.mean()

# print(list(set(df["type"].tolist())))
# Sorted distinct types; groupby sorts its keys the same way, so the bars and
# the aggregated values line up.
type_names = df["type"].sort_values().unique()
print(type_names)
error_bars = list(counts_by_type.std())
print(error_bars)

plt.barh(type_names, mean_counts.round(0), color=colors, xerr=error_bars, ecolor='black', capsize=10)

# legend: one colour swatch per entry of `colors`, labelled with `types`
handles = [plt.Rectangle((0, 0), 1, 1, color=c) for c in colors]
plt.legend(handles, types)

# label each bar with its mean, truncated to an integer
for index, value in enumerate(mean_counts.astype(int)):
    plt.text(value, index, str(value))

plt.gca().invert_yaxis()
plt.xticks(rotation=80)
plt.show()

## Label Percentages

In [None]:
# Express every label count as a percentage of a fixed total.
# NOTE(review): 41150 is hard-coded; presumably the overall number of items
# the counts are measured against -- confirm and consider deriving it.
count_share = df["Count(*)"] / 41150
df["percentage"] = count_share * 100

print(df.head(6))