In [None]:
import duckdb
import pandas as pd

from bs4 import BeautifulSoup


# Set dev_mode to True to read the developer-local database instead of production.
dev_mode = False

# Pick the database file and the matching prefix for the selected environment.
database, prefix = (
    (
        # DEV (user specific)
        "/home/heiler/development/projects/ascii/research-space/src/pipelines/ascii/ascii_dbt/ascii_pipeline.duckdb",
        "ascii_dev",
    )
    if dev_mode
    else (
        # prod
        "/data/raid5/data/ascii/mastered-data/ascii_pipeline.duckdb",
        "ascii",
    )
)

# The connection is opened read-only: this notebook only queries the database.
con = duckdb.connect(database=database, read_only=True)

Get all the URLs together with the `ascii_id` from which they came.

In [None]:
# SQL that returns one (src_url, ascii_id_company) row per matching edge.
# Edges are joined to seed nodes on the SURT host of the source URL, and both
# sides are restricted to the 'Georgetown_v1.0.0' seed set. Only the
# 'CC-MAIN-2021-49' crawl is considered, and source hosts whose SURT form
# contains 'google' are excluded.
query = """
SELECT e.src_url, sn.ascii_id_company
FROM ascii_commoncrawl.edges e
JOIN ascii_commoncrawl.seed_nodes sn ON e.src_url_surt_host = sn.seed_node_url_surt
WHERE e.crawl_id = 'CC-MAIN-2021-49'
AND e.seed_nodes = 'Georgetown_v1.0.0'
AND e.src_url_surt_host NOT LIKE '%google%'
AND sn.seeds = 'Georgetown_v1.0.0';
"""

In [None]:
# Execute the query and materialise the full result set as a pandas DataFrame.
df_urls = con.execute(query).df()

In [None]:
# Number of (src_url, ascii_id_company) rows returned by the query.
df_urls.shape[0]

### Basic statistics

In [None]:
# Number of distinct source URLs observed for each company.
distinct_urls_per_company = df_urls.groupby("ascii_id_company")["src_url"].nunique()
url_counts = distinct_urls_per_company.reset_index(name="url_count")

# Summary statistics of the per-company URL counts.
count_column = url_counts["url_count"]
min_urls = count_column.min()
max_urls = count_column.max()
avg_urls = count_column.mean()
med_urls = count_column.median()

print(f"Minimum URLs per ID: {min_urls}")
print(f"Maximum URLs per ID: {max_urls}")
print(f"Average URLs per ID: {avg_urls}")
print(f"Median URLs per ID: {med_urls}")

In [None]:
# 75th percentile of the per-company URL counts.
percentile_75 = url_counts["url_count"].quantile(0.75)

# Number of companies whose URL count is strictly greater than that percentile.
companies_above_75th = len(url_counts.loc[url_counts["url_count"] > percentile_75])

print(f"75th Percentile of URLs: {percentile_75}")
print(f"Companies above 75th Percentile: {companies_above_75th}")

In [None]:
# 90th percentile of the per-company URL counts.
percentile_90 = url_counts["url_count"].quantile(0.9)

# Number of companies whose URL count is strictly greater than that percentile.
companies_above_90th = len(url_counts.loc[url_counts["url_count"] > percentile_90])

print(f"90th Percentile of URLs: {percentile_90}")
print(f"Companies above 90th Percentile: {companies_above_90th}")

In [None]:
# 95th and 99th percentiles of the per-company URL counts.
percentile_95 = url_counts["url_count"].quantile(0.95)
percentile_99 = url_counts["url_count"].quantile(0.99)

# Number of companies whose URL count is strictly greater than each percentile.
companies_above_95th = len(url_counts.loc[url_counts["url_count"] > percentile_95])
companies_above_99th = len(url_counts.loc[url_counts["url_count"] > percentile_99])

print(f"95th Percentile of URLs: {percentile_95}")
print(f"Companies above 95th Percentile: {companies_above_95th}")
print(f"99th Percentile of URLs: {percentile_99}")
print(f"Companies above 99th Percentile: {companies_above_99th}")

Distribution plot

Filter out the top 10% (17 companies).

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Complementary CDF of the per-company URL counts, drawn as a normalised
# reverse-cumulative histogram with logarithmic x and y axes.

# 30 logarithmically spaced bin edges spanning the observed range of counts.
log_bin_edges = np.logspace(
    np.log10(url_counts["url_count"].min()),
    np.log10(url_counts["url_count"].max()),
    30,
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    url_counts["url_count"],
    bins=log_bin_edges,
    color="skyblue",
    edgecolor="black",
    cumulative=-1,  # accumulate from the right-hand side
    density=True,
)
ax.set_title("Complementary CDF of URL Counts for All Companies (Double Log Scale)")
ax.set_xlabel("URL Count (Log Scale)")
ax.set_ylabel("Complementary Cumulative Probability (Log Scale)")
ax.grid(axis="y", alpha=0.75)
ax.set_xscale("log")  # logarithmic x-axis
ax.set_yscale("log")  # logarithmic y-axis

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# 'url_counts' contains the URL count data for all companies

# Fit a log-normal distribution to the per-company URL counts. The location
# parameter is pinned to 0 (floc=0), so only shape and scale are estimated.
shape, loc, scale = stats.lognorm.fit(url_counts["url_count"], floc=0)

# Report the fitted distribution and its parameters.
# The first message has no placeholders, so it is a plain string, not an f-string.
print("Fitted distribution: Log-normal")
print(f"Shape parameter: {shape:.4f}")
print(f"Location parameter: {loc:.4f}")  # This was fixed to 0 in the fitting
print(f"Scale parameter: {scale:.4f}")


# Define the theoretical complementary CDF for the log-normal distribution
def theoretical_compl_cdf(x, shape, scale):
    """Return the complementary CDF, P(X > x), of a log-normal distribution.

    The location parameter is fixed at 0.

    Args:
        x: Point or array of points at which to evaluate the function.
        shape: Shape parameter of the log-normal distribution.
        scale: Scale parameter of the log-normal distribution.

    Returns:
        The survival probability at ``x``, with the same shape as ``x``.
    """
    # Use the survival function rather than ``1 - cdf``: once cdf rounds to 1.0
    # the subtraction yields exactly 0, which loses the far tail.
    return stats.lognorm.sf(x, shape, loc=0, scale=scale)


# Generate x values from min to max of the data for plotting the theoretical curve.
# The x-axis is logarithmic, so the points are spaced geometrically: linear
# spacing would put almost all of them in the top decade and leave the low end
# of the curve coarsely sampled.
x = np.geomspace(url_counts["url_count"].min(), url_counts["url_count"].max(), 1000)
y = theoretical_compl_cdf(x, shape, scale)

# Plot the theoretical complementary CDF (loglog sets both axes to log scale)
plt.figure(figsize=(10, 6))
plt.loglog(x, y, label="Theoretical Compl. CDF (Log-normal Fit)", color="red")

# Plot the empirical complementary CDF as a normalised reverse-cumulative
# histogram over 30 logarithmically spaced bins
plt.hist(
    url_counts["url_count"],
    bins=np.logspace(
        np.log10(url_counts["url_count"].min()),
        np.log10(url_counts["url_count"].max()),
        30,
    ),
    cumulative=-1,
    density=True,
    histtype="step",
    color="skyblue",
    label="Empirical Compl. CDF",
)

plt.title("Complementary CDF of URL Counts with Theoretical Fit (Double Log Scale)")
plt.xlabel("URL Count (Log Scale)")
plt.ylabel("Complementary Cumulative Probability (Log Scale)")
plt.grid(True, which="both", ls="--")
plt.legend()

plt.show()

In [None]:
# Persist df_urls with IPython's %store magic so it can be restored later with `%store -r df_urls`.
%store df_urls