In [None]:
# pip install gensim #for glove embedding

In [None]:
import duckdb
import pandas as pd

from bs4 import BeautifulSoup


# Environment switch: set to True to read the developer's local database.
# dev_mode = True
dev_mode = False

if not dev_mode:
    # prod
    database = "/data/raid5/data/ascii/mastered-data/ascii_pipeline.duckdb"
    prefix = "ascii"
else:
    # DEV (user specific)
    database = "/home/heiler/development/projects/ascii/research-space/src/pipelines/ascii/ascii_dbt/ascii_pipeline.duckdb"
    prefix = "ascii_dev"

# The connection is opened read-only; this notebook only queries the database.
con = duckdb.connect(database=database, read_only=True)

In [None]:
%store -r df_urls

# Filter the URLs 

In [None]:
# starting url count
len(df_urls)

In [None]:
# Keep one row per distinct src_url (the first occurrence wins, which is the
# pandas default for drop_duplicates).
dedup_df = df_urls.drop_duplicates(subset=["src_url"])

In [None]:
len(dedup_df)

## Step 1 filter by keywords

The filter is only applied to companies that have more than 50 URLs.

In [None]:
# Earlier keyword sets that were tried, kept for reference:
# keywords = ['about', 'service', 'product', 'news', 'team', 'project', 'career']

# keywords = ['about', 'service', 'product', 'news']

# Active keyword set. The entries are later joined with "|" and matched
# case-insensitively as a pattern against src_url.
keywords = ["about", "service", "product", "news", "semicon", "technology"]

Now apply the keyword filter only to companies that have more than 50 urls, ensuring not too much information is lost

In [None]:
# Identify companies with more than 50 URLs: look up each row's company size
# and collect the ids (in order of first appearance) that exceed the threshold.
# Rows without a company id get no count and therefore never qualify.
company_ids = dedup_df["ascii_id_company"]
rows_per_company = company_ids.value_counts()
companies_with_many_urls = company_ids[
    company_ids.map(rows_per_company) > 50
].unique()

# Row masks: membership in a large company, and a keyword hit in the URL
in_large_company = company_ids.isin(companies_with_many_urls)
keywords_pattern = "|".join(keywords)
matches_keyword = dedup_df["src_url"].str.contains(
    keywords_pattern, case=False, na=False
)

# Large companies keep only keyword-matching URLs ...
filtered_large_companies_df = dedup_df[in_large_company & matches_keyword]

# ... while companies with 50 or fewer URLs keep everything
filtered_small_companies_df = dedup_df[~in_large_company]

# Large companies first, then small ones
filtered_df = pd.concat([filtered_large_companies_df, filtered_small_companies_df])

In [None]:
len(filtered_df) / len(dedup_df)

In [None]:
len(filtered_df)

In [None]:
filtered_df["ascii_id_company"].nunique()

In [None]:
dedup_df["ascii_id_company"].nunique()

In [None]:
# Short statistics on the filtered URLs:
# number of distinct src_url values for each ascii_id_company
url_counts = (
    filtered_df.groupby("ascii_id_company")["src_url"]
    .nunique()
    .rename("url_count")
    .reset_index()
)

# Basic statistics over the per-company counts
counts_per_company = url_counts["url_count"]
min_urls = counts_per_company.min()
max_urls = counts_per_company.max()
avg_urls = counts_per_company.mean()
med_urls = counts_per_company.median()

print(f"Minimum URLs per ID: {min_urls}")
print(f"Maximum URLs per ID: {max_urls}")
print(f"Average URLs per ID: {avg_urls}")
print(f"Median URLs per ID: {med_urls}")

## Filter for preferably English URLs

If fewer than `m` URLs would be left after this filter, don't apply it (this protects, for instance, firms that don't follow the `/language` URL convention).

In [None]:
def filter_en_urls(df, n=50, m=30, en_pattern=r"/en(?![A-Za-z])"):
    """Prefer English-language URLs for companies that have many URLs.

    For every company (``ascii_id_company``) with more than ``n`` rows, only the
    rows whose ``src_url`` matches ``en_pattern`` are kept -- provided at least
    ``m`` rows survive. Otherwise, and for companies with ``n`` or fewer rows,
    the company's rows are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ascii_id_company`` and ``src_url``.
    n : int
        The filter is only attempted for companies with more than ``n`` rows.
    m : int
        Minimum number of matching rows required to accept the filter.
    en_pattern : str
        Regular expression that marks a URL as English. The default matches
        ``/en`` only when it is not followed by another letter, e.g. ``/en``,
        ``/en/`` or ``/en-us``. A plain ``"/en"`` substring test would also hit
        ``/energy``, ``/engineering`` or a host such as ``https://entegris.com``.

    Returns
    -------
    pandas.DataFrame
        The retained rows with all original columns, ordered by company.
        Rows without a company id belong to no group and are not returned.
    """

    def filter_for_company(group):
        if len(group) <= n:
            return group
        # na=False: a missing URL counts as "not English" instead of producing
        # a NaN mask, which boolean indexing cannot handle.
        is_english = group["src_url"].str.contains(en_pattern, regex=True, na=False)
        en_filtered = group[is_english]
        return en_filtered if len(en_filtered) >= m else group

    # Iterate over the groups instead of using groupby.apply: newer pandas
    # versions exclude the grouping column from apply, which would drop
    # ascii_id_company from the result.
    kept = [filter_for_company(group) for _, group in df.groupby("ascii_id_company")]
    if not kept:
        # pd.concat refuses an empty list; return an empty frame of the same shape.
        return df.iloc[0:0]
    return pd.concat(kept)


# First, apply the English URL filter: a company with more than 100 URLs is
# reduced to its English URLs, unless fewer than 200 would remain, in which
# case that company's URLs are left as they are.
# NOTE(review): the input is dedup_df, so the keyword-filtered `filtered_df`
# from Step 1 is not carried into this step -- confirm this is intended.
en_filtered_df = filter_en_urls(dedup_df, n=100, m=200)

In [None]:
len(en_filtered_df)

keep only 

## Next step: filter these URLs with a similarity search

In [None]:
from gensim.downloader import load
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Load the pre-trained GloVe word vectors through gensim's downloader
# (presumably downloaded on first use and cached locally -- confirm).
# The model is used below for word lookups and for its vector_size.
glove_model = load("glove-wiki-gigaword-50")

We take the descriptions of the input steps from Georgetown as the similarity-search query to compare against.

In [None]:
# Read the `description` column of the Georgetown inputs CSV through DuckDB
# and fetch the result as a pandas DataFrame.
query = """
    SELECT description
    FROM READ_CSV('/data/raid5/data/ascii/mastered-data/reference-data/data_raw_direct_source_drop/joshua/georgetown/inputs.csv', HEADER=TRUE);
    """
input_desc = con.execute(query).fetchdf()

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Assuming 'input_desc' is your DataFrame and 'description' is the column with text

# Fetch the NLTK resources used below. Newer NLTK releases look up "punkt_tab"
# for word_tokenize; "punkt" is kept for older releases.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

# Step 1: Concatenate all descriptions into a single text.
# Missing descriptions are skipped, because str.join raises a TypeError on NaN.
all_descriptions = " ".join(input_desc["description"].dropna().astype(str))

# Step 2: Clean the text. Every non-letter becomes a space (not an empty
# string), so "wafer-level" yields "wafer" and "level" rather than the fused
# token "waferlevel", which is unlikely to be in the GloVe vocabulary.
cleaned_text = re.sub(r"[^a-zA-Z\s]", " ", all_descriptions).lower()

# Step 3: Remove stop words
stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(cleaned_text)
filtered_text = [w for w in word_tokens if w not in stop_words]

# Step 4: Lemmatize
lemmatizer = WordNetLemmatizer()
lemmatized_text = [lemmatizer.lemmatize(w) for w in filtered_text]

# The 'lemmatized_text' now contains your processed text ready for GloVe embeddings

In [None]:
lemmatized_text[:5]

In [None]:
# Stack the GloVe vectors of all query tokens known to the model into one
# matrix: one row per token occurrence, unknown tokens are skipped.
known_tokens = [token for token in lemmatized_text if token in glove_model]
bow = np.array([glove_model[token] for token in known_tokens])

In [None]:
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity


def filter_urls_by_similarity(df, n=50, m=10, keyword_embeddings=None):
    """Keep, per company, the ``m`` URLs most similar to a set of keyword vectors.

    Companies (``ascii_id_company``) with ``n`` or fewer rows are returned
    unchanged. For larger companies every ``src_url`` is split into words, the
    vectors of the words known to the module-level ``glove_model`` are averaged
    into one URL vector, and the URLs are ranked by their mean cosine
    similarity to the rows of ``keyword_embeddings``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ascii_id_company`` and ``src_url``.
    n : int
        Companies with at most ``n`` rows are not filtered.
    m : int
        Number of top-ranked URLs to keep for each larger company.
    keyword_embeddings : array-like of shape (k, glove_model.vector_size)
        Word vectors to compare against. Required as soon as one company has
        more than ``n`` rows.

    Returns
    -------
    pandas.DataFrame
        The retained rows with all original columns, ordered by company; within
        a filtered company rows are in ascending order of similarity.
        Rows without a company id belong to no group and are not returned.

    Raises
    ------
    ValueError
        If a company has more than ``n`` rows and ``keyword_embeddings`` is None.
    """
    # Created on first use and then shared by all URLs, instead of being
    # rebuilt for every single URL. Small inputs never touch NLTK at all.
    stop_words = None
    lemmatizer = None

    def url_to_embedding(url):
        """Average the GloVe vectors of the words found in ``url``."""
        nonlocal stop_words, lemmatizer
        no_signal = np.zeros((glove_model.vector_size,))
        if not isinstance(url, str):
            # Missing URL: nothing to embed.
            return no_signal
        if stop_words is None:
            stop_words = set(stopwords.words("english"))
            lemmatizer = WordNetLemmatizer()

        # Turn every non-letter into a space. Deleting the separators
        # ("/", ".", "-", ":") instead would fuse the whole URL into one token
        # that is not in the vocabulary, so every URL would become a zero vector.
        # NOTE(review): scheme and host words ("https", "www", "com") are now
        # part of the average as well -- consider embedding only the URL path.
        cleaned_url = re.sub(r"[^a-zA-Z]+", " ", url).lower()

        # Same stop-word removal and lemmatization as for the query text
        tokens = [w for w in word_tokenize(cleaned_url) if w not in stop_words]
        lemmas = [lemmatizer.lemmatize(w) for w in tokens]

        # Only words known to the GloVe model can contribute
        words = [w for w in lemmas if w in glove_model]
        if not words:
            # Zero vector: cosine similarity to anything is 0
            return no_signal
        return np.array([glove_model[w] for w in words]).mean(axis=0)

    def filter_for_company(group):
        """Return the group itself, or its ``m`` best-ranked rows if it is large."""
        if len(group) <= n:
            return group
        if keyword_embeddings is None:
            # sklearn's cosine_similarity treats Y=None as Y=X, which would
            # silently rank the URLs against each other.
            raise ValueError(
                "keyword_embeddings is required to rank the URLs of companies "
                "with more than n rows"
            )

        url_embeddings = np.array([url_to_embedding(url) for url in group["src_url"]])

        # Mean similarity of each URL to all keyword vectors
        similarities = cosine_similarity(url_embeddings, keyword_embeddings).mean(
            axis=1
        )

        # Positions of the 'm' most similar URLs. A stable sort keeps ties
        # reproducible; the explicit start index avoids the [-0:] pitfall,
        # where m == 0 would select every row.
        ranked = np.argsort(similarities, kind="stable")
        top_indices = ranked[max(len(ranked) - m, 0):]
        return group.iloc[top_indices]

    # Iterate over the groups instead of using groupby.apply: newer pandas
    # versions exclude the grouping column from apply, which would drop
    # ascii_id_company from the result.
    kept = [filter_for_company(group) for _, group in df.groupby("ascii_id_company")]
    if not kept:
        # pd.concat refuses an empty list; return an empty frame of the same shape.
        return df.iloc[0:0]
    return pd.concat(kept)

In [None]:
# Companies with more than 60 URLs are cut down to their 60 URLs most similar
# to the query word vectors in `bow`.
df = filter_urls_by_similarity(en_filtered_df, n=60, m=60, keyword_embeddings=bow)

Check, for instance, Tesla, which had issues before.

In [None]:
len(df[df["ascii_id_company"] == "bWyO7uUNWBS9MN2QvXHLzQ=="])

In [None]:
for url in df["src_url"][df["ascii_id_company"] == "bWyO7uUNWBS9MN2QvXHLzQ=="]:
    print(url)

In [None]:
for url in df["src_url"][df["ascii_id_company"] == "bWyO7uUNWBS9MN2QvXHLzQ=="]:
    print(url)

In [None]:
for url in df["src_url"][df["ascii_id_company"] == "+xs/sbiUV1CWPJtfrGWtMw=="]:
    print(url)

In [None]:
for url in df["src_url"][df["ascii_id_company"] == "+xs/sbiUV1CWPJtfrGWtMw=="]:
    print(url)

In [None]:
for url in df.iloc[:50]["src_url"]:
    print(url)

In [None]:
len(df)

In [None]:
df_urls_filtered = df

In [None]:
%store df_urls_filtered

In [None]:
%store bow