In [None]:
# pip install sacremoses #for hugging face models
# pip install langdetect

In [None]:
import duckdb
import pandas as pd

from bs4 import BeautifulSoup


# Set to True to work against the user-specific development database.
dev_mode = False

# Pick the database file and prefix for the selected environment.
database, prefix = (
    # DEV (user specific)
    (
        "/home/heiler/development/projects/ascii/research-space/src/pipelines/ascii/ascii_dbt/ascii_pipeline.duckdb",
        "ascii_dev",
    )
    if dev_mode
    # prod
    else (
        "/data/raid5/data/ascii/mastered-data/ascii_pipeline.duckdb",
        "ascii",
    )
)

# Open the database read-only.
con = duckdb.connect(database=database, read_only=True)

In [None]:
%store -r content_df

In [None]:
df = content_df
df

Now combine all HTML content belonging to the same company into one large HTML string.

In [None]:
# Collapse the frame to one row per 'ascii_id_company': every src_url of the
# company is collected into a list ('url_list') and all of its content strings
# are concatenated with single spaces ('content_combined').
df_grouped = (
    df.groupby("ascii_id_company")
    .agg(
        url_list=("src_url", list),
        content_combined=("content", " ".join),
    )
    .reset_index()
)

In [None]:
df_grouped

## extract text from html

In [None]:
from bs4 import BeautifulSoup


def extract_text_simple(html_content):
    """Extract the text of all ``<p>`` and ``<article>`` elements of an HTML string.

    Args:
        html_content: HTML markup to parse (parsed with the "lxml" parser).

    Returns:
        A single string with the text of every matching element, joined by
        single spaces. Text outside ``<p>``/``<article>`` elements is ignored.
    """
    target_tags = ["p", "article"]
    soup = BeautifulSoup(html_content, "lxml")

    # find_all() also returns matches nested inside other matches (typically a
    # <p> inside an <article>). The outer element's get_text() already
    # contains the inner text, so only outermost matches are kept; otherwise
    # the nested text would appear twice in the result.
    outermost_elements = [
        element
        for element in soup.find_all(target_tags)
        if element.find_parent(target_tags) is None
    ]

    return " ".join(
        element.get_text(separator=" ", strip=True)
        for element in outermost_elements
    )

In [None]:
# Apply the function to the 'content_combined' column to create 'extr_text'
df_grouped["extr_text"] = df_grouped["content_combined"].apply(extract_text_simple)

# Now df_grouped has an additional column 'extr_text' with the extracted text

In [None]:
from IPython.display import display, Markdown


def display_text_as_markdown(text):
    """Display ``text`` in the notebook as Markdown wrapped in triple backticks.

    At most the first 10,000 characters are shown. When the text is longer, a
    notice containing the full length is printed before the truncated text.

    Args:
        text: The string to display.

    Returns:
        None. The text is rendered as a side effect, so the result of this
        function should not be printed.
    """
    max_chars = 10000

    # Only announce truncation when something is actually cut off: a text of
    # exactly ``max_chars`` characters fits completely.
    if len(text) > max_chars:
        print(f"over 10k chars ({len(text)}), but here are the first 10k: \n")
        text = text[:max_chars]

    # Wrap the text in triple backticks so Markdown shows it in a preformatted style
    display(Markdown(f"```{text}```"))


# Example usage with the first row's extracted text
display_text_as_markdown(df_grouped.iloc[0]["extr_text"])

In [None]:
display_text_as_markdown(df_grouped.iloc[16]["extr_text"])

now check how long the texts are

In [None]:
# Number of characters in every extracted text
text_lengths = df_grouped["extr_text"].map(len)

# Summary statistics of the length distribution
min_length, max_length = text_lengths.min(), text_lengths.max()
median_length = text_lengths.median()
average_length = text_lengths.mean()

# Report the statistics, one per line
print(
    f"Minimum length: {min_length}",
    f"Maximum length: {max_length}",
    f"Median length: {median_length}",
    f"Average length: {average_length:.2f}",
    sep="\n",
)

check longest ones

In [None]:
# df_grouped is expected to provide the columns 'ascii_id_company' and 'extr_text'.
# Store the character count of every extracted text in a new column.
df_grouped["text_length"] = df_grouped["extr_text"].map(len)

# Order the rows so that the longest texts come first
df_sorted = df_grouped.sort_values(by="text_length", ascending=False)

# Show the 15 longest texts together with the company they belong to
print(df_sorted.head(15)[["ascii_id_company", "text_length"]])

# Company ids of those 15 longest texts
longest_text = df_sorted.head(15)["ascii_id_company"].tolist()

In [None]:
display_text_as_markdown(df_grouped.iloc[87]["extr_text"])

## Similarity search in extracted text

In [None]:
%store -r bow

In [None]:
from gensim.downloader import load
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used for tokenising and stop-word filtering
for resource in ("punkt", "stopwords"):
    nltk.download(resource)
stop_words = set(stopwords.words("english"))

# Load the "glove-wiki-gigaword-50" GloVe model through gensim's downloader
glove_model = load("glove-wiki-gigaword-50")

# Keywords from before: `bow` is restored with `%store -r bow`
# (presumably already a 2-D array of keyword vectors; verify)
keyword_embeddings = bow


def clean_and_tokenize(text, glove_model):
    """Normalise ``text`` and return the tokens usable for embedding lookup.

    The text is lowercased, every run of non-word characters (regex ``\\W``)
    is replaced by one space, and the result is tokenized with
    ``word_tokenize``. A token is kept only if it is contained in
    ``glove_model`` and is not in the module-level ``stop_words`` set.

    Returns:
        list of kept tokens, in their original order.
    """
    normalized = re.sub(r"\W+", " ", text.lower())

    kept_tokens = []
    for candidate in word_tokenize(normalized):
        # Skip words without an embedding
        if candidate not in glove_model:
            continue
        # Skip stop words
        if candidate in stop_words:
            continue
        kept_tokens.append(candidate)
    return kept_tokens


def text_to_chunks(tokens, chunk_size=500):
    """Yield consecutive slices of ``tokens`` holding up to ``chunk_size`` items.

    The final slice may be shorter than ``chunk_size``; nothing is yielded for
    an empty sequence.
    """
    chunk_starts = range(0, len(tokens), chunk_size)
    yield from (tokens[start : start + chunk_size] for start in chunk_starts)


def chunk_to_embedding(chunk, glove_model):
    """Return the element-wise mean of the ``glove_model`` vectors of ``chunk``.

    Every token in ``chunk`` is looked up in ``glove_model``; the resulting
    vectors are stacked into one array and averaged over the token axis.
    """
    token_vectors = [glove_model[token] for token in chunk]
    return np.mean(np.array(token_vectors), axis=0)


def _select_top_chunks(tokens, glove_model, keyword_embeddings, n_chunks, chunk_size):
    """Return the ``n_chunks`` chunks of ``tokens`` most similar to the keywords.

    The tokens are split into chunks of ``chunk_size``; each chunk is scored by
    the mean cosine similarity between its mean embedding and every row of
    ``keyword_embeddings``. The best-scoring chunks are joined with spaces, in
    ascending order of similarity. Returns "" when there are no tokens.
    """
    chunks = list(text_to_chunks(tokens, chunk_size))
    if not chunks:
        return ""  # Empty string if no valid chunks

    chunk_embeddings = np.array(
        [chunk_to_embedding(chunk, glove_model) for chunk in chunks]
    )
    similarities = cosine_similarity(chunk_embeddings, keyword_embeddings).mean(
        axis=1
    )
    # argsort is ascending, so the last n_chunks entries are the best matches
    top_indices = np.argsort(similarities)[-n_chunks:]
    return " ".join(" ".join(chunks[index]) for index in top_indices)


def filter_chunks_by_similarity(
    df,
    glove_model,
    keyword_embeddings,
    n_chunks=5,
    chunk_size=250,
    min_text_length=10000,
):
    """Shorten long texts to the chunks most similar to the keyword embeddings.

    Texts in ``df["extr_text"]`` with more than ``min_text_length`` characters
    are cleaned and tokenized with ``clean_and_tokenize`` and reduced to their
    ``n_chunks`` best-matching chunks, so the result consists of the cleaned
    tokens rather than the original wording. Shorter texts are kept unchanged.

    Args:
        df: DataFrame with an 'extr_text' column of strings.
        glove_model: Word-embedding lookup supporting ``in`` and ``[]``.
        keyword_embeddings: Passed as second argument to ``cosine_similarity``
            (presumably a 2-D array with one keyword vector per row; confirm).
        n_chunks: Number of chunks to keep per long text; must be >= 1.
        chunk_size: Number of tokens per chunk.
        min_text_length: Texts up to this many characters are not filtered.

    Returns:
        The same ``df`` object, modified in place: a 'filtered_text' column is
        added (or overwritten).

    Raises:
        ValueError: If ``n_chunks`` is smaller than 1. With 0 the slice
            ``[-0:]`` would silently select every chunk.
    """
    if n_chunks < 1:
        raise ValueError(f"n_chunks must be at least 1, got {n_chunks}")

    filtered_texts = []
    for text in df["extr_text"]:
        if len(text) > min_text_length:
            tokens = clean_and_tokenize(text, glove_model)
            filtered_texts.append(
                _select_top_chunks(
                    tokens, glove_model, keyword_embeddings, n_chunks, chunk_size
                )
            )
        else:
            filtered_texts.append(text)

    # Update the DataFrame with filtered texts
    df["filtered_text"] = filtered_texts
    return df

In [None]:
# Run the similarity filter on df_sorted (the frame holding 'extr_text',
# ordered by text length); the function adds a 'filtered_text' column to it.
df_simsearch = filter_chunks_by_similarity(
    df=df_sorted,
    glove_model=glove_model,
    keyword_embeddings=keyword_embeddings,
    n_chunks=5,
    chunk_size=400,
)

In [None]:
df_backup = df_simsearch

In [None]:
df_simsearch = df_simsearch.reset_index()

In [None]:
display_text_as_markdown(df_simsearch["filtered_text"].iloc[6])  # looks ok

### Garbage
After inspecting the top rows, drop the ones that turned out to be garbage.

In [None]:
# Drop the rows identified as garbage during manual inspection. Reassigning
# (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: labels that were already removed are skipped instead of
# raising a KeyError.
df_simsearch = df_simsearch.drop(index=[0, 4, 6], errors="ignore")  # drop garbage

### Display the short texts (cases where the extraction has not worked properly) and drop them

In [None]:
df_simsearch["text_length"].tail(40)

In [None]:
# Keep only the rows whose extracted text has at least 300 characters.
# .copy() makes df_dropped an independent frame, so modifying it later does not
# trigger pandas' SettingWithCopyWarning.
df_dropped = df_simsearch[df_simsearch["text_length"] >= 300].copy()

In [None]:
len(df_dropped)

In [None]:
# Check the extracted text of the last (up to) 20 rows, i.e. the shortest
# remaining texts. display_text_as_markdown renders the text itself and
# returns None, so its result is not printed. tail() also copes with frames
# that have fewer than 20 rows.
for short_text in df_dropped["extr_text"].tail(20):
    display_text_as_markdown(short_text)

In [None]:
# check text from middle length

display_text_as_markdown(df_dropped.iloc[14]["filtered_text"])

# Problems

- [ ] a lot of Chinese and Japanese websites

- relatively low quality text for the lowest few


In [None]:
# Remove the helper columns that are no longer needed. Reassigning instead of
# dropping in place avoids modifying a slice of df_simsearch, and
# errors="ignore" keeps the cell re-runnable once the columns are gone.
df_dropped = df_dropped.drop(
    columns=["content_combined", "url_list", "extr_text", "index", "text_length"],
    errors="ignore",
)

In [None]:
df_dropped["filtered_text"].str.len()

In [None]:
df_extr_text = df_dropped

In [None]:
df_extr_text = df_dropped.rename(columns={"filtered_text": "extr_text"})

In [None]:
df_extr_text.head(2)

In [None]:
%store df_extr_text

# Translation

These were first attempts and are suspended for now. The cells have been turned into markdown cells, so the following section can be ignored.

# switch to gpu if possible
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"

device

# prepare hugging face
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get Hugging Face API key from environment
hf_api_key = os.getenv('HUGGINGFACE_API_KEY')

# Ensure the API key is loaded
if hf_api_key is None:
    raise ValueError("Hugging Face API key not found. Make sure it's set in your .env file as HUGGINGFACE_API_KEY")

# from transformers import MarianMTModel, MarianTokenizer

# # Load the tokenizer and model using the API key with the updated argument
# tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en", token=hf_api_key)
# model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-zh-en", token=hf_api_key)

# # Updated translate function
# def translate(text):
#     model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     translation = model.generate(**model_inputs, max_length=512, num_beams=4, early_stopping=True)
#     translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)
#     return translated_text


# Example usage
translated_text = translate("示例文本")  # "示例文本" means example text
print(translated_text)

import nltk

from nltk.tokenize import sent_tokenize

def chunk_text_by_sentence(text, max_length=512):
    """Split ``text`` into chunks of whole sentences.

    Sentences (from ``sent_tokenize``) are joined with single spaces until
    adding the next one would make the chunk longer than ``max_length``
    characters; that sentence then starts a new chunk. A single sentence that
    is longer than ``max_length`` is kept whole as its own chunk.

    Args:
        text: The text to split.
        max_length: Maximum chunk length, measured in characters.

    Returns:
        list of non-empty chunk strings (empty list for text without sentences).
    """
    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk), tracked incrementally

    for sentence in sent_tokenize(text):
        # Length of the chunk if this sentence were appended (plus a joining
        # space unless the chunk is still empty)
        projected_length = current_length + len(sentence) + (1 if current_chunk else 0)

        # Only close a chunk that actually has content; otherwise an over-long
        # first sentence would produce an empty leading chunk.
        if current_chunk and projected_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length = projected_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def translate_text_chunked(text):
    """Translate ``text`` chunk by chunk and join the translations with spaces.

    The text is split with ``chunk_text_by_sentence`` and every chunk is passed
    to ``translate``.
    """
    translated_chunks = []
    for chunk in chunk_text_by_sentence(text):
        translated_chunks.append(translate(chunk))
    return ' '.join(translated_chunks)

def translate_non_english(row):
    """Return the row's 'extr_text', translated unless its 'language' is 'en'.

    Rows marked as English are returned unchanged; all others are sent through
    ``translate_text_chunked``.
    """
    if row['language'] == 'en':
        return row['extr_text']
    return translate_text_chunked(row['extr_text'])




# Apply the translation to the first 20 rows of df_grouped
df_trans = df_grouped.head(20).copy()
df_trans['extr_text_en'] = df_trans.apply(translate_non_english, axis=1)

df_trans

display_text_as_markdown(df_trans.iloc[10]['extr_text'])

display_text_as_markdown(df_trans.iloc[10]['extr_text_en'])

display_text_as_markdown(df_trans.iloc[8]['extr_text'])

display_text_as_markdown(df_trans.iloc[8]['extr_text_en'])

## Test other model


# from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# # Initialize the tokenizer
# tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")

# # Load the model and move it to the GPU
# model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B").to(device)

# def translate_m2m100(text, src_lang):
#     tokenizer.src_lang = src_lang
#     # Encode the text and move the tensors to the same device as the model
#     encoded_input = tokenizer(text, return_tensors="pt").to(device)
    
#     # Generate translation and move the output tensors back to CPU for decoding, if needed
#     translated_tokens = model.generate(**encoded_input).cpu()
    
#     # Decode the translated tokens
#     translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
#     return translated_text





# Translate a sample text from German to English
sample_text_german = "Dies ist ein Beispieltext."
translated_text = translate_m2m100(sample_text_german, "de")
print(translated_text)

# adjust functions for new model

def translate_text_chunked(text, lang):
    """Translate ``text`` from ``lang`` chunk by chunk with ``translate_m2m100``.

    Args:
        text: The text to translate; it is split with ``chunk_text_by_sentence``.
        lang: Source language code. Region-tagged Chinese codes such as
            'zh-cn' or 'zh-tw' (presumably produced by language detection;
            confirm) are normalised to plain 'zh' before translation.

    Returns:
        The translated chunks joined with single spaces.
    """
    # Map every Chinese variant to 'zh', not only 'zh-cn', so that other
    # region-tagged codes are not passed on unchanged.
    if lang.startswith('zh'):
        lang = 'zh'
    chunks = chunk_text_by_sentence(text)
    translated_chunks = [translate_m2m100(chunk, lang) for chunk in chunks]
    return ' '.join(translated_chunks)

def translate_non_english(row):
    """Return the row's 'extr_text', translated unless its 'language' is 'en'.

    For non-English rows the text and the row's language code are handed to
    ``translate_text_chunked``; English rows are returned unchanged.
    """
    source_language = row['language']
    if source_language == 'en':
        return row['extr_text']
    return translate_text_chunked(row['extr_text'], source_language)

df_trans1 = df_grouped.head(20).copy()
df_trans1['extr_text_en'] = df_trans1.apply(translate_non_english, axis=1) # wow gpu so much faster

df_trans1

display_text_as_markdown(df_trans1.iloc[8]['extr_text'])
display_text_as_markdown(df_trans1.iloc[8]['extr_text_en'])