In [84]:
import pandas as pd
import numpy as np
from typing import List
import re

In [43]:
translate_df = pd.read_csv("../app/eval_set.csv", header=None)

In [44]:
print(translate_df)

                                                    0  \
0                              15+ years track record   
1   125,500 people already found a broker via this...   
2                          Mutual funds are available   
3                                Best futures brokers   
4   Access to archived threads is only available t...   
5                       across [dataPoints]+ criteria   
6                        Alternatives to [brokerName]   
7                          Available in [countryName]   
8                                  Best forex brokers   
9                             Best stock trading apps   
10                        BrokerChooser Awards [year]   
11        Broker is not available in [countryTheName]   
12  Calculate stock trade commission at various br...   
13            Check out the best brokers in [country]   
14              Complete Find My Broker questionnaire   
15                                  FX Fee Calculator   
16                             

In [48]:
def row_terms(row: pd.Series, col_name: str) -> List[str]:
    pattern = r"\[.*?\]|\S+"
    unnecessary = ".;,?!"
    return [str(elem).strip(unnecessary).lower() for elem in re.findall(pattern, str(row[col_name]))]

In [49]:
row_index = 17
row_terms(translate_df.loc[row_index], 0)

['[broker name]',
 '[param name]',
 'fees',
 'are',
 'about',
 'half',
 'of',
 'the',
 'industry',
 'average']

In [50]:
def get_df_terms(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    new_df = df.copy()
    new_df[f"{str(col_name)}_terms"] = new_df.apply(
        lambda row: row_terms(row, col_name), axis=1
    )
    return new_df

In [85]:
row_index = 4
terms_df = get_df_terms(translate_df, 0)
terms_df = get_df_terms(terms_df, 1)
for element in terms_df.loc[row_index]["0_terms"]:
    print(element)

access
to
archived
threads
is
only
available
to
registered
users
without
this
we
can
only
keep
them
for
you
for
1
month


In [86]:
print(terms_df.columns)

Index([0, 1, '0_terms', '1_terms'], dtype='object')


In [172]:
def calculate_term_freq_in_doc(term: str, row: pd.Series, col_name: str) -> dict:
    return sum([int(element == term) for element in row[f"{str(col_name)}_terms"]])

def calcualte_doc_length(row: pd.Series, col_name: str) -> int:
    return len(row[f"{str(col_name)}_terms"])


def docs_contain(term: str, df: pd.DataFrame, col_name: str) -> int:
    return df[f"{str(col_name)}_terms"].apply(
        lambda row: term.lower() in row
    ).sum().astype("int")
    
    
def term_idf(term: str, df: pd.DataFrame, col_name: str) -> float:
    doc_num = df.shape[0]
    return np.log(doc_num/docs_contain(str(term).lower(), df, col_name))
    


def calcualte_term_bm25_for_row(term: str, row: pd.Series, k: float, b: float, df: pd.DataFrame, col_name: str) -> dict:
    idf = term_idf(term, df, col_name)
    freq = calculate_term_freq_in_doc(str(term).lower(), row, col_name)
    doc_length = calcualte_doc_length(row, col_name)
    avg_doc_length = df.apply(
        lambda row: calcualte_doc_length(row, col_name), axis=1
    ).mean()
    
    return idf*(freq*(k + 1.0))/(freq + k*(1.0 - b + b*doc_length/avg_doc_length))
    
    
def calcualte_distribution_for_row(row: pd.Series, k: float, b: float, df: pd.DataFrame, col_name: str) -> List[float]:
    term_distribution = []
    for term in row[f"{str(col_name)}_terms"]:
        term_distribution.append(
            calcualte_term_bm25_for_row(
                str(term).lower(),
                row,
                k,
                b,
                df,
                col_name
            )
        )
    return sorted(term_distribution)


def calculate_max_distance(first_list: List[float], second_list: List[float]) -> float:
    max_distance = 0.0
    for target in first_list + second_list:
        first_cdf = sum(elem <= target for elem in first_list)/len(first_list)
        second_cdf = sum(elem <= target for elem in second_list)/len(second_list)
        if np.abs(first_cdf - second_cdf) > max_distance:
            max_distance = np.abs(first_cdf - second_cdf)
        #print(target, first_cdf, second_cdf)
    return max_distance


def bm25_term_similarity(
    k: float,
    b: float,
    df: pd.DataFrame,
    col_name_original: str,
    col_name_translated: str
) -> float:
    assert col_name_original in df.columns, f"Column {col_name_original} not found in DataFrame"
    assert col_name_translated in df.columns, f"Column {col_name_translated} not found in DataFrame"
    assert k > 0, "Parameter k must be positive"
    assert b > 0, "Parameter b must be positive"
    
    if df.empty:
        raise ValueError("DataFrame is empty or contains no rows")
    
    terms_df = get_df_terms(df, col_name_original)
    terms_df = get_df_terms(terms_df, col_name_translated)
    o_bm_name = f"{col_name_original}_bm25s"
    t_bm_name = f"{col_name_translated}_bm25s"
    o_bm_avg_name = f"{col_name_original}_bm25s_avg"
    t_bm_avg_name = f"{col_name_translated}_bm25s_avg"
    terms_df[o_bm_name] = terms_df.apply(
        lambda row: calcualte_distribution_for_row(
            row,
            k,
            b,
            terms_df,
            col_name_original
        ),
        axis=1
    )
    terms_df[t_bm_name] = terms_df.apply(
        lambda row: calcualte_distribution_for_row(
            row,
            k,
            b,
            terms_df,
            col_name_translated
        ),
        axis=1
    )
    terms_df[o_bm_avg_name] = terms_df.apply(
        lambda row: np.average(row[f"{col_name_original}_bm25s"]), axis=1
    )
    terms_df[t_bm_avg_name] = terms_df.apply(
        lambda row: np.average(row[t_bm_name]), axis=1
    )
    terms_df[o_bm_name] = terms_df.apply(
        lambda row: [elem - row[o_bm_avg_name] for elem in row[o_bm_name]],
        axis=1
    )
    terms_df[t_bm_name] = terms_df.apply(
        lambda row: [elem - row[t_bm_avg_name] for elem in row[t_bm_name]],
        axis=1
    )
    
    return 1.0 - terms_df.apply(
        lambda row: calculate_max_distance(
            row[f"{col_name_original}_bm25s"],
            row[f"{col_name_translated}_bm25s"]
        ), axis=1
    ).mean()

In [96]:
row_index = 20
print(calcualte_doc_length(terms_df.loc[row_index], 0))
print(terms_df.apply(
    lambda row: calcualte_doc_length(row, 0), axis=1
).mean())
print(terms_df.apply(
    lambda row: calcualte_doc_length(row, 0), axis=1
))

12
5.956521739130435
0      4
1      9
2      4
3      3
4     22
5      4
6      3
7      3
8      3
9      4
10     3
11     6
12     7
13     7
14     5
15     3
16     4
17    10
18     4
19     5
20    12
21     7
22     5
dtype: int64


TypeError: calculate_term_freq_in_doc() missing 1 required positional argument: 'col_name'

In [97]:
docs_contain("best", terms_df, 0)

np.int64(4)

In [137]:
b = 0.75
k = 1.2

term = "brokerchooser"
row_id = 17

first_bms = calcualte_distribution_for_row(
    terms_df.loc[row_id],
    k,
    b,
    terms_df,
    0
)

second_bms = calcualte_distribution_for_row(
    terms_df.loc[row_id],
    k,
    b,
    terms_df,
    1
)

print(first_bms)
print(second_bms)
print(calculate_max_distance(first_bms, second_bms))

[np.float64(1.5941734948752988), np.float64(1.5941734948752988), np.float64(1.9115123252668873), np.float64(1.9115123252668873), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572)]
[np.float64(0.7343403507450252), np.float64(1.5941734948752988), np.float64(1.9115123252668873), np.float64(1.9115123252668873), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572), np.float64(2.454006639005572)]
0.0


In [110]:
calcualte_term_bm25_for_row(
    "BrokerChooser",
    terms_df.loc[10],
    k,
    b,
    terms_df,
    1
)

doc_num idf
23
docs contain idf
1
IDF
3.1354942159291497
freq
1
doc length
3
Average doc length
5.956521739130435


np.float64(3.9343795032516478)

In [127]:
num_list = [
    0.2,
    0.3,
    0.3,
    0.4
]
second_list = [
    0.23,
    0.27,
    0.3,
    0.35,
    0.41
]
target = 0.3
print(sum(elem <= target for elem in num_list)/len(num_list))
print(calculate_max_distance(num_list, second_list))

0.75
0.2 0.25 0.0
0.3 0.75 0.6
0.3 0.75 0.6
0.4 1.0 0.8
0.23 0.25 0.2
0.27 0.25 0.4
0.3 0.75 0.6
0.35 0.75 0.8
0.41 1.0 1.0
0.25


In [138]:
translated_df = pd.read_csv("translated.csv")

In [139]:
print(translated_df)

    Unnamed: 0                                            English  \
0            0                             15+ years track record   
1            1  125,500 people already found a broker via this...   
2            2                         Mutual funds are available   
3            3                               Best futures brokers   
4            4  Access to archived threads is only available t...   
5            5                      across [dataPoints]+ criteria   
6            6                       Alternatives to [brokerName]   
7            7                         Available in [countryName]   
8            8                                 Best forex brokers   
9            9                            Best stock trading apps   
10          10                        BrokerChooser Awards [year]   
11          11        Broker is not available in [countryTheName]   
12          12  Calculate stock trade commission at various br...   
13          13            Check ou

In [192]:
languages = [
    "Hungarian",
    "Spanish",
    "French",
    "German",
    "Japanese",
    "Arabic",
    "Hindi",
    "Portugese",
]
similarities = []
for language in languages:
    similarities.append({
        "language": language,
        "similarity": bm25_term_similarity(
            k,
            b,
            translated_df,
            "English",
            language
        )
    })
print(pd.DataFrame(similarities).to_markdown(index=False))

| language   |   similarity |
|:-----------|-------------:|
| Hungarian  |     0.553473 |
| Spanish    |     0.508901 |
| French     |     0.503316 |
| German     |     0.485719 |
| Japanese   |     0.490617 |
| Arabic     |     0.532907 |
| Hindi      |     0.45173  |
| Portugese  |     0.543742 |


In [174]:
google_df = pd.read_csv("google_translation.csv")

In [191]:
similarities = []
for language in languages:
    similarities.append({
        "language": language,
        "similarity": bm25_term_similarity(
            k,
            b,
            google_df,
            "English",
            language
        )
    })
print(pd.DataFrame(similarities).to_markdown(index=False))


| language   |   similarity |
|:-----------|-------------:|
| Hungarian  |     0.539884 |
| Spanish    |     0.487512 |
| French     |     0.462427 |
| German     |     0.469132 |
| Japanese   |     0.458423 |
| Arabic     |     0.520939 |
| Hindi      |     0.515892 |
| Portugese  |     0.534345 |


In [188]:
def extract_placeholders(text: str, placeholder_style: str = r"\[(.*?)\]") -> List[str]:
    return list(set(re.findall(placeholder_style, text)))

def placeholder_identity(original: str, translated: str) -> float:
    original_placeholders = extract_placeholders(original)
    translated_placeholders = extract_placeholders(translated)
    # if not float(set(original_placeholders) == set(translated_placeholders)):
    #     print(set(original_placeholders))
    #     print(set(translated_placeholders))
    #     print()

    return float(set(original_placeholders) == set(translated_placeholders))

In [189]:
placeholders_changed = []
for language in languages:
    placeholders_changed.append({
        "language": language,
        "Wrong": google_df.apply(
            lambda row: 1 - placeholder_identity(row["English"], row[language]),
            axis=1
        ).astype("int").sum()
    })
print(pd.DataFrame(placeholders_changed).to_markdown(index=False))

| language   |   Wrong |
|:-----------|--------:|
| Hungarian  |       7 |
| Spanish    |       7 |
| French     |       7 |
| German     |       8 |
| Japanese   |       6 |
| Arabic     |       6 |
| Hindi      |       8 |
| Portugese  |       7 |


In [194]:
from langchain_openai import OpenAIEmbeddings
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.evaluation import load_evaluator
from langchain.chains.base import Chain

from dotenv import load_dotenv

load_dotenv()


def openai_embeddings_similarity(original: str, translated: str) -> dict:
    embedding_model = OpenAIEmbeddings(
        model="text-embedding-3-large",
    )
    evaluator = load_evaluator(
        EvaluatorType.EMBEDDING_DISTANCE,
        embeddings=embedding_model
    )

    if not hasattr(evaluator, "evaluate_strings"):
        raise TypeError(f"Unexpected evaluator type: {type(evaluator)}")

    if isinstance(evaluator, StringEvaluator):
        return evaluator.evaluate_strings(
            prediction=translated,
            reference=original
        )
    elif isinstance(evaluator, Chain):
        raise TypeError("Evaluator is a Chain, which does not support evaluate_strings.")
    else:
        raise TypeError(f"Unexpected evaluator type: {type(evaluator)}")

In [200]:
similarities = []
print(translated_df["English"])
for language in languages:
    similarities.append({
        "language": language,
        "similarity": translated_df.apply(
            lambda row: openai_embeddings_similarity(
                str(translated_df["English"]),
                str(translated_df[1][language])
            ),
            axis=1
        )
    })
print(pd.DataFrame(similarities).to_markdown(index=False))

0                                15+ years track record
1     125,500 people already found a broker via this...
2                            Mutual funds are available
3                                  Best futures brokers
4     Access to archived threads is only available t...
5                         across [dataPoints]+ criteria
6                          Alternatives to [brokerName]
7                            Available in [countryName]
8                                    Best forex brokers
9                               Best stock trading apps
10                          BrokerChooser Awards [year]
11          Broker is not available in [countryTheName]
12    Calculate stock trade commission at various br...
13              Check out the best brokers in [country]
14                Complete Find My Broker questionnaire
15                                    FX Fee Calculator
16                                Scam brokers to avoid
17    [Broker name] [param name] fees are about 

KeyError: (0, 'Hungarian')