# **Our research workflow:**


1.   English proverbs analysis with clustering and cosine similarity.
2.   Chinese proverbs analysis with clustering under the pre-assigned classes.
3.   Multi-language analysis and clustering - using all of our data.


We then examined these results and looked for interesting findings for our paper.



In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics import adjusted_rand_score


# File names of the Excel workbooks used throughout the notebook, defined once
# here for later cells:


# One variable per language, holding that language's workbook file name.

english = "4437_English_proverbs With vectors.xlsx"

hebrew = "updated_1867_Hebrew_Proverbs with Vectors.xlsx"

arabic = "updated_kaggle_Arabic_proverbs with Vectors.xlsx"

french = "updated_kaggle_French_proverbs with Vectors.xlsx"

chinese = "updated_kaggle_Chinese_proverbs with Vectors.xlsx"

# Names of the Excel columns that hold the embedding vectors, one column per
# embedding model.
vector_columns = [
    "Vector bert-base-uncased",
    "Vector paraphrase-multilingual-MiniLM-L12-v2",
    "Vector All-MiniLM-L12-v2",
    "Vector roberta-large-nli-stsb-mean-tokens"
]

## Clustering and analysing using the English proverbs

In [None]:
# Load the dataset
def load_data(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a DataFrame.

    The workbook is read with ``pd.read_excel`` using its default arguments.
    """
    return pd.read_excel(file_path)

# Convert vector column from string to numpy array
def process_vectors(data, vector_column):
    """Convert the cells of ``vector_column`` from text to numpy arrays, in place.

    Parameters:
    - data: DataFrame that contains ``vector_column``.
    - vector_column (str): Column whose cells hold a Python-literal list of
      numbers written as text (for example ``"[0.1, 0.2]"``).

    Returns:
    The same DataFrame object, with ``vector_column`` replaced by numpy arrays.

    Raises:
    ValueError or SyntaxError if a text cell is not a plain Python literal.
    """
    # Local import so this cell works without touching the notebook's import cell.
    import ast

    def _to_array(cell):
        if isinstance(cell, str):
            # SECURITY: this used to be eval(), which executes whatever
            # expression is stored in the spreadsheet. literal_eval accepts only
            # literals (lists, numbers, ...) and rejects everything else.
            return np.array(ast.literal_eval(cell))
        # Already parsed (e.g. the cell was re-run on the same frame): keep the
        # value instead of failing on a non-string.
        return np.asarray(cell)

    data[vector_column] = data[vector_column].apply(_to_array)
    return data

# Perform clustering
def perform_clustering(vectors, num_clusters):
    """Assign every vector to one of ``num_clusters`` KMeans clusters.

    ``vectors`` is materialised with ``list(...)`` before fitting, and the model
    uses ``random_state=42``. Returns the labels produced by ``fit_predict``.
    """
    return KMeans(n_clusters=num_clusters, random_state=42).fit_predict(list(vectors))

# Find pairs with high cosine similarity within each cluster
def find_similar_pairs(data, vector_column, cluster_column, threshold=0.75, max_similarity=0.91):
    """Collect sentence pairs within each cluster whose cosine similarity is in range.

    Parameters:
    - data: DataFrame with a "trimmed" text column plus ``vector_column`` and
      ``cluster_column``.
    - vector_column (str): Column whose cells are equal-length numeric vectors.
    - cluster_column (str): Column holding the cluster label of every row.
    - threshold (float): Inclusive lower bound on the cosine similarity.
    - max_similarity (float): Inclusive upper bound on the cosine similarity
      (default 0.91, the value that used to be hard-coded).

    Returns:
    DataFrame with the columns sentence_1, sentence_2, similarity and cluster.
    Each unordered pair appears once, ordered by cluster (first-appearance
    order), then by row position. The columns are present even when no pair
    qualifies, so callers can sort by "similarity" or write a CSV header.
    """
    columns = ["sentence_1", "sentence_2", "similarity", "cluster"]
    collected = {name: [] for name in columns}

    for cluster_id in data[cluster_column].unique():
        cluster_data = data[data[cluster_column] == cluster_id]
        sentences = cluster_data["trimmed"].to_numpy()
        similarities = cosine_similarity(np.stack(cluster_data[vector_column].values))

        # Keep only the strict upper triangle so each pair (i < j) is reported
        # once; np.nonzero walks it row by row, matching a nested i/j loop.
        in_range = (similarities >= threshold) & (similarities <= max_similarity)
        first, second = np.nonzero(np.triu(in_range, k=1))

        collected["sentence_1"].extend(sentences[first])
        collected["sentence_2"].extend(sentences[second])
        collected["similarity"].extend(similarities[first, second])
        collected["cluster"].extend([cluster_id] * len(first))

    return pd.DataFrame(collected, columns=columns)

# Main workflow
def main(file_path, vector_columns, num_clusters=20, similarity_threshold=0.75, output_csv=True):
    """Cluster the proverbs once per embedding column and report similar pairs.

    Parameters:
    - file_path: Workbook passed to ``load_data``.
    - vector_columns: Names of the embedding columns to process, in order.
    - num_clusters (int): Number of KMeans clusters.
    - similarity_threshold (float): Lower similarity bound given to
      ``find_similar_pairs``.
    - output_csv (bool): When true, write each column's pairs to
      ``similar_pairs_<column>.csv`` (spaces in the column name become "_").
    """
    frame = load_data(file_path)

    for vector_column in vector_columns:
        print(f"Processing for vector representation: {vector_column}")

        # Parse the stored vectors, then label every row with its cluster.
        frame = process_vectors(frame, vector_column)
        cluster_column = f"{vector_column}_cluster"
        frame[cluster_column] = perform_clustering(frame[vector_column], num_clusters)

        # In-cluster pairs, most similar first.
        ranked_pairs = find_similar_pairs(
            frame, vector_column, cluster_column, similarity_threshold
        ).sort_values(by="similarity", ascending=False)
        print(f"Similar pairs for {vector_column}:")
        print(ranked_pairs)

        if not output_csv:
            continue
        output_file = f"similar_pairs_{vector_column.replace(' ', '_')}.csv"
        ranked_pairs.to_csv(output_file, index=False)
        print(f"Saved similar pairs to {output_file}")


# Run the workflow on the English proverbs workbook.
# `file_path` is only a parameter name of main() and is not defined at module
# level; this section analyses the English data, so pass `english`.
main(english, vector_columns)

Processing for vector representation: Vector bert-base-uncased
Similar pairs for Vector bert-base-uncased:
                                              sentence_1  \
58424                    englishman's home is his castle   
38761                                          acid test   
38760                                          acid test   
31170                                          in spades   
76525                               fight the good fight   
...                                                  ...   
76404  famous last words (dying statements of famous ...   
17480                                      alphabet soup   
18652                                   blow a raspberry   
21577                                       gregory peck   
74817             all promises are either broken or kept   

                                              sentence_2  similarity  cluster  
58424                                     zero tolerance    0.910000       11  
38761       

## Clustering the Chinese proverbs.

This code was not used to run the clustering, but is still saved.

In [None]:
def load_data(file_path):
    """Return the contents of the Excel file at ``file_path`` as a DataFrame."""
    return pd.read_excel(file_path)

def process_vectors(data, vector_column):
    """Convert the cells of ``vector_column`` from text to numpy arrays, in place.

    Text cells must hold a Python-literal list of numbers (for example
    ``"[0.1, 0.2]"``). Returns the same DataFrame object. Raises ValueError or
    SyntaxError if a text cell is not a plain Python literal.
    """
    # Local import so this cell works without touching the notebook's import cell.
    import ast

    def _to_array(cell):
        if isinstance(cell, str):
            # SECURITY: this used to be eval(), which executes whatever
            # expression is stored in the spreadsheet. literal_eval accepts only
            # literals and rejects everything else.
            return np.array(ast.literal_eval(cell))
        # Already parsed (e.g. a re-run on the same frame): keep the value
        # instead of failing on a non-string.
        return np.asarray(cell)

    data[vector_column] = data[vector_column].apply(_to_array)
    return data

def perform_clustering(vectors, num_clusters):
    """Fit KMeans (``random_state=42``) on ``list(vectors)`` and return the labels."""
    model = KMeans(n_clusters=num_clusters, random_state=42)
    return model.fit_predict(list(vectors))

def evaluate_clustering(data, cluster_column, category_column):
    """Return the Adjusted Rand Index between the category and cluster columns.

    ``data[category_column]`` is passed as the reference labelling and
    ``data[cluster_column]`` as the predicted one.
    """
    reference_labels = data[category_column]
    predicted_labels = data[cluster_column]
    return adjusted_rand_score(reference_labels, predicted_labels)

def find_similar_pairs(data, vector_column, cluster_column, threshold=0.75, max_similarity=0.91):
    """Find pairs with a cosine similarity in range within each cluster.

    Parameters:
    - data: DataFrame with a "trimmed" text column plus ``vector_column`` and
      ``cluster_column``.
    - vector_column (str): Column whose cells are equal-length numeric vectors.
    - cluster_column (str): Column holding the cluster label of every row.
    - threshold (float): Inclusive lower bound on the cosine similarity.
    - max_similarity (float): Inclusive upper bound on the cosine similarity
      (default 0.91, the value that used to be hard-coded).

    Returns:
    DataFrame with the columns sentence_1, sentence_2, similarity and cluster,
    one row per unordered pair. The columns exist even when nothing qualifies,
    so sorting by "similarity" never fails on an empty result.
    """
    columns = ["sentence_1", "sentence_2", "similarity", "cluster"]
    collected = {name: [] for name in columns}

    for cluster_id in data[cluster_column].unique():
        cluster_data = data[data[cluster_column] == cluster_id]
        sentences = cluster_data["trimmed"].to_numpy()
        similarities = cosine_similarity(np.stack(cluster_data[vector_column].values))

        # The strict upper triangle reports each pair (i < j) exactly once, in
        # the same row-by-row order as a nested i/j loop.
        in_range = (similarities >= threshold) & (similarities <= max_similarity)
        first, second = np.nonzero(np.triu(in_range, k=1))

        collected["sentence_1"].extend(sentences[first])
        collected["sentence_2"].extend(sentences[second])
        collected["similarity"].extend(similarities[first, second])
        collected["cluster"].extend([cluster_id] * len(first))

    return pd.DataFrame(collected, columns=columns)

def main(file_path, vector_columns, category_column="category", similarity_threshold=0.75, output_csv=False):
    """Cluster the proverbs per embedding column and score against the categories.

    Parameters:
    - file_path: Workbook passed to ``load_data``.
    - vector_columns: Names of the embedding columns to process, in order.
    - category_column (str): Column with the pre-assigned classes; its number of
      distinct values is used as the number of clusters.
    - similarity_threshold (float): Lower similarity bound given to
      ``find_similar_pairs``.
    - output_csv (bool): When true, write the per-column pairs and the ARI
      summary to CSV files.
    """
    frame = load_data(file_path)

    # One cluster per distinct pre-assigned category.
    num_clusters = frame[category_column].nunique()

    ari_rows = []
    for vector_column in vector_columns:
        print(f"Processing for vector representation: {vector_column}")

        # Parse the stored vectors, then label every row with its cluster.
        frame = process_vectors(frame, vector_column)
        cluster_column = f"{vector_column}_cluster"
        frame[cluster_column] = perform_clustering(frame[vector_column], num_clusters)

        # Agreement between the clusters and the pre-assigned categories.
        ari = evaluate_clustering(frame, cluster_column, category_column)
        print(f"Adjusted Rand Index for {vector_column}: {ari}")
        ari_rows.append({"vector_column": vector_column, "ari": ari})

        # In-cluster pairs, most similar first; only the top rows are printed.
        similar_pairs = find_similar_pairs(
            frame, vector_column, cluster_column, similarity_threshold
        ).sort_values(by="similarity", ascending=False)
        print(f"Similar pairs for {vector_column}:\n", similar_pairs.head())

        if not output_csv:
            continue
        output_file = f"similar_pairs_{vector_column.replace(' ', '_')}.csv"
        similar_pairs.to_csv(output_file, index=False)
        print(f"Saved similar pairs to {output_file}")

    # One row per embedding column with its ARI.
    results_summary_df = pd.DataFrame(ari_rows)
    print("\nSummary of Adjusted Rand Index scores:")
    print(results_summary_df)
    if output_csv:
        results_summary_df.to_csv("clustering_evaluation_summary.csv", index=False)
        print("Saved clustering evaluation summary to clustering_evaluation_summary.csv")





# main(chin, vector_columns, category_column="category")


Processing for vector representation: Vector bert-base-uncased
Adjusted Rand Index for Vector bert-base-uncased: 0.14895844413218606
Similar pairs for Vector bert-base-uncased:
      sentence_1        sentence_2  similarity  cluster
6      有情人终成眷属。          桂林山水甲天下。    0.909712        6
1      一日之计在于晨。             逆来顺受。    0.909106        3
8  但愿人长久，千里共婵娟。             龙马精神。    0.908993        6
0     道不同，不相为谋。  一寸光阴一寸金，寸金难买寸光阴。    0.908515        3
9  但愿人长久，千里共婵娟。             车水马龙。    0.907023        6
Saved similar pairs to similar_pairs_Vector_bert-base-uncased.csv
Processing for vector representation: Vector paraphrase-multilingual-MiniLM-L12-v2
Adjusted Rand Index for Vector paraphrase-multilingual-MiniLM-L12-v2: 0.11347488852535291
Similar pairs for Vector paraphrase-multilingual-MiniLM-L12-v2:
        sentence_1    sentence_2  similarity  cluster
92          知音难觅。      广交友，无深交。    0.887603        5
128         龙飞凤舞。         龙马精神。    0.875140        6
54       清官难断家务事。       家丑不可外

Running the clustering on the Chinese proverbs

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, fowlkes_mallows_score
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def load_data(file_path):
    """Read the Excel file at ``file_path`` into a DataFrame and return it."""
    return pd.read_excel(file_path)

def process_vectors(data, vector_column):
    """Convert the cells of ``vector_column`` from text to numpy arrays, in place.

    Text cells must hold a Python-literal list of numbers (for example
    ``"[0.1, 0.2]"``). Returns the same DataFrame object. Raises ValueError or
    SyntaxError if a text cell is not a plain Python literal.
    """
    # Local import so this cell works without touching the notebook's import cell.
    import ast

    def _to_array(cell):
        if isinstance(cell, str):
            # SECURITY: this used to be eval(), which executes whatever
            # expression is stored in the spreadsheet. literal_eval accepts only
            # literals and rejects everything else.
            return np.array(ast.literal_eval(cell))
        # Already parsed (e.g. a re-run on the same frame): keep the value
        # instead of failing on a non-string.
        return np.asarray(cell)

    data[vector_column] = data[vector_column].apply(_to_array)
    return data

def perform_clustering(vectors, num_clusters):
    """Run KMeans (``random_state=42``) on ``list(vectors)`` and return the labels."""
    cluster_labels = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(list(vectors))
    return cluster_labels

def evaluate_clustering(data, cluster_column, category_column, vectors):
    """Score one clustering with supervised and unsupervised metrics.

    Parameters:
    - data: DataFrame holding ``cluster_column`` and ``category_column``.
    - cluster_column (str): Column with the predicted cluster labels.
    - category_column (str): Column with the reference categories.
    - vectors: Iterable of the vectors that were clustered.

    Returns:
    Dict mapping metric name to value, in the order the metrics are listed below.
    """
    reference = data[category_column]
    assigned = data[cluster_column]
    samples = list(vectors)

    return {
        # Supervised metrics: compare the clusters with the reference categories.
        "Adjusted Rand Index": adjusted_rand_score(reference, assigned),
        "Normalized Mutual Information": normalized_mutual_info_score(reference, assigned),
        "Homogeneity": homogeneity_score(reference, assigned),
        "Completeness": completeness_score(reference, assigned),
        "V-Measure": v_measure_score(reference, assigned),
        "Fowlkes-Mallows Index": fowlkes_mallows_score(reference, assigned),
        # Unsupervised metrics: use only the vectors and the cluster labels.
        # Only the silhouette score is asked for the cosine metric.
        "Silhouette Score": silhouette_score(samples, assigned, metric='cosine'),
        "Davies-Bouldin Index": davies_bouldin_score(samples, assigned),
        "Calinski-Harabasz Index": calinski_harabasz_score(samples, assigned),
    }

def main(file_path, vector_columns, category_column="category", output_csv=True):
    """Cluster the proverbs per embedding column and print every evaluation metric.

    Parameters:
    - file_path: Workbook passed to ``load_data``.
    - vector_columns: Names of the embedding columns to process, in order.
    - category_column (str): Column with the pre-assigned classes; its number of
      distinct values is used as the number of clusters.
    - output_csv (bool): When true, write the metric summary to
      ``clustering_evaluation_summary.csv``.
    """
    frame = load_data(file_path)

    # One cluster per distinct pre-assigned category.
    num_clusters = frame[category_column].nunique()

    summary_rows = []
    for vector_column in vector_columns:
        print(f"Processing for vector representation: {vector_column}")

        # Parse the stored vectors, then label every row with its cluster.
        frame = process_vectors(frame, vector_column)
        cluster_column = f"{vector_column}_cluster"
        frame[cluster_column] = perform_clustering(frame[vector_column], num_clusters=num_clusters)

        evaluation_results = evaluate_clustering(
            frame, cluster_column, category_column, frame[vector_column]
        )
        print(f"Evaluation results for {vector_column}:")
        for metric, value in evaluation_results.items():
            print(f"{metric}: {value:.4f}")

        # NOTE(review): this takes the numerically largest value across metrics
        # that are on different scales, so the "best" label mostly reflects
        # scale rather than quality - confirm this is what the paper needs.
        best_metric, best_value = max(evaluation_results.items(), key=lambda item: item[1])
        print(f"Best Metric for {vector_column}: {best_metric} with value {best_value:.4f}\n")

        summary_rows.append({"vector_column": vector_column, **evaluation_results})

    # One row per embedding column, one column per metric.
    results_summary_df = pd.DataFrame(summary_rows)
    print("\nSummary of evaluation scores:")
    print(results_summary_df)
    if output_csv:
        results_summary_df.to_csv("clustering_evaluation_summary.csv", index=False)
        print("Saved clustering evaluation summary to clustering_evaluation_summary.csv")

# Example usage
# main("path_to_file.xlsx", ["bert-base-uncased", "paraphrase-multilingual-MiniLM-L12-v2", "All-MiniLM-L12-v2", "roberta-large-nli-stsb-mean-tokens"], category_column="category")



# `chin` is not defined anywhere in this notebook; the Chinese workbook name is
# stored in `chinese`.
main(chinese, vector_columns, category_column="category")



Processing for vector representation: Vector bert-base-uncased
Evaluation results for Vector bert-base-uncased:
Adjusted Rand Index: 0.1490
Normalized Mutual Information: 0.3187
Homogeneity: 0.3102
Completeness: 0.3276
V-Measure: 0.3187
Fowlkes-Mallows Index: 0.2862
Silhouette Score: 0.0814
Davies-Bouldin Index: 2.7721
Calinski-Harabasz Index: 5.0962
Best Metric for Vector bert-base-uncased: Calinski-Harabasz Index with value 5.0962

Processing for vector representation: Vector paraphrase-multilingual-MiniLM-L12-v2
Evaluation results for Vector paraphrase-multilingual-MiniLM-L12-v2:
Adjusted Rand Index: 0.1135
Normalized Mutual Information: 0.2782
Homogeneity: 0.2617
Completeness: 0.2970
V-Measure: 0.2782
Fowlkes-Mallows Index: 0.2700
Silhouette Score: 0.0901
Davies-Bouldin Index: 2.8385
Calinski-Harabasz Index: 4.6036
Best Metric for Vector paraphrase-multilingual-MiniLM-L12-v2: Calinski-Harabasz Index with value 4.6036

Processing for vector representation: Vector All-MiniLM-L12-v2
E

## Multi-language clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity


# Load the dataset
def load_data(file_path):
    """Read an Excel workbook (openpyxl engine) and normalise its text columns.

    The "Proverbs" and "trimmed" columns, when present, are cast to ``str`` and
    round-tripped through UTF-8 with ``errors="ignore"``, which drops any
    characters that cannot be encoded.

    NOTE(review): ``astype(str)`` also turns missing cells into the text "nan" -
    confirm that is acceptable for the downstream pair listings.
    """
    frame = pd.read_excel(file_path, engine="openpyxl")

    for col in ("Proverbs", "trimmed"):
        if col not in frame.columns:
            continue
        as_text = frame[col].astype(str)
        frame[col] = as_text.str.encode("utf-8", errors="ignore").str.decode("utf-8")

    return frame


# Convert vector column from string to numpy array
def process_vectors(data, vector_column):
    """Convert the cells of ``vector_column`` from text to numpy arrays, in place.

    Text cells must hold a Python-literal list of numbers (for example
    ``"[0.1, 0.2]"``). Returns the same DataFrame object. Raises ValueError or
    SyntaxError if a text cell is not a plain Python literal.
    """
    # Local import so this cell works without touching the notebook's import cell.
    import ast

    def _to_array(cell):
        if isinstance(cell, str):
            # SECURITY: this used to be eval(), which executes whatever
            # expression is stored in the spreadsheet. literal_eval accepts only
            # literals and rejects everything else.
            return np.array(ast.literal_eval(cell))
        # Already parsed (e.g. a re-run on the same frame): keep the value
        # instead of failing on a non-string.
        return np.asarray(cell)

    data[vector_column] = data[vector_column].apply(_to_array)
    return data


# Perform clustering
def perform_clustering(vectors, num_clusters):
    """Fit KMeans (``random_state=42``) on ``vectors`` and return the labels.

    ``vectors`` is handed to ``fit_predict`` as given.
    """
    return KMeans(n_clusters=num_clusters, random_state=42).fit_predict(vectors)


def filter_same_language_pairs(data, cluster_column, vector_column, threshold_min=0.75, threshold_max=0.91):
    """Collect cross-language pairs within each cluster by cosine similarity.

    Despite the name, pairs whose two rows share a language are dropped; only
    pairs with different "Language" values are kept.

    Parameters:
    - data: DataFrame with "trimmed" and "Language" columns plus
      ``cluster_column`` and ``vector_column``.
    - cluster_column (str): Column holding the cluster label of every row.
    - vector_column (str): Column whose cells are equal-length numeric vectors.
    - threshold_min (float): Inclusive lower bound on the cosine similarity.
    - threshold_max (float): Inclusive upper bound on the cosine similarity.

    Returns:
    DataFrame with the columns sentence_1, language_1, sentence_2, language_2,
    cosine_similarity and cluster, one row per unordered pair. The columns exist
    even when nothing qualifies, so a CSV written from it always has a header.
    """
    columns = [
        "sentence_1", "language_1", "sentence_2", "language_2",
        "cosine_similarity", "cluster",
    ]
    collected = {name: [] for name in columns}

    for cluster_id in data[cluster_column].unique():
        cluster_data = data[data[cluster_column] == cluster_id]
        sentences = cluster_data["trimmed"].to_numpy()
        languages = cluster_data["Language"].to_numpy()
        similarities = cosine_similarity(np.stack(cluster_data[vector_column].values))

        # Array masks replace the nested iterrows() loops, which ran a Python
        # iteration for every ordered pair of rows in the cluster.
        keep = (
            (similarities >= threshold_min)
            & (similarities <= threshold_max)
            & (languages[:, None] != languages[None, :])
        )
        # The strict upper triangle reports each pair (i < j) exactly once, in
        # the same row-by-row order as the nested loops did.
        first, second = np.nonzero(np.triu(keep, k=1))

        collected["sentence_1"].extend(sentences[first])
        collected["language_1"].extend(languages[first])
        collected["sentence_2"].extend(sentences[second])
        collected["language_2"].extend(languages[second])
        collected["cosine_similarity"].extend(similarities[first, second])
        collected["cluster"].extend([cluster_id] * len(first))

    return pd.DataFrame(collected, columns=columns)


# Main workflow for clustering all languages together
def unified_clustering(files, vector_columns, num_clusters=20, similarity_threshold_min=0.75, similarity_threshold_max=0.91, output_csv=True):
    """Cluster the proverbs of all languages together, once per embedding column.

    Parameters:
    - files: Mapping of language name to workbook path.
    - vector_columns: Names of the embedding columns to process, in order.
    - num_clusters (int): Number of KMeans clusters.
    - similarity_threshold_min / similarity_threshold_max (float): Similarity
      bounds forwarded to ``filter_same_language_pairs``.
    - output_csv (bool): When true, write each column's cross-language pairs to
      ``filtered_cross_language_pairs_<column>.csv``.
    """
    # Load every workbook, tag its rows with the language, then stack them.
    per_language = []
    for lang, file in files.items():
        frame = load_data(file)
        frame["Language"] = lang
        per_language.append(frame)
    combined_data = pd.concat(per_language, ignore_index=True)

    for vector_column in vector_columns:
        print(f"Processing for vector representation: {vector_column}")

        # Parse the stored vectors and cluster them as one 2-D array.
        combined_data = process_vectors(combined_data, vector_column)
        cluster_column = f"{vector_column}_cluster"
        combined_data[cluster_column] = perform_clustering(
            np.stack(combined_data[vector_column].values), num_clusters
        )

        print(f"Clustering completed for {vector_column}.")

        # Pairs from different languages inside the same cluster.
        cross_language_pairs = filter_same_language_pairs(
            combined_data,
            cluster_column=cluster_column,
            vector_column=vector_column,
            threshold_min=similarity_threshold_min,
            threshold_max=similarity_threshold_max,
        )

        if not output_csv:
            continue
        filtered_output_file = f"filtered_cross_language_pairs_{vector_column.replace(' ', '_')}.csv"
        cross_language_pairs.to_csv(filtered_output_file, index=False)
        print(f"Saved filtered cross-language pairs to {filtered_output_file}")


# Workbook for each language; the dictionary key becomes the value of the
# "Language" column that unified_clustering adds to every row.
files = {
    "English": "updated_4437_English_proverbs-1 With vectors.xlsx",
    "Hebrew": "updated_1867_Hebrew_Proverbs with Vectors.xlsx",
    "Arabic": "updated_kaggle_Arabic_proverbs with Vectors.xlsx",
    "French": "updated_kaggle_French_proverbs with Vectors.xlsx",
    "Chinese": "updated_kaggle_Chinese_proverbs with Vectors.xlsx"
}

# Names of the embedding columns to cluster on, one per embedding model.
vector_columns = [
    "Vector bert-base-uncased",
    "Vector paraphrase-multilingual-MiniLM-L12-v2",
    "Vector All-MiniLM-L12-v2",
    "Vector roberta-large-nli-stsb-mean-tokens"
]

# Run the unified clustering workflow with its default cluster count and
# similarity bounds; one CSV of cross-language pairs is written per column.
unified_clustering(files, vector_columns)

Processing for vector representation: Vector bert-base-uncased
Clustering completed for Vector bert-base-uncased.
Saved filtered cross-language pairs to filtered_cross_language_pairs_Vector_bert-base-uncased.csv
Processing for vector representation: Vector paraphrase-multilingual-MiniLM-L12-v2
Clustering completed for Vector paraphrase-multilingual-MiniLM-L12-v2.
Saved filtered cross-language pairs to filtered_cross_language_pairs_Vector_paraphrase-multilingual-MiniLM-L12-v2.csv
Processing for vector representation: Vector All-MiniLM-L12-v2
Clustering completed for Vector All-MiniLM-L12-v2.
Saved filtered cross-language pairs to filtered_cross_language_pairs_Vector_All-MiniLM-L12-v2.csv
Processing for vector representation: Vector roberta-large-nli-stsb-mean-tokens
Clustering completed for Vector roberta-large-nli-stsb-mean-tokens.
Saved filtered cross-language pairs to filtered_cross_language_pairs_Vector_roberta-large-nli-stsb-mean-tokens.csv


In [None]:
import pandas as pd

def rewrite_file_for_download(input_file, output_file, file_format="csv", encoding="utf-8"):
    """
    Load a CSV or Excel file and save it again in the requested format.

    Any failure (unsupported extension, unsupported output format, read or
    write error) is reported with a printed message instead of being raised.

    Parameters:
    - input_file (str): Path to the input file; must end in ".csv" or ".xlsx".
    - output_file (str): Path to save the rewritten file.
    - file_format (str): Format to save the file ('csv' or 'excel').
    - encoding (str): Encoding used when saving as CSV (default: 'utf-8').
      CSV input is always read as UTF-8.
    """
    try:
        # Pick the reader from the file extension.
        if input_file.endswith(".xlsx"):
            table = pd.read_excel(input_file, engine="openpyxl")
        elif input_file.endswith(".csv"):
            table = pd.read_csv(input_file, encoding="utf-8")
        else:
            raise ValueError("Unsupported file format. Use CSV or Excel.")

        # Pick the writer from the requested output format.
        if file_format == "excel":
            table.to_excel(output_file, index=False, engine="openpyxl")
            print(f"File successfully rewritten as Excel: {output_file}")
        elif file_format == "csv":
            table.to_csv(output_file, index=False, encoding=encoding)
            print(f"File successfully rewritten as CSV: {output_file}")
        else:
            raise ValueError("Unsupported output format. Use 'csv' or 'excel'.")

    except Exception as e:
        print(f"Error processing file: {e}")


# Source: the cross-language pairs CSV written for the bert-base-uncased column.
input_file = "filtered_cross_language_pairs_Vector_bert-base-uncased.csv"
# Destinations for the re-saved copies.
output_file_csv = "filtered_cross_language_pairs_cleaned.csv"
output_file_excel = "filtered_cross_language_pairs_cleaned.xlsx"

# Rewrite the file as a UTF-8 CSV
rewrite_file_for_download(input_file, output_file_csv, file_format="csv", encoding="utf-8")

# Rewrite the file as an Excel workbook
rewrite_file_for_download(input_file, output_file_excel, file_format="excel")


File successfully rewritten as CSV: filtered_cross_language_pairs_cleaned.csv
File successfully rewritten as Excel: filtered_cross_language_pairs_cleaned.xlsx


Checking how many rows were produced for each embedding model in the multi-language phase - for the results.

In [None]:
import os
import pandas as pd

def check_file_rows_with_filter(prefix="filter"):
    """
    Print the row count of every CSV/Excel file in the current directory whose
    name starts with a given prefix.

    Files with another extension are reported as unsupported, and a file that
    cannot be read is reported with its error; neither stops the scan.

    Parameters:
    - prefix (str): The prefix to filter files (default: "filter").
    """
    for file in os.listdir():
        if not file.startswith(prefix):
            continue

        try:
            # Choose the reader from the file extension.
            if file.endswith(".xlsx"):
                table = pd.read_excel(file, engine="openpyxl")
            elif file.endswith(".csv"):
                table = pd.read_csv(file, encoding="utf-8")
            else:
                print(f"Unsupported file format: {file}")
                continue

            print(f"File: {file} | Rows: {len(table)}")

        except Exception as e:
            print(f"Error reading file {file}: {e}")


# Count the rows of every file in the current directory whose name starts with
# "filter" (this includes the filtered_cross_language_pairs_* outputs).
check_file_rows_with_filter(prefix="filter")


File: filtered_cross_language_pairs_Vector_bert-base-uncased.csv | Rows: 12082
File: filtered_cross_language_pairs_Vector_roberta-large-nli-stsb-mean-tokens.csv | Rows: 887
File: filtered_cross_language_pairs_Vector_paraphrase-multilingual-MiniLM-L12-v2.csv | Rows: 22098
File: filtered_cross_language_pairs_Vector_All-MiniLM-L12-v2.csv | Rows: 61
