In [None]:
from sentence_transformers import SentenceTransformer
# Load the multilingual E5 sentence-embedding model once; later cells call `model.encode`.
model = SentenceTransformer("intfloat/multilingual-e5-small")

In [None]:
import pandas as pd
import re
from html import unescape

# Load the Google Sheet export as a CSV.
# Every column is read as text on purpose: with dtype inference, purely numeric
# answers (typically phone numbers) are parsed as int/float, which drops leading
# zeros, and the resulting non-string values are later blanked by sanitize_text.
df = pd.read_csv("cofounder_profiles_original.csv", dtype=str)

# Step 1: Clean and rename columns (German form labels -> snake_case English)
df.columns = df.columns.str.strip()  # remove leading/trailing whitespace
df = df.rename(columns={
    "Vorname": "first_name",
    "Nachname": "last_name",
    "Wohnort": "location",
    "Aktuelle Branche/Industrie": "current_industry",
    "Brancheninteresse": "industry_interest",
    "Gründungsstatus": "startup_status",
    "Skills/Hintergrund": "skills_background",
    "Ich suche...": "looking_for_1",
    "Ich suche..": "looking_for_2",
    "Ich biete...": "offering_1",
    "Ich biete..": "offering_2",
    "LinkedIn-Profil": "linkedin",
    "E-Mail-Adresse": "email",
    "Telefonnummer": "phone",
    "Zeitstempel": "timestamp"
})

# Step 2: Combine the two 'looking for' and the two 'offering' answers into one
# text field each; missing answers count as empty strings.
df["looking_for"] = df["looking_for_1"].fillna("") + " " + df["looking_for_2"].fillna("")
df["offering"] = df["offering_1"].fillna("") + " " + df["offering_2"].fillna("")

# Step 3: Drop the now-redundant source columns
df = df.drop(columns=["looking_for_1", "looking_for_2", "offering_1", "offering_2"])

# Step 4: Replace NaNs with empty strings
df = df.fillna("")
# Step 5: Clean the text data
def sanitize_text(text):
    """Return `text` as one clean line of plain text.

    HTML entities are decoded, anything that looks like an HTML tag is
    removed, line breaks become spaces, runs of whitespace collapse to a
    single space, and the result is trimmed. Non-string input yields "".
    """
    if isinstance(text, str):
        cleaned = unescape(text)  # e.g. "&amp;" -> "&"
        # Applied in order: strip tags, flatten line breaks, collapse whitespace.
        for pattern, replacement in (
            (r'<[^>]*?>', ''),
            (r'[\r\n]+', ' '),
            (r'\s+', ' '),
        ):
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""
# Apply the cleaning function to every text column. Both the classic object
# dtype and pandas' dedicated string dtypes are treated as text.
for col in df.columns:
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        df[col] = df[col].apply(sanitize_text)

# Step 6: Remove duplicates based on 'email' and 'phone' columns.
# Rows that have neither an e-mail nor a phone number all share the key ("", "")
# once NaNs are replaced by empty strings; they are different people without
# contact data, not duplicates, so they are exempt from de-duplication.
contact_cols = ["email", "phone"]
has_contact = (df[contact_cols] != "").any(axis=1)
is_repeat = df.duplicated(subset=contact_cols, keep="first")
df = df[~(has_contact & is_repeat)]

# Step 7: Save the cleaned DataFrame to a new CSV file
df.to_csv("cofounder_profiles_cleaned.csv", index=False)

# Optional: preview the cleaned DataFrame (first profile, transposed for readability)
print(df.head(1).T)

In [None]:
# Embedding the data
def embed_text(texts, prefix=""):
    """Embed an iterable of texts with the notebook-level `model`.

    Args:
        texts: Iterable of values to embed; each one is formatted into a string.
        prefix: Optional instruction prefix (e.g. "query:" or "passage:") that
            is put in front of every text, separated by a single space. With an
            empty prefix the text is passed through without a leading space.

    Returns:
        A list with one 1-D numpy vector per input text, in input order.
        An empty input yields an empty list without calling the model.
    """
    prepared = [f"{prefix} {t}" if prefix else f"{t}" for t in texts]
    if not prepared:
        return []
    # One batched call instead of one call per text: the encoder batches
    # internally, which is far cheaper than a separate forward pass per row.
    vectors = model.encode(prepared, convert_to_numpy=True)
    return list(vectors)

# Embed 'looking_for' as queries
df["embedding_looking_for"] = embed_text(df["looking_for"], prefix="query:")

# Embed 'offering' as passages
df["embedding_offering"] = embed_text(df["offering"], prefix="passage:")

# Embed industry fields.
# The E5 models are trained with only two prefixes, "query:" and "passage:";
# for symmetric similarity between short texts the model card prescribes
# "query:" on both sides. Any other prefix (previously "info:") is
# out-of-distribution input for the model.
df["embedding_current_industry"] = embed_text(df["current_industry"], prefix="query:")
df["embedding_industry_interest"] = embed_text(df["industry_interest"], prefix="query:")

# Embed skills (compared symmetrically, hence "query:" as well)
df["embedding_skills"] = embed_text(df["skills_background"], prefix="query:")

# NOTE(review): the CSV stores each embedding as the text form of a numpy array,
# which is awkward to parse back; the pickle below is the copy later cells reload.
df.to_csv("cofounder_profiles_embeddings.csv", index=False)
df.to_pickle("embedded_profiles.pkl")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load your embedded DataFrame (if not already loaded)
# NOTE(review): unpickling executes arbitrary code from the file; this is only
# safe because "embedded_profiles.pkl" is written by this notebook itself.
df = pd.read_pickle("embedded_profiles.pkl")

def _cosine_sim(vec_a, vec_b):
    """Cosine similarity of two 1-D vectors; 0.0 when either vector has zero norm."""
    a = np.asarray(vec_a, dtype=float).ravel()
    b = np.asarray(vec_b, dtype=float).ravel()
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return float(np.dot(a, b) / norm_product)


def compute_match_score(
    row_a,
    row_b,
    weight_main=0.6,
    weight_industry=0.15,
    weight_skills=0.2,
    location_bonus=0.05
):
    """Score how well two profiles fit each other.

    The score is symmetric: compute_match_score(a, b) equals
    compute_match_score(b, a), so it can be used for unordered pairs
    without depending on row order.

    Args:
        row_a, row_b: Mapping-like profiles (dict or pandas Series) providing
            the keys "embedding_looking_for", "embedding_offering",
            "embedding_industry_interest", "embedding_current_industry",
            "embedding_skills" (numeric vectors) and "location" (string).
        weight_main: Weight of the mutual "looking for" vs "offering" similarity.
        weight_industry: Weight of the mutual industry-interest vs
            current-industry similarity; 0 skips that computation.
        weight_skills: Weight of the skills similarity; 0 skips that computation.
        location_bonus: Flat bonus added when both locations are non-empty and
            equal ignoring case and surrounding whitespace.

    Returns:
        The weighted sum of the component scores plus the location bonus.
    """
    # --- Main match (looking for vs offering), averaged over both directions
    sim_a = _cosine_sim(row_a["embedding_looking_for"], row_b["embedding_offering"])
    sim_b = _cosine_sim(row_b["embedding_looking_for"], row_a["embedding_offering"])
    score_main = (sim_a + sim_b) / 2 * weight_main

    # --- Industry match, averaged over both directions like the main match.
    # Scoring only a's interest against b's industry made the result depend on
    # argument order.
    if weight_industry != 0:
        industry_a = _cosine_sim(
            row_a["embedding_industry_interest"], row_b["embedding_current_industry"]
        )
        industry_b = _cosine_sim(
            row_b["embedding_industry_interest"], row_a["embedding_current_industry"]
        )
        score_industry = (industry_a + industry_b) / 2 * weight_industry
    else:
        # A zero weight means the industry embeddings are not needed at all
        score_industry = 0

    # --- Skills match (symmetric, how similar their skills are)
    if weight_skills != 0:
        skills_sim = _cosine_sim(row_a["embedding_skills"], row_b["embedding_skills"])
        score_skills = skills_sim * weight_skills
    else:
        # A zero weight means the skills embeddings are not needed at all
        score_skills = 0

    # --- Optional location bonus (only when both locations are filled in)
    bonus = 0
    if row_a["location"] and row_b["location"]:
        if row_a["location"].strip().lower() == row_b["location"].strip().lower():
            bonus += location_bonus

    # --- Total score
    return score_main + score_industry + score_skills + bonus


results = []

# Score every ordered pair (a, b) of distinct profiles.
for idx_a, row_a in df.iterrows():
    for idx_b, row_b in df.iterrows():
        if idx_a == idx_b:
            continue  # skip self

        score = compute_match_score(row_a, row_b)

        results.append({
            "person_a": row_a["first_name"] + " " + row_a["last_name"],
            "person_b": row_b["first_name"] + " " + row_b["last_name"],
            "score": score,
            "email_a": row_a["email"],
            "email_b": row_b["email"],
            "location_a": row_a["location"],
            "location_b": row_b["location"]
        })

# Create a DataFrame of match scores. The columns are fixed explicitly so that
# sorting still works when there are fewer than two profiles (no result rows).
match_df = pd.DataFrame(
    results,
    columns=[
        "person_a", "person_b", "score",
        "email_a", "email_b", "location_a", "location_b",
    ],
)

# Sort best matches
match_df = match_df.sort_values(by="score", ascending=False)

# Top N matches per person. Grouping on name *and* e-mail keeps two different
# people who share a display name from being merged into one group.
top_matches = match_df.groupby(["person_a", "email_a"]).head(5)
top_matches.to_csv("top_matches.csv", index=False)
top_matches

In [None]:
from collections import defaultdict
import heapq
from itertools import combinations
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
import os

def compute_group_score(group_rows, **kwargs):
    """Compute the average pairwise match score for a group of rows.

    Args:
        group_rows: Sequence of profiles accepted by compute_match_score.
        **kwargs: Forwarded unchanged to compute_match_score for every pair.

    Returns:
        The mean of compute_match_score over all unordered pairs, or 0.0 when
        the group has fewer than two members (there are no pairs to average,
        which previously raised ZeroDivisionError).
    """
    pair_scores = [
        compute_match_score(a, b, **kwargs)
        for a, b in combinations(group_rows, 2)
    ]
    if not pair_scores:
        return 0.0
    return sum(pair_scores) / len(pair_scores)



def score_group_helper(group, kwargs):
    """Score one candidate group and describe its members.

    Args:
        group: Sequence of dict-like profiles with "first_name", "last_name"
            and "email" entries.
        kwargs: Dict of keyword arguments passed on to compute_group_score.

    Returns:
        A (score, info) tuple. `info` holds name_1..name_n, email_1..email_n,
        the score, and "person_ids" (the members' e-mail addresses).
    """
    group_score = compute_group_score(group, **kwargs)

    summary = {}
    # Names first, then e-mails, so the column order of the result is stable.
    for position, member in enumerate(group, start=1):
        summary[f"name_{position}"] = f"{member['first_name']} {member['last_name']}"
    for position, member in enumerate(group, start=1):
        summary[f"email_{position}"] = member.get("email", "")
    summary["score"] = group_score
    # E-mail doubles as the per-person identifier
    summary["person_ids"] = [member['email'] for member in group]
    return (group_score, summary)

def find_top_n_per_person(df, group_number=2, top_n=5, max_workers=None, **kwargs):
    """Find, for every person, the `top_n` best-scoring groups they belong to.

    All combinations of `group_number` profiles are scored in a process pool
    via score_group_helper.

    Args:
        df: DataFrame of profiles; rows are converted to dicts before scoring.
        group_number: Number of people per group.
        top_n: How many groups to keep per person.
        max_workers: Worker process count; defaults to os.cpu_count() (or 1).
        **kwargs: Forwarded to the scoring functions through score_group_helper.

    Returns:
        DataFrame with one row per (person, kept group): "person_id", "score"
        and the group description produced by score_group_helper, ordered by
        descending score within each person.
    """
    rows = [row._asdict() for row in df.itertuples(index=False)]
    all_groups = list(combinations(rows, group_number))

    if max_workers is None:
        max_workers = os.cpu_count() or 1

    # Dictionary: person_id -> min-heap of (score, seq, group_info)
    person_matches = defaultdict(list)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(score_group_helper, group, kwargs) for group in all_groups]

        progress = tqdm(as_completed(futures), total=len(futures), desc="Computing groups")
        for seq, future in enumerate(progress):
            score, group_info = future.result()

            # `seq` is a unique tiebreaker: without it two entries with equal
            # scores make heapq compare the group_info dicts, which raises
            # TypeError because dicts are not orderable.
            entry = (score, seq, group_info)

            for pid in group_info["person_ids"]:
                heap = person_matches[pid]
                if len(heap) < top_n:
                    heapq.heappush(heap, entry)
                elif score > heap[0][0]:
                    # Better than this person's current worst kept group
                    heapq.heapreplace(heap, entry)

    # Prepare results: flatten to one row per person per match
    records = []
    for pid, matches in person_matches.items():
        for score, _seq, group_info in sorted(matches, key=lambda x: x[0], reverse=True):
            # Include person id, score, and group info
            record = {
                "person_id": pid,
                "score": score,
            }
            record.update(group_info)
            records.append(record)

    return pd.DataFrame(records)


In [None]:
import os

print(f"Detected CPU cores: {os.cpu_count()}")

# os.cpu_count() returns None when the core count cannot be determined, which
# would break the arithmetic below; treat that case as a single core.
# Leave two cores free for the rest of the system, but always use at least one worker.
cpu_total = os.cpu_count() or 1
max_workers = max(1, cpu_total - 2)
top_groups_df = find_top_n_per_person(df, group_number=3, top_n=5, max_workers=max_workers)

print(top_groups_df.head())

# Save to CSV
top_groups_df.to_csv("top_cofounder_groups.csv", index=False)

In [None]:
# Bare expression: shows the group-match table as this cell's output.
top_groups_df

In [None]:
from itertools import combinations
import heapq
from tqdm import tqdm

def find_top_matches_for_person(df, target_email, group_size=2, top_n=5, **kwargs):
    """Find the `top_n` best-scoring groups that include one specific person.

    Args:
        df: DataFrame of profiles; rows are converted to dicts before scoring.
        target_email: E-mail address identifying the person to match.
        group_size: Total number of people per group, including the target.
        top_n: Number of groups to return.
        **kwargs: Forwarded to compute_group_score.

    Returns:
        DataFrame with one row per kept group (name_1..name_n,
        email_1..email_n, score), sorted by descending score. The target is
        always member 1.

    Raises:
        ValueError: If no row has the given e-mail address.
    """
    # Convert DataFrame rows to dicts for processing
    rows = [row._asdict() for row in df.itertuples(index=False)]

    # Find the target person
    target = next((row for row in rows if row["email"] == target_email), None)
    if target is None:
        raise ValueError(f"No person found with email: {target_email}")

    # Prepare pool of other participants
    others = [r for r in rows if r["email"] != target_email]

    # Generate all possible groups including the target
    all_groups = [tuple([target] + list(comb)) for comb in combinations(others, group_size - 1)]

    # Score and keep top_n matches using a min-heap of (score, seq, group_info).
    # `seq` is a unique tiebreaker: without it, two groups with equal scores
    # make heapq compare the group_info dicts, which raises TypeError.
    heap = []
    for seq, group in enumerate(tqdm(all_groups, desc="Scoring groups")):
        score = compute_group_score(group, **kwargs)

        group_info = {
            f"name_{i+1}": f"{r['first_name']} {r['last_name']}" for i, r in enumerate(group)
        }
        group_info.update({
            f"email_{i+1}": r.get("email", "") for i, r in enumerate(group)
        })
        group_info["score"] = score

        entry = (score, seq, group_info)
        if len(heap) < top_n:
            heapq.heappush(heap, entry)
        elif score > heap[0][0]:
            # Better than the current worst kept group
            heapq.heapreplace(heap, entry)

    # Sort results by descending score
    top_groups = [entry[2] for entry in sorted(heap, key=lambda x: x[0], reverse=True)]
    return pd.DataFrame(top_groups)


In [None]:
# Look up the five best one-on-one matches for a single participant.
# NOTE(review): a personal e-mail address is hard-coded here; consider reading
# it from configuration before sharing the notebook.
search_params = dict(
    target_email="jamal.alkharrat@gmail.com",
    group_size=2,
    top_n=5,
)
top_matches = find_top_matches_for_person(df, **search_params)

print(top_matches)
