In [2]:
import pandas as pd
import numpy as np
from google import genai
from google.genai import types
from pydantic import BaseModel
import time
import os
from dotenv import load_dotenv
import importlib
import session_organizer
# Only needed if you want to reload the module after making changes
importlib.reload(session_organizer)

ModuleNotFoundError: No module named 'session_organizer'

In [None]:
def embed_documents_with_genai(df_presentations, topic_column, model_name, api_key=None, df_embeddings=None, delay_seconds=1):
    """ Embed the presentation topics using Google GenAI with rate limiting and resume capability.
    Args:
        df_presentations (pd.DataFrame): DataFrame containing the presentations.
        topic_column (str): Column name in df_presentations that contains the topics to embed.
        model_name (str): Name of the Google GenAI model to use.
        api_key (str, optional): API key for authenticating with Google GenAI.
        df_embeddings (pd.DataFrame, optional): DataFrame to store the embeddings. If None, a new DataFrame will be created.
        delay_seconds (float): Delay between API calls to respect rate limits.

    Returns:
        pd.DataFrame: DataFrame containing the embedded presentation topics.
    """
    # Load environment variables
    load_dotenv(".env")
    # Check for API key in environment variables if not provided
    if api_key is None:
        if "GEMINI_API_KEY" not in os.environ:
            raise ValueError(
                "API key must be provided or in environmental variables. GEMINI_API_KEY not found in environment variables. Please set it in your .env file."
            )
        else:
            api_key = os.environ["GEMINI_API_KEY"]

    # Validate API key
    if not api_key:
        raise ValueError("API key is required to use Google GenAI.")

    # Extract the topics to embed
    topics_to_embed = df_presentations[topic_column].tolist()
    
    # Initialize or validate existing embeddings DataFrame
    if df_embeddings is None:
        df_embeddings = pd.DataFrame(columns=['topic', 'embedding', 'status'])
    
    # Identify which topics still need to be embedded
    already_embedded = set(df_embeddings['topic'].tolist()) if 'topic' in df_embeddings.columns else set()
    topics_to_process = [topic for topic in topics_to_embed if topic not in already_embedded]
    
    print(f"Total topics: {len(topics_to_embed)}")
    print(f"Already embedded: {len(already_embedded)}")
    print(f"Topics to process: {len(topics_to_process)}")
    
    # Process each topic individually
    for i, topic in enumerate(topics_to_process):
        print(f"Processing topic {i+1}/{len(topics_to_process)}: {topic[:50]}...")
        
        try:
            # Call the Google GenAI API for single topic
            response = call_google_genai_api_single(topic, model_name, api_key)
            
            # Process the response and add to embeddings DataFrame
            if response and hasattr(response, 'embeddings') and response.embeddings:
                embedding_values = response.embeddings[0].values
                new_row = pd.DataFrame({
                    'topic': [topic],
                    'embedding': [embedding_values],
                    'status': ['success']
                })
                df_embeddings = pd.concat([df_embeddings, new_row], ignore_index=True)
                print(f"Successfully embedded topic {i+1}")
            else:
                # Add failed embedding to track progress
                new_row = pd.DataFrame({
                    'topic': [topic],
                    'embedding': [None],
                    'status': ['failed']
                })
                df_embeddings = pd.concat([df_embeddings, new_row], ignore_index=True)
                print(f"Failed to embed topic {i+1}")
        
        except Exception as e:
            print(f"Error embedding topic {i+1}: {str(e)}")
            # Add error to track progress
            new_row = pd.DataFrame({
                'topic': [topic],
                'embedding': [None],
                'status': ['error']
            })
            df_embeddings = pd.concat([df_embeddings, new_row], ignore_index=True)
        
        # Rate limiting delay
        if i < len(topics_to_process) - 1:  # Don't delay after the last item
            time.sleep(delay_seconds)
    
    return df_embeddings

def call_google_genai_api_single(topic, model_name, api_key):
    """ Call the Google GenAI API to embed a single topic.
    Args:
        topic (str): Single topic to embed.
        model_name (str): Name of the Google GenAI model to use.
        api_key (str): API key for authenticating with Google GenAI.
    Returns:
        response: The response from the Google GenAI API.
    """
    try:
        client = genai.Client(api_key=api_key)
        
        result = client.models.embed_content(
            model=model_name,
            contents=[topic],  # Single topic in a list
            config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
        )
        
        return result
    except Exception as e:
        print(f"API call failed for topic: {str(e)}")
        return None

def save_embeddings_to_file(df_embeddings, filename="embeddings_backup.pkl"):
    """ Save embeddings DataFrame to a pickle file for backup/resume capability.
    Args:
        df_embeddings (pd.DataFrame): DataFrame containing embeddings.
        filename (str): Name of the backup file.
    """
    df_embeddings.to_pickle(filename)
    print(f"Embeddings saved to {filename}")

def load_embeddings_from_file(filename="embeddings_backup.pkl"):
    """ Load embeddings DataFrame from a pickle file.
    Args:
        filename (str): Name of the backup file.
    Returns:
        pd.DataFrame: DataFrame containing embeddings, or None if file doesn't exist.
    """
    try:
        df_embeddings = pd.read_pickle(filename)
        print(f"Embeddings loaded from {filename}")
        return df_embeddings
    except FileNotFoundError:
        print(f"Backup file {filename} not found. Starting fresh.")
        return None
    except Exception as e:
        print(f"Error loading backup file: {str(e)}")
        return None

# Example usage with resume capability
def embed_with_resume(df_presentations, topic_column, model_name, backup_filename="embeddings_backup.pkl"):
    """ Embed documents with automatic backup and resume capability.
    Args:
        df_presentations (pd.DataFrame): DataFrame containing the presentations.
        topic_column (str): Column name containing topics to embed.
        model_name (str): Name of the Google GenAI model to use.
        backup_filename (str): Name of the backup file.
    Returns:
        pd.DataFrame: DataFrame containing the embedded presentation topics.
    """
    # Try to load existing embeddings
    df_embeddings = load_embeddings_from_file(backup_filename)
    
    # Embed documents
    df_embeddings = embed_documents_with_genai(
        df_presentations=df_presentations,
        topic_column=topic_column,
        model_name=model_name,
        df_embeddings=df_embeddings,
        delay_seconds=1  # Adjust delay as needed for rate limits
    )
    
    # Save backup
    save_embeddings_to_file(df_embeddings, backup_filename)
    
    return df_embeddings

In [59]:
# First, examine the Excel file structure
# We have to use 2 files as the abstracts are not in the final sechedule file.
file_path = "25 AIM SESSIONS FINAL.xlsx"
df_temp = pd.read_excel(file_path)

print("Available columns:")
for i, col in enumerate(df_temp.columns):
    print(f"{i}: {col}")

print(f"\nFile contains {len(df_temp)} rows and {len(df_temp.columns)} columns")
print("\nFirst few rows preview:")
print(df_temp.head())

Available columns:
0: 7 digit ID
1: TC
2: session type
3: Session 1
4: Pres. Title
5: Order
6: Start time
7: End time
8: Pres Auth FN
9: Pres Auth LN
10: Pres Affiliation
11: Pres Auth Location
12: All Authors
13: Student Pres?
14: Unnamed: 14
15: Unnamed: 15

File contains 1121 rows and 16 columns

First few rows preview:
   7 digit ID                                 TC  session type  \
0     2500818  ASE-Applied Science & Engineering  Oral Session   
1     2501347  ASE-Applied Science & Engineering  Oral Session   
2     2501703  ASE-Applied Science & Engineering  Oral Session   
3     2501172  ASE-Applied Science & Engineering  Oral Session   
4     2500283  ASE-Applied Science & Engineering  Oral Session   

                                           Session 1  \
0  101 Biomass Preprocessing and Logistics for Bi...   
1  101 Biomass Preprocessing and Logistics for Bi...   
2  101 Biomass Preprocessing and Logistics for Bi...   
3  101 Biomass Preprocessing and Logistics for Bi...  

In [60]:
# Define your column selections based on the output above
TITLE_COLUMN = 'Pres. Title'  # Update based on your file
ID_COLUMN = '7 digit ID'  # Update based on your file

In [61]:
df_no_abstracts = pd.read_excel(file_path)
Title_name='Title'
Abstract_ID_name='Submission ID',


# Drop any presentations that are missing an ID number or title
df_no_abstracts = df_no_abstracts.dropna(subset=[TITLE_COLUMN, ID_COLUMN])

In [62]:
# Examine the Excel file structure for the abstracts.
file_path = "Abstracts 5.20.xlsx"
df_temp = pd.read_excel(file_path)

print("Available columns:")
for i, col in enumerate(df_temp.columns):
    print(f"{i}: {col}")

print(f"\nFile contains {len(df_temp)} rows and {len(df_temp.columns)} columns")
print("\nFirst few rows preview:")
print(df_temp.head())

Available columns:
0: Sub #
1: Created Date & Time
2: Completed Date & Time
3: Submission Status
4: Acceptance Status
5: # Reviews
6: Rating
7: Std Dev
8: Owner-E-mail Address
9: Owner-First Name
10: Owner-Last Name
11: Owner-Company/University
12: Owner-City
13: Owner-State
14: Owner-Country
15: 7 digit ID
16: TC
17: Session
18: Pres. Title
19: Abstract 
20: Order
21: Move to oral
22: Paper link
23: Pres Auth FN
24: Pres Auth LN
25: Pres Affiliation
26: Pres Auth Location
27: Pres Auth Email
28: Edited Pres Title
29: All Authors
30: Student Pres?

File contains 1118 rows and 31 columns

First few rows preview:
   Sub # Created Date & Time Completed Date & Time Submission Status  \
0      1 2025-01-22 14:12:00   2025-04-22 10:24:00          Complete   
1      1 2025-01-17 10:41:00   2025-04-15 10:43:00          Complete   
2      1 2025-01-22 18:00:00                   NaT        Incomplete   
3      1 2025-01-23 00:28:00   2025-05-01 00:22:00          Complete   
4      1 2025-01-18 0

In [63]:
# Define your column selections based on the output above
TITLE_COLUMN = 'Pres. Title'  # Update based on your file
ABSTRACT_COLUMN = 'Abstract '  # Update based on your file  
ID_COLUMN = '7 digit ID'  # Update based on your file

In [64]:
df_only_abstracts = pd.read_excel(file_path, usecols=[ABSTRACT_COLUMN, ID_COLUMN])
# Drop any presentations that are missing an ID number or abstract
df_only_abstracts = df_only_abstracts.dropna(subset=[ABSTRACT_COLUMN, ID_COLUMN])

In [71]:
# Merge the abstracts from df_only_abstracts into df_no_abstracts based on ID_COLUMN
df_final_presentations_25 = df_no_abstracts.merge(
    df_only_abstracts, 
    on=ID_COLUMN, 
    how='left'  # Keep all rows from df_no_abstracts, add abstracts where available
)

print(f"Original df_no_abstracts shape: {df_no_abstracts.shape}")
print(f"df_only_abstracts shape: {df_only_abstracts.shape}")
print(f"Final merged df_final_presentations_25 shape: {df_final_presentations_25.shape}")

# Check how many presentations now have abstracts
abstracts_added = df_final_presentations_25[ABSTRACT_COLUMN].notna().sum()
print(f"Number of presentations with abstracts: {abstracts_added}")


Original df_no_abstracts shape: (1121, 16)
df_only_abstracts shape: (1118, 2)
Final merged df_final_presentations_25 shape: (1121, 17)
Number of presentations with abstracts: 1092


In [72]:
# Drop extra columns 'Unnamed: 14', 'Unamed: 15', 'Unnamed: 16' if they exist
extra_columns = ['Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16']
df_final_presentations_25 = df_final_presentations_25.drop(columns=[col for col in extra_columns if col in df_final_presentations_25.columns], errors='ignore')

In [73]:
topic_column='Title and Abstract'
# Drop any presentations that are missing an ID number or abstract
df_final_presentations_25 = df_final_presentations_25.dropna(subset=[ABSTRACT_COLUMN, TITLE_COLUMN, ID_COLUMN])
df_final_presentations_25[topic_column] = df_final_presentations_25[[TITLE_COLUMN, ABSTRACT_COLUMN]].agg(': '.join, axis=1)
print(f"Final merged df_final_presentations_25 shape: {df_final_presentations_25.shape}")

# Check how many presentations now have abstracts
abstracts_added = df_final_presentations_25[ABSTRACT_COLUMN].notna().sum()
print(f"Number of presentations with abstracts: {abstracts_added}")

Final merged df_final_presentations_25 shape: (1092, 16)
Number of presentations with abstracts: 1092


In [31]:
df_embeddings = embed_with_resume(df_final_presentations_25, topic_column, 'gemini-embedding-exp-03-07')

Backup file embeddings_backup.pkl not found. Starting fresh.
Total topics: 1092
Already embedded: 0
Topics to process: 1092
Processing topic 1/1092: Upcycling of Agri-food Resources into Packaging Ma...
Successfully embedded topic 1
Processing topic 2/1092: Optimizing Hydrothermal Liquefaction of Oat Hulls:...
Successfully embedded topic 2
Processing topic 3/1092: Production of butanol from electro-fermentation of...
Successfully embedded topic 3
Processing topic 4/1092: Turning cannabis waste into high-value products: C...
Successfully embedded topic 4
Processing topic 5/1092: Canola meal extract as a potential feedstock for m...
Successfully embedded topic 5
Processing topic 6/1092: Evaluating Grinding Laws for Predicting the Specif...
Successfully embedded topic 6
Processing topic 7/1092: Optimization and Characterization of Lignin Extrac...
Successfully embedded topic 7
Processing topic 8/1092: Food Waste Diversion and Anaerobic Digestion as Pi...
Successfully embedded topic 8
Proc

In [69]:
# Save the embeddings DataFrame to a file
df_embeddings_main = df_embeddings.copy() # Create a copy since we will do another embedding
save_embeddings_to_file(df_embeddings_main, "embeddings_final_presentations_25_main.pkl")

Embeddings saved to embeddings_final_presentations_25_main.pkl


In [75]:
# Check for duplicate topics in the embeddings DataFrame
duplicates = df_embeddings_main[df_embeddings_main.duplicated(subset=['topic'], keep=False)]
if not duplicates.empty:
    print("Duplicate topics found in df_embeddings_main:")
    print(duplicates[['topic', 'embedding']])
else:
    print("No duplicate topics found in df_embeddings_main.")

Duplicate topics found in df_embeddings_main:
                                                  topic  \
0     Upcycling of Agri-food Resources into Packagin...   
21    Advancements in Agrivoltaics: Autonomous LiDAR...   
25    Watt a Waste: Utilizing Hurricane Debris with ...   
82    Machine learning-powered activatable NIR-II fl...   
94    Nucleic Acid Sensing Analysis Based on Integra...   
101   Attention-LSTM-Based Emulation of Expert Clima...   
187   Watt a Waste: Utilizing Hurricane Debris with ...   
199   Machine learning-powered activatable NIR-II fl...   
268   Plastic Mulch Effects on Hydrological Processe...   
384   Design and Fabrication of Ergonomic Auxiliary ...   
407   Precision Detection of the Real-Time Health an...   
416   Nucleic Acid Sensing Analysis Based on Integra...   
454   High-Resolution Drone Imagery and Machine Lear...   
566   Attention-LSTM-Based Emulation of Expert Clima...   
765   Plastic Mulch Effects on Hydrological Processe...   
790   Tech

In [76]:
# Merge embeddings into df_final_presentations_25 based on the topic_column
df_final_presentations_25_w_embeddings = df_final_presentations_25.merge(
    df_embeddings_main[['topic', 'embedding']].drop_duplicates(subset=['topic']),
    left_on=topic_column,
    right_on='topic',
    how='left'
)
print("Embeddings column added to df_final_presentations_25.")




Embeddings column added to df_final_presentations_25.


# Duplicate Presentations

In [81]:
# Check for duplicate topics in the DataFrame
duplicates = df_final_presentations_25_w_embeddings[
    df_final_presentations_25_w_embeddings.duplicated(subset=[topic_column], keep=False)
]
if not duplicates.empty:
    print("Duplicate topics found in df_final_presentations_25_w_embeddings:")
    print(f"Number of duplicate topics: {duplicates[topic_column].nunique()}")
    print(duplicates[[topic_column, 'topic']])
else:
    print("No duplicate topics found in df_final_presentations_25_w_embeddings.")

# Only use hashable columns for full-row duplicate check (exclude 'embedding')
hashable_cols = [col for col in df_final_presentations_25_w_embeddings.columns if col != 'embedding']
duplicates_all = df_final_presentations_25_w_embeddings[
    df_final_presentations_25_w_embeddings.duplicated(subset=hashable_cols, keep=False)
]
if not duplicates_all.empty:
    print("Duplicate rows found in df_final_presentations_25_w_embeddings (excluding 'embedding'):")
    print(duplicates_all)
else:
    print("No duplicate rows found in df_final_presentations_25_w_embeddings (excluding 'embedding').")

Duplicate topics found in df_final_presentations_25_w_embeddings:
Number of duplicate topics: 13
                                     Title and Abstract  \
0     Upcycling of Agri-food Resources into Packagin...   
21    Advancements in Agrivoltaics: Autonomous LiDAR...   
25    Watt a Waste: Utilizing Hurricane Debris with ...   
82    Machine learning-powered activatable NIR-II fl...   
94    Nucleic Acid Sensing Analysis Based on Integra...   
101   Attention-LSTM-Based Emulation of Expert Clima...   
187   Watt a Waste: Utilizing Hurricane Debris with ...   
199   Machine learning-powered activatable NIR-II fl...   
268   Plastic Mulch Effects on Hydrological Processe...   
384   Design and Fabrication of Ergonomic Auxiliary ...   
407   Precision Detection of the Real-Time Health an...   
416   Nucleic Acid Sensing Analysis Based on Integra...   
454   High-Resolution Drone Imagery and Machine Lear...   
566   Attention-LSTM-Based Emulation of Expert Clima...   
765   Plastic Mulc

In [82]:
# Find all rows with duplicate topics (including the first occurrence)
duplicates_all_versions = df_final_presentations_25_w_embeddings[
    df_final_presentations_25_w_embeddings.duplicated(subset=[topic_column], keep=False)
]

# Drop the 'embedding' column for export
duplicates_all_versions_no_embedding = duplicates_all_versions.drop(columns=['embedding'])

# Export to CSV
duplicates_all_versions_no_embedding.to_csv("duplicate_topics_all_versions.csv", index=False)

In [77]:
# Show rows that do NOT have embeddings added
rows_without_embeddings = df_final_presentations_25_w_embeddings[df_final_presentations_25_w_embeddings['embedding'].isna()]
print(f"Number of rows without embeddings: {len(rows_without_embeddings)}")
print(rows_without_embeddings[[topic_column]].head())

Number of rows without embeddings: 0
Empty DataFrame
Columns: [Title and Abstract]
Index: []


In [32]:
df_final_presentations_25.to_csv("df_final_presentations_25.csv", index=False)

In [35]:
# Check the ones that were dropped
# Merge the abstracts from df_only_abstracts into df_no_abstracts based on ID_COLUMN
df_pre_presentations_25 = df_no_abstracts.merge(
    df_only_abstracts, 
    on=ID_COLUMN, 
    how='left'  # Keep all rows from df_no_abstracts, add abstracts where available
)

print(f"Original df_no_abstracts shape: {df_no_abstracts.shape}")
print(f"df_only_abstracts shape: {df_only_abstracts.shape}")
print(f"Final merged df_final_presentations_25 shape: {df_pre_presentations_25.shape}")
# Check how many presentations now have abstracts
abstracts_added = df_final_presentations_25[ABSTRACT_COLUMN].notna().sum()
print(f"Number of presentations with abstracts: {abstracts_added}")
# Check how many presentations now have abstracts
abstracts_added = df_pre_presentations_25[ABSTRACT_COLUMN].notna().sum()
# Find rows that would be dropped due to missing ABSTRACT_COLUMN, TITLE_COLUMN, or ID_COLUMN
dropped_rows = df_pre_presentations_25[
    df_pre_presentations_25[[ABSTRACT_COLUMN, TITLE_COLUMN, ID_COLUMN]].isnull().any(axis=1)
]
print(f"Number of rows that would be dropped: {len(dropped_rows)}")
dropped_rows.head()

Original df_no_abstracts shape: (1121, 16)
df_only_abstracts shape: (1118, 2)
Final merged df_final_presentations_25 shape: (1121, 17)
Number of presentations with abstracts: 1092
Number of rows that would be dropped: 29


Unnamed: 0,7 digit ID,TC,session type,Session 1,Pres. Title,Order,Start time,End time,Pres Auth FN,Pres Auth LN,Pres Affiliation,Pres Auth Location,All Authors,Student Pres?,Unnamed: 14,Unnamed: 15,Abstract
239,2501384,MS-Machinery Systems,Oral Session,132 Robotics and Mechanization for Specialty C...,Development of A Ground-based Machine Vision S...,109,4:30pm,4:45pm,Jiajun,Xu,Michigan State University,"East Lansing, Michigan, USA","Xinyang Mu, Yuzhen Lu",No,,,
255,2500242,NRES-Natural Resources & Environmental Systems,Oral Session,134 Advances in Micro-Irrigation and Sprinkler...,Evaluation of Agricultural Reservoir Supply Ef...,109,4:30pm,4:45pm,JunYoung,Lee,Kangwon National University,"Chuncheon, Kangwon-do, Republic of Korea","Sangjoon Bak, Yeonji Jeong, Seoro Lee, Jeongho...",Yes,,,
260,2500753,NRES-Natural Resources & Environmental Systems,Oral Session,"135 Hydrological Modeling, Water Resource Mana...",Integrating Machine Learning and Topographic I...,106,3:45pm,4:00pm,Hamid,Mohebzadeh,University of Guelph,"Guelph, Ontario, Canada","Hamid Mohebzadeh, Asim Biswas, Ben DeVries, Ra...",No,,,
280,2501662,NRES-Natural Resources & Environmental Systems,Oral Session,138 Open-Source “pyfao56” Evapotranspiration a...,WISE Pro Software for Smart Crop Irrigation an...,109,4:30pm,4:45pm,Mazdak,Arabi,Colorado State University,"Fort Collins, Colorado, USA",,,,,
317,2500295,POSTER SESSIONS,Poster Session,143 NRES-Advances in Environmental Systems POS...,Quantitative Analysis of Runoff in Submerged P...,106,6,,Yeonji,Jeong,Kangwon National University,"Chuncheon-si, Kangwon","Yeonji Jeong, Gwanjae Lee, Seoro Lee, Jeongho ...",Yes,,,


# Missing Presentations
Some abstracts were in the drop list on 5/20 and were added back to the program. This file contains those. This code does the embedding for these.

In [36]:
# Add back the missing abstracts to the final DataFrame
# Examine the Excel file structure for the abstracts.
file_path = "Missing Abstracts.xlsx"
df_temp = pd.read_excel(file_path)

print("Available columns:")
for i, col in enumerate(df_temp.columns):
    print(f"{i}: {col}")

print(f"\nFile contains {len(df_temp)} rows and {len(df_temp.columns)} columns")
print("\nFirst few rows preview:")
print(df_temp.head())

Available columns:
0: 7 digit ID
1: Abstract

File contains 29 rows and 2 columns

First few rows preview:
   7 digit ID                                           Abstract
0     2501384  Accurate maturity assessment and yield estimat...
1     2500242  Water resources have been regarded as a critic...
2     2500753  Effective mitigation of ephemeral gully erosio...
3     2501662  Diminishing water supplies and nutrient pollut...
4     2500295  This study introduces the WAPLE4 system, devel...


In [41]:
# Define your column selections based on the output above
ABSTRACT_COLUMN = 'Abstract'  # Update based on your file  
ID_COLUMN = '7 digit ID'  # Update based on your file

In [None]:
df_only_abstracts_missing = pd.read_excel(file_path, usecols=[ABSTRACT_COLUMN, ID_COLUMN])
# Drop any presentations that are missing an ID number or abstract
df_only_abstracts_missing = df_only_abstracts_missing.dropna(subset=[ABSTRACT_COLUMN, ID_COLUMN])

In [45]:
df_presentations_25_missing = df_no_abstracts.merge(
    df_only_abstracts_missing, 
    on=ID_COLUMN, 
    how='left'  # Keep all rows from df_no_abstracts, add abstracts where available
)

In [46]:
print(f"Original df_no_abstracts shape: {df_no_abstracts.shape}")
print(f"df_only_abstracts shape: {df_only_abstracts.shape}")
print(f"Final merged df_presentations_25_missing shape: {df_presentations_25_missing.shape}")

# Check how many presentations now have abstracts
abstracts_added = df_presentations_25_missing[ABSTRACT_COLUMN].notna().sum()
print(f"Number of presentations with abstracts: {abstracts_added}")

Original df_no_abstracts shape: (1121, 16)
df_only_abstracts shape: (1118, 2)
Final merged df_presentations_25_missing shape: (1121, 17)
Number of presentations with abstracts: 29


In [None]:
topic_column='Title and Abstract'
# Drop any presentations that are missing an ID number or abstract
df_presentations_25_missing = df_presentations_25_missing.dropna(subset=[ABSTRACT_COLUMN, TITLE_COLUMN, ID_COLUMN])
df_presentations_25_missing[topic_column] = df_presentations_25_missing[[TITLE_COLUMN, ABSTRACT_COLUMN]].agg(': '.join, axis=1)
print(f"Final merged df_presentations_25_missing shape: {df_presentations_25_missing.shape}")

# Check how many presentations now have abstracts
abstracts_added = df_presentations_25_missing[ABSTRACT_COLUMN].notna().sum()
print(f"Number of presentations with abstracts: {abstracts_added}")

Final merged df_presentations_25_missing shape: (29, 18)
Number of presentations with abstracts: 29


In [84]:
# Drop extra columns 'Unnamed: 14', 'Unamed: 15', 'Unnamed: 16' if they exist
extra_columns = ['Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16']
df_presentations_25_missing = df_presentations_25_missing.drop(columns=[col for col in extra_columns if col in df_presentations_25_missing.columns], errors='ignore')

In [85]:
df_embeddings_missing = embed_with_resume(df_presentations_25_missing, topic_column, 'gemini-embedding-exp-03-07')

Embeddings loaded from embeddings_backup.pkl
Total topics: 29
Already embedded: 1079
Topics to process: 29
Processing topic 1/29: Development of A Ground-based Machine Vision Syste...
Successfully embedded topic 1
Processing topic 2/29: Evaluation of Agricultural Reservoir Supply Effici...
Successfully embedded topic 2
Processing topic 3/29: Integrating Machine Learning and Topographic Index...
Successfully embedded topic 3
Processing topic 4/29: WISE Pro Software for Smart Crop Irrigation and Nu...
Successfully embedded topic 4
Processing topic 5/29: Quantitative Analysis of Runoff in Submerged Paddy...
Successfully embedded topic 5
Processing topic 6/29: Phosphoric Acid Activation of Biochar for Applicat...
Successfully embedded topic 6
Processing topic 7/29: Development and Evaluation of L-THIA Sub-daily mod...
Successfully embedded topic 7
Processing topic 8/29: Automated Cleaning Systems Effects On Long Term Ox...
Successfully embedded topic 8
Processing topic 9/29: Autonomous Act

In [86]:
# Merge embeddings into df_final_presentations_25 based on the topic_column
df_presentations_25_missing_w_embeddings = df_presentations_25_missing.merge(
    df_embeddings_missing[['topic', 'embedding']].drop_duplicates(subset=['topic']),
    left_on=topic_column,
    right_on='topic',
    how='left'
)
print("Embeddings column added to df_presentations_25_missing.")

Embeddings column added to df_presentations_25_missing.


# Add guest and other speakers without ID numbers

In [48]:
# Examine the Excel file structure .
file_path = "AIM 25 Final Speakers No ID Num.xlsx"
df_temp = pd.read_excel(file_path)

print("Available columns:")
for i, col in enumerate(df_temp.columns):
    print(f"{i}: {col}")

print(f"\nFile contains {len(df_temp)} rows and {len(df_temp.columns)} columns")
print("\nFirst few rows preview:")
print(df_temp.head())

Available columns:
0: 7 digit ID
1: TC
2: session type
3: Session 1
4: Pres. Title
5: Order
6: Start time
7: End time
8: Pres Auth FN
9: Pres Auth LN
10: Pres Affiliation
11: Pres Auth Location
12: All Authors
13: Student Pres?

File contains 44 rows and 14 columns

First few rows preview:
      7 digit ID                                                 TC  \
0  Guest Speaker         CBSI-Circular Bioeconomy Systems Institute   
1  Guest Speaker         CBSI-Circular Bioeconomy Systems Institute   
2            NaN  EOPD-Education, Outreach, & Professional Devel...   
3  Guest Speaker     NRES-Natural Resources & Environmental Systems   
4  Guest Speaker     NRES-Natural Resources & Environmental Systems   

            session type                                          Session 1  \
0         Hybrid Session  102 Advancing Circular Bioeconomy Systems (CBS...   
1         Hybrid Session  102 Advancing Circular Bioeconomy Systems (CBS...   
2            Rap Session  104 Engineering Eth

In [50]:
df_no_ID = pd.read_excel(file_path)

TITLE_COLUMN = 'Pres. Title'  # Update based on your file
# Drop any presentations that are missing an ID number or title
df_no_ID = df_no_ID.dropna(subset=[TITLE_COLUMN])

In [52]:
topic_column='Title and Abstract'

df_no_ID[topic_column] = df_no_ID[TITLE_COLUMN]
print(f"Final merged df_no_ID shape: {df_no_ID.shape}")

Final merged df_no_ID shape: (44, 15)


In [87]:
df_embeddings_no_ID = embed_with_resume(df_no_ID, topic_column, 'gemini-embedding-exp-03-07')

Embeddings loaded from embeddings_backup.pkl
Total topics: 44
Already embedded: 1108
Topics to process: 44
Processing topic 1/44: Advancing Circular Bioeconomy Systems (CBS): Oppor...
Successfully embedded topic 1
Processing topic 2/44: Advancing Circular Bioeconomy Systems (CBS): Oppor...
Successfully embedded topic 2
Processing topic 3/44: Engineering Ethics across Cultures-RAP Session...
Successfully embedded topic 3
Processing topic 4/44: NRES Distinguished Lecture Series - AI and Digital...
Successfully embedded topic 4
Processing topic 5/44: Ramping up AI in the Classroom, for Student Succes...
Successfully embedded topic 5
Processing topic 6/44: Industry and Academic Perspectives on AI tools...
Successfully embedded topic 6
Processing topic 7/44: AI Tools for International Online Education and Wo...
Successfully embedded topic 7
Processing topic 8/44: Development of AI Tools for Water Management...
Successfully embedded topic 8
Processing topic 9/44: Assessing Benefits and Conce

In [88]:
# Merge embeddings into df_final_presentations_25 based on the topic_column
df_no_ID_w_embeddings = df_no_ID.merge(
    df_embeddings_no_ID[['topic', 'embedding']].drop_duplicates(subset=['topic']),
    left_on=topic_column,
    right_on='topic',
    how='left'
)
print("Embeddings column added to df_no_ID.")

Embeddings column added to df_no_ID.


# Combine the dataframes

In [90]:
# Rename 'Abstract ' (with trailing space) to 'Abstract' in df_final_presentations_25_w_embeddings
df_final_presentations_25_w_embeddings = df_final_presentations_25_w_embeddings.rename(columns={'Abstract ': 'Abstract'})

In [91]:
# Step 1: Check for matching columns (except for 'Abstract')

# Get sets of columns for each DataFrame
cols_main = set(df_final_presentations_25_w_embeddings.columns)
cols_missing = set(df_presentations_25_missing_w_embeddings.columns)
cols_no_id = set(df_no_ID_w_embeddings.columns)

# Find differences
print("Columns in df_final_presentations_25_w_embeddings but not in df_no_ID_w_embeddings:")
print(cols_main - cols_no_id)

print("\nColumns in df_no_ID_w_embeddings but not in df_final_presentations_25_w_embeddings:")
print(cols_no_id - cols_main)

print("\nColumns in df_presentations_25_missing_w_embeddings but not in df_final_presentations_25_w_embeddings:")
print(cols_missing - cols_main)

print("\nColumns in df_final_presentations_25_w_embeddings but not in df_presentations_25_missing_w_embeddings:")
print(cols_main - cols_missing)

Columns in df_final_presentations_25_w_embeddings but not in df_no_ID_w_embeddings:
{'Abstract'}

Columns in df_no_ID_w_embeddings but not in df_final_presentations_25_w_embeddings:
set()

Columns in df_presentations_25_missing_w_embeddings but not in df_final_presentations_25_w_embeddings:
set()

Columns in df_final_presentations_25_w_embeddings but not in df_presentations_25_missing_w_embeddings:
set()


In [92]:
df_all_w_embeddings = pd.concat([
    df_final_presentations_25_w_embeddings,
    df_presentations_25_missing_w_embeddings,
    df_no_ID_w_embeddings
], ignore_index=True)

# Duplicate Presentations

In [93]:
# Check for duplicate topics in the DataFrame
duplicates = df_all_w_embeddings[
    df_all_w_embeddings.duplicated(subset=[topic_column], keep=False)
]
if not duplicates.empty:
    print("Duplicate topics found in df_all_w_embeddings:")
    print(f"Number of duplicate topics: {duplicates[topic_column].nunique()}")
    print(duplicates[[topic_column, 'topic']])
else:
    print("No duplicate topics found in df_all_w_embeddings.")

# Only use hashable columns for full-row duplicate check (exclude 'embedding')
hashable_cols = [col for col in df_all_w_embeddings.columns if col != 'embedding']
duplicates_all = df_all_w_embeddings[
    df_all_w_embeddings.duplicated(subset=hashable_cols, keep=False)
]
if not duplicates_all.empty:
    print("Duplicate rows found in df_all_w_embeddings (excluding 'embedding'):")
    print(duplicates_all)
else:
    print("No duplicate rows found in df_all_w_embeddings (excluding 'embedding').")

Duplicate topics found in df_all_w_embeddings:
Number of duplicate topics: 14
                                     Title and Abstract  \
0     Upcycling of Agri-food Resources into Packagin...   
21    Advancements in Agrivoltaics: Autonomous LiDAR...   
25    Watt a Waste: Utilizing Hurricane Debris with ...   
82    Machine learning-powered activatable NIR-II fl...   
94    Nucleic Acid Sensing Analysis Based on Integra...   
101   Attention-LSTM-Based Emulation of Expert Clima...   
187   Watt a Waste: Utilizing Hurricane Debris with ...   
199   Machine learning-powered activatable NIR-II fl...   
268   Plastic Mulch Effects on Hydrological Processe...   
384   Design and Fabrication of Ergonomic Auxiliary ...   
407   Precision Detection of the Real-Time Health an...   
416   Nucleic Acid Sensing Analysis Based on Integra...   
454   High-Resolution Drone Imagery and Machine Lear...   
566   Attention-LSTM-Based Emulation of Expert Clima...   
765   Plastic Mulch Effects on Hydrol

In [94]:
# Find all rows with duplicate topics (including the first occurrence)
duplicates_full = df_all_w_embeddings[
    df_all_w_embeddings.duplicated(subset=[topic_column], keep=False)
]

# Drop the 'embedding' column for export
duplicates_full_no_embedding = duplicates_full.drop(columns=['embedding'])

# Export to CSV
duplicates_full_no_embedding.to_csv("duplicate_topics_full.csv", index=False)

In [95]:
# Show rows that do NOT have embeddings added
rows_without_embeddings = df_all_w_embeddings[df_all_w_embeddings['embedding'].isna()]
print(f"Number of rows without embeddings: {len(rows_without_embeddings)}")
print(rows_without_embeddings[[topic_column]].head())

Number of rows without embeddings: 0
Empty DataFrame
Columns: [Title and Abstract]
Index: []


# Export Files

In [174]:
# Split  the 'embedding' column for export
df_all_no_embeddings = df_all_w_embeddings.drop(columns=['embedding'])
df_all_embeddings = df_all_w_embeddings['embedding']

# Export to CSV
df_all_no_embeddings.to_csv("Full Presentation List.csv", index=False)

## Create similarity matrix

In [175]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Convert the embeddings Series to a 2D numpy array
embeddings_matrix = np.vstack(df_all_embeddings.dropna().values)

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings_matrix)

# print(similarity_matrix)

## Create sharable dataframe
Abstracts, names and emails should not be posted openly on the web. Remove them from the non-encrypted basic version of the data set.

First, check what columns are available. Then select the ones to remove.

In [176]:
print("Available columns:")
for i, col in enumerate(df_all_no_embeddings.columns):
    print(f"{i}: {col}")

print(f"Dataframe contains {len(df_all_no_embeddings)} rows and {len(df_all_no_embeddings.columns)} columns")

Available columns:
0: 7 digit ID
1: TC
2: session type
3: Session 1
4: Pres. Title
5: Order
6: Start time
7: End time
8: Pres Auth FN
9: Pres Auth LN
10: Pres Affiliation
11: Pres Auth Location
12: All Authors
13: Student Pres?
14: Abstract
15: Title and Abstract
16: topic
Dataframe contains 1165 rows and 17 columns


In [177]:
rename_dict = {
    # "Current Name": "New Name",
    # For example:
    "digit ID": "7 Digit ID",
    "TC": "Technical Community",
    "session type": "Session Type",
    "Session 1": "Session",
    "Pres. Title": "Presentation Title",
    "Order": "Order",
    "Start time:": "Start Time or Poster Position",
    "End time:": "End Time",
    "Pres Auth FN": "Presenter First Name",
    "Pres Auth LN": "Presenter Last Name",
    "Pres Affiliation": "Presenter Affiliation",
    "Pres Auth Location": "Presenter Location",
    # Add more renaming rules as needed
}
# Apply renaming
df_all_no_embeddings = df_all_no_embeddings.rename(columns=rename_dict)

print("Renamed columns:")
print(df_all_no_embeddings.columns.tolist())

Renamed columns:
['7 digit ID', 'Technical Community', 'Session Type', 'Session', 'Presentation Title', 'Order', 'Start time', 'End time', 'Presenter First Name', 'Presenter Last Name', 'Presenter Affiliation', 'Presenter Location', 'All Authors', 'Student Pres?', 'Abstract', 'Title and Abstract', 'topic']


# Create Sessions

In [178]:
session_column_name = 'Session'

# Group by the session column and aggregate the results
df_sessions = df_all_no_embeddings.groupby(session_column_name).apply(lambda x: pd.Series({
    'Session Name': x.name,
    'Session Size': len(x),
    'gen_presentation_indices': x.index.tolist()
})).reset_index(drop=True)

print("df_sessions created successfully.")
df_sessions.head()

df_sessions created successfully.


  df_sessions = df_all_no_embeddings.groupby(session_column_name).apply(lambda x: pd.Series({


Unnamed: 0,Session Name,Session Size,gen_presentation_indices
0,101 Biomass Preprocessing and Logistics for Bi...,7,"[0, 1, 2, 3, 4, 5, 6]"
1,102 Advancing Circular Bioeconomy Systems (CBS...,8,"[7, 8, 9, 10, 11, 12, 1121, 1122]"
2,103 China Exchange & AOCABFE Business Meeting-...,4,"[1140, 1141, 1142, 1143]"
3,104 Engineering Ethics across Cultures-RAP SES...,1,[1123]
4,"105 Advances in Biomass Preprocessing, Pretrea...",8,"[13, 14, 15, 16, 17, 18, 19, 20]"


### Analyze Sessions
- session_coherence = "Are presentations within this session similar?" (internal session quality)
- session_distinctiveness = "Is this session's topic unique compared to others?" (relative session positioning)
- presentation_session_fit = "Does this presentation match the topic of others in the session?" (presentation fit)

Session Coherence measures cluster cohesion. It reflects how tighly grouped the topic of presentations within the session are.

Session Distinctiveness measures how unique each session's topic is. High values mean the session has a clear, focused theme that's different from other sessions. Low values suggest either the session mixes different topics or overlaps too much with other sessions.

Presentation-Session Fit is an individual presentations's average similarity to other presentation in its session. Generically, it can be referred to as "within_cluster_fit", "cluster_membership_strength", or "local_cohesion_score".

In [179]:
from itertools import chain
from sklearn.metrics import silhouette_samples
def calculate_placement_metrics(df_presentations, df_sessions, pres_similarities_matrix, session_column_name='Session Code'):
    """
    Calculate comprehensive quality metrics for session assignments.
    
    This function computes three key metrics to evaluate how well presentations 
    are organized into sessions:
    
    1. Session Coherence: Average pairwise similarity within each session 
        (measures internal session quality)
    2. Session Distinctiveness: Silhouette score for each session 
        (measures how unique/separable each session is from others)
    3. Presentation-Session Fit: For each presentation, average similarity 
        to other presentations in the same session (measures individual fit)

    Args:
        df_presentations (pd.DataFrame): DataFrame containing presentation data with 
            session assignments in the specified column.
        df_sessions (pd.DataFrame): DataFrame containing session metadata with 
            presentation indices stored in session_organizer.COLUMNS['GEN_PRESENTATION_INDICES'].
        pres_similarities_matrix (np.ndarray): Symmetric similarity matrix where 
            element [i,j] represents similarity between presentations i and j.
        session_column_name (str, optional): Column name in df_presentations that 
            contains session identifiers. Defaults to 'Session Code'.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing:
            - df_sessions: Updated with 'session_coherence' and 'session_distinctiveness' columns
            - df_presentations: Updated with 'presentation_session_fit' column
            
    Raises:
        AssertionError: If presentation indices exceed the valid range for df_presentations.
        
    Note:
        The function modifies the input DataFrames in place and also returns them.
        Silhouette scores range from -1 to 1, where higher values indicate better separation.
        Similarity scores depend on the embedding model used but typically range from 0 to 1.
    """
    
    # Convert similarity to distance for silhouette calculation
    distance_matrix = 1 - pres_similarities_matrix
    # Perfect matches which occurs on the diagonal should have similiarity of 1 or 
    # distance 0, but they often don't due to numerical instability. 
    # So we ensure the diagonal is 0 to avoid self-similarity issues.
    np.fill_diagonal(distance_matrix, 0)
    # Any duplicates not removed should also have a distance of 0.
    # These will not appear on the diagonal, but we ensure no negative distances.
    distance_matrix[distance_matrix < 0] = 0

    pres_indices_by_session = df_sessions['gen_presentation_indices']
    pos_to_ind, ind_to_pos= session_organizer.create_index_mappings(df_presentations)   
    pres_positions_by_session = pres_indices_by_session.apply(lambda indices_list: [ind_to_pos[i] for i in indices_list])
    # Set the index of pres_positions_by_session to be the session names for easier lookup
    pres_positions_by_session.index = df_sessions['Session Name']
    max_position = max(chain.from_iterable(pres_positions_by_session))
    # Ensure max_position is within valid range
    assert max_position <= len(df_presentations) - 1, (
        f"Maximum position {max_position} exceeds maximum valid index {len(df_presentations) - 1} for df_presentations of length {len(df_presentations)}"
    )

    # Initialize outputs
    session_session_similarity_dataframe = pd.DataFrame(index=df_sessions['Session Name'], columns=df_sessions['Session Name'])
    session_average_similarity_series = pd.Series(index=df_sessions['Session Name'], dtype=float, name='session_coherence')
    presentation_session_fit_series = pd.Series(index=df_presentations.index, dtype=float, name='presentation_session_fit')


    # Calculate the silhouette scores for each presentation first
    # Get the assigned labels for each presentation
    labels_str = df_presentations[session_column_name].values
    # Convert string labels to numeric labels for silhouette_samples
    unique_labels, labels_numeric = pd.factorize(labels_str)
    sample_scores = silhouette_samples(distance_matrix, unique_labels, metric='precomputed')

    # Create a pandas Series for session distinctiveness with matching index
    session_distinctiveness_series = pd.Series(index=df_sessions.index, dtype=float, name='session_distinctiveness')

    # Calculate average silhouette score for each cluster
    for i, cluster_name in enumerate(labels_numeric):
        cluster_mask = labels_str == cluster_name
        if np.sum(cluster_mask) > 0:
            cluster_score = np.mean(sample_scores[cluster_mask])
            # Find the corresponding index in df_sessions
            session_index = df_sessions[df_sessions['Session Name'] == cluster_name].index
            if not session_index.empty:
                session_distinctiveness_series.loc[session_index[0]] = cluster_score


    # Calculate average similarity for each session
    for session_id_i, positions_i in pres_positions_by_session.items():
        for session_id_j, positions_j in pres_positions_by_session.items():
            if not positions_i or not positions_j:
                session_session_similarity_dataframe.loc[session_id_i, session_id_j] = 0.0
                continue
            session_i_to_j_similarities = pres_similarities_matrix[np.ix_(positions_i, positions_j)]
            if session_id_i == session_id_j:
                # First calculate the presentation to session similarities
                # Mask the diagonal (self-similarity) by setting it to np.nan, then compute mean of each row ignoring nan
                session_i_to_j_similarities_no_diag = session_i_to_j_similarities.copy()
                np.fill_diagonal(session_i_to_j_similarities_no_diag, np.nan)
                presentation_session_fits = np.nanmean(session_i_to_j_similarities_no_diag, axis=1)
                for i, pos in enumerate(positions_i):
                    presentation_session_fit_series.loc[pos_to_ind[pos]] = presentation_session_fits[i]
                # Same session: use upper triangle to avoid double-counting pairs
                n = session_i_to_j_similarities.shape[0]
                if n > 1:
                    triu_indices = np.triu_indices(n, k=1)
                    avg_similarity = np.mean(session_i_to_j_similarities[triu_indices])
                    # Store the average similarity in the series for this session
                    session_average_similarity_series.loc[session_id_i] = avg_similarity
                else:
                    avg_similarity = 0.0
            else:
                # Different sessions: use all similarities
                avg_similarity = np.mean(session_i_to_j_similarities)
            session_session_similarity_dataframe.loc[session_id_i, session_id_j] = avg_similarity

    # Calculate standard deviation of presentation fits for each session
    session_std_series = presentation_session_fit_series.groupby(df_presentations[session_column_name]).std()
    session_std_series.name = 'session_std'

    # Map session average similarity and std dev to each presentation
    presentation_session_avg_map = df_presentations[session_column_name].map(session_average_similarity_series)
    presentation_session_std_map = df_presentations[session_column_name].map(session_std_series)

    # Calculate raw and standardized deviation for each presentation
    presentation_raw_deviation_series = presentation_session_fit_series - presentation_session_avg_map
    presentation_raw_deviation_series.name = 'presentation_raw_deviation'
    
    presentation_std_series = presentation_raw_deviation_series / presentation_session_std_map
    presentation_std_series.name = 'presentation_std'
            
    return (presentation_session_fit_series, session_average_similarity_series, 
            session_distinctiveness_series, session_session_similarity_dataframe,
            session_std_series, presentation_raw_deviation_series, presentation_std_series)

In [180]:
presentation_session_fit_series, session_average_similarity_series, session_distinctiveness_series, df_session_session_similarity, session_std_series, presentation_raw_deviation_series, presentation_std_series = calculate_placement_metrics(
    df_presentations=df_all_no_embeddings,
    df_sessions=df_sessions,
    pres_similarities_matrix=similarity_matrix,
    session_column_name=session_column_name
)

  presentation_session_fits = np.nanmean(session_i_to_j_similarities_no_diag, axis=1)


In [181]:
# Save the series to the DataFrames
df_all_no_embeddings['Presentation Session Fit'] = presentation_session_fit_series.reindex(df_all_no_embeddings.index)
df_all_no_embeddings['Presentation Raw Deviation'] = presentation_raw_deviation_series.reindex(df_all_no_embeddings.index)
df_all_no_embeddings['Presentation Standardized Deviation'] = presentation_std_series.reindex(df_all_no_embeddings.index)

# Align the index of session_average_similarity_series to match the 'Session Name' column in df_sessions
if 'Session Name' in df_sessions.columns:
    df_sessions['Session Coherence'] = df_sessions['Session Name'].map(session_average_similarity_series)
    df_sessions['Session Std Dev'] = df_sessions['Session Name'].map(session_std_series)
else:
    df_sessions['Session Coherence'] = session_average_similarity_series.reindex(df_sessions.index)
    df_sessions['Session Std Dev'] = session_std_series.reindex(df_sessions.index)
df_sessions['Session Distinctiveness'] = session_distinctiveness_series.reindex(df_sessions.index)

In [182]:
columns_to_drop = [
    'presentation_session_fit', # remove repetitive topic column
]
df_all_no_embeddings = df_all_no_embeddings.drop(columns_to_drop, axis=1, errors='ignore')
columns_to_drop = [
    'session_coherence', # remove repetitive topic column
    'session_distinctiveness', # remove repetitive topic column
]
df_sessions = df_sessions.drop(columns_to_drop, axis=1, errors='ignore')

In [183]:
columns_to_drop = [
    'topic', # remove repetitive topic column
]
df_all_no_embeddings = df_all_no_embeddings.drop(columns_to_drop, axis=1, errors='ignore')

## Create Encrypted Dataframe

In [184]:
columns_to_drop = [
    'Abstract',
    'Title and Abstract',
]
df_no_abstract = df_all_no_embeddings.drop(columns_to_drop, axis=1, errors='ignore')

In [185]:
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv(".env")
if "DATAFRAME_PW" not in os.environ:
    raise ValueError(
        "Dataframe Encryption Password must be in environmental variables. DATAFRAME_PW not found in environment variables. Please set it in your .env file."
    )
else:
    password_df = os.environ["DATAFRAME_PW"]
filename_enc_df = 'encrypted_df.crypt'
# Create an encrypted dataframe for publishing.
import cryptpandas as crp

# Ensure '7 digit ID' and 'Start time' are string type to avoid ArrowInvalid errors
columns_to_string = ['7 digit ID', 'Start time']
for col in columns_to_string:
    if col in df_all_no_embeddings.columns:
        df_all_no_embeddings[col] = df_all_no_embeddings[col].astype(str)

# Encrypt the DataFrame and save as a pickle
crp.to_encrypted(df_all_no_embeddings, password=password_df, path=filename_enc_df)

In [189]:
# Ensure '7 digit ID' and 'Start time' are string type to avoid ArrowInvalid errors
columns_to_string = ['7 digit ID', 'Start time']
for col in columns_to_string:
    if col in df_no_abstract.columns:
        df_no_abstract[col] = df_no_abstract[col].astype(str)

# Save the abstract-free DataFrame without encryption
df_no_abstract.to_parquet('df_no_abstractAIM25.parquet', compression='snappy')

In [187]:
# Save the Sessions Dataframe
df_sessions.to_parquet('df_sessionsAIM25.parquet', compression='snappy')

### Save the similarities

In [188]:
pres_similarities_df = pd.DataFrame(similarity_matrix, 
                                    index=df_all_embeddings.index, 
                                    columns=df_all_embeddings.index)
# Save presentation similarities matrix
pres_similarities_df.to_parquet('pres_similarities_matrixAIM25.parquet', compression='snappy')

# Save the session similiarities matrix DataFrame
df_session_session_similarity.to_parquet('session_similarities_matrixAIM25.parquet', compression='snappy')

print("✓ Similarity matrices saved as Parquet files")

✓ Similarity matrices saved as Parquet files


### Match Committees to Related Sessions

In [None]:
# Read the committee file from CSV/Excel with flexible column selection
committee_file_path = 'ASABE Committees.csv'  # Update this path as needed (can also use .xlsx)

# Load committees
df_committees, committee_name_column, description_column, combined_column = session_organizer.load_committees(
    committee_file_path,
    Committee_Name_column='Committee_Name',  # Actual column name in your file
    Description_column='Description',        # Actual column name in your file
    committee_name_column='Committee_Name',  # Desired output column name
    description_column='Description',        # Desired output column name
    combined_column='Name_Description'       # Combined column for embeddings
)

print(f"Loaded {len(df_committees)} committees")
print(f"Committee name column: {committee_name_column}")
print(f"Description column: {description_column}")
print(f"Combined column: {combined_column}")
print("\nFirst few committees:")
print(df_committees[[committee_name_column, description_column]].head())

# Generate embeddings for committees using the combined column
df_committee_embeddings = embed_with_resume(df_committees, combined_column, 'gemini-embedding-exp-03-07')

In [None]:
# Find the most similar committees for each session
session_committee_matches = session_organizer.find_most_similar_committees_by_presentations(
    df_sessions, 
    df_all_embeddings, 
    df_committees, 
    df_committee_embeddings, 
    top_n=3,
)

In [None]:
df_sessions = session_organizer.add_committee_matches_to_clusters(df_sessions, session_committee_matches)

In [1]:
df_embeddings_main = load_embeddings_from_file("embeddings_final_presentations_25_main.pkl")

NameError: name 'load_embeddings_from_file' is not defined