This notebook appends new corpora, which were sampled from CZI and then cleaned, to the current corpus and saves the result as a new version.

In [69]:
import pandas as pd
from typing import List, Union


In [70]:
def ensure_list(x):
    """
    Turn a single cell value into a list.

    - a list is returned unchanged
    - a string is split on commas; pieces are stripped and empty ones dropped
    - a missing value (per ``pd.isna``) or anything equal to '' gives []
    - any other value is wrapped in a one-element list
    """
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        # An empty or whitespace-only string yields no pieces, hence [].
        stripped_parts = (part.strip() for part in x.split(','))
        return [part for part in stripped_parts if part]
    if pd.isna(x) or x == '':
        return []
    return [x]

In [71]:
def preprocess_new_dataframe(df: pd.DataFrame,
                             annotator: str = 'JelenaDuric') -> pd.DataFrame:
    """
    Group & clean a new source DataFrame (the caller's frame is not modified):
      - Rename 'software_mention' → 'name' (the key used for grouping)
      - Build `candidate_urls` & `url (ground truth)` lists from the URL columns;
        a URL counts as ground truth only when the row's 'exact_match' is truthy
        (a missing 'exact_match' column or a NaN value counts as "no match")
      - Group by (name, doi, paragraph), de-duplicating URLs in first-seen order
      - Add the 'annotator' column

    Args:
        df: Source rows. Must contain 'doi', 'paragraph', 'authors_oa',
            'authors' and 'field/topic/keywords', plus 'software_mention'
            (or an already-renamed 'name'). 'package_url' and 'exact_match'
            are optional.
        annotator: Value written to the 'annotator' column of every row.

    Returns:
        One row per (name, doi, paragraph) with list-valued 'candidate_urls'
        and 'url (ground truth)' columns.
    """
    def _merge_unique(url_lists):
        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # result is reproducible between runs (set() ordering is not).
        return list(dict.fromkeys(u for urls in url_lists for u in urls))

    # rename() returns a new frame, so the columns added below never leak
    # back into the DataFrame the caller passed in.
    work = df.rename(columns={'software_mention': 'name'})

    # Ensure URL cols exist
    url_cols = ['package_url']
    for col in url_cols:
        if col not in work.columns:
            work[col] = ''

    # Per-row list of cleaned, non-empty URL strings.
    candidates = [
        [u.strip() for u in row if isinstance(u, str) and u.strip()]
        for row in work[url_cols].itertuples(index=False, name=None)
    ]

    # NaN is truthy in Python, so test for missing values explicitly.
    if 'exact_match' in work.columns:
        exact = [pd.notna(flag) and bool(flag) for flag in work['exact_match']]
    else:
        exact = [False] * len(work)

    work['candidate_urls'] = pd.Series(candidates, index=work.index, dtype=object)
    work['url (ground truth)'] = pd.Series(
        [list(urls) if is_exact else []
         for urls, is_exact in zip(candidates, exact)],
        index=work.index,
        dtype=object,
    )

    # Group & aggregate.
    # NOTE: groupby's default dropna=True drops rows whose name/doi/paragraph
    # is NaN.
    grouped = work.groupby(['name', 'doi', 'paragraph'], as_index=False).agg({
        'candidate_urls':       _merge_unique,
        'url (ground truth)':   _merge_unique,
        'authors_oa':           'first',
        'authors':              'first',
        'field/topic/keywords': 'first',
    })

    # Finalize
    grouped['annotator'] = annotator
    return grouped


In [72]:
def append_and_deduplicate(corpus_df: pd.DataFrame,
                           new_grouped_df: pd.DataFrame) -> pd.DataFrame:
    """
    Append new grouped rows to the corpus and collapse duplicate mentions.

    Steps:
      - concatenate corpus rows (first) and new rows
      - normalize both URL columns to lists with `ensure_list`
      - re-group on (name, doi, paragraph), merging URL lists without
        duplicates (first-seen order) and keeping the first non-null value
        of every other column, including 'comments'
      - convert URL lists back to comma-separated strings
      - add a fresh sequential 'id' starting at 1 and order the columns

    Neither input DataFrame is modified.

    Args:
        corpus_df: Existing corpus; URL cells may be comma-separated strings,
            lists or missing.
        new_grouped_df: New rows keyed by 'name', 'doi' and 'paragraph'.

    Returns:
        The merged corpus with columns id, name, doi, paragraph, authors_oa,
        authors, field/topic/keywords, url (ground truth), annotator,
        comments, candidate_urls.
    """
    def _merge_unique(url_lists):
        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # saved strings are reproducible between runs (set() ordering is not).
        return list(dict.fromkeys(u for urls in url_lists for u in urls))

    # Work on copies so the caller's frames are left untouched.
    corpus = corpus_df.copy()
    new_rows = new_grouped_df.copy()

    # Preserve any existing comments column, or create empty if missing
    for frame in (corpus, new_rows):
        if 'comments' not in frame.columns:
            frame['comments'] = ''

    combined = pd.concat([corpus, new_rows], ignore_index=True, sort=False)

    # Normalize after concatenating: a URL column missing from one input shows
    # up here as NaN cells, which ensure_list turns into empty lists.
    for col in ['candidate_urls', 'url (ground truth)']:
        if col not in combined.columns:
            combined[col] = ''
        combined[col] = combined[col].apply(ensure_list)

    # Any pre-existing 'id' column is dropped here because it is not aggregated.
    deduped = combined.groupby(
        ['name', 'doi', 'paragraph'], as_index=False
    ).agg({
        'authors_oa':           'first',
        'authors':              'first',
        'field/topic/keywords': 'first',
        'url (ground truth)':   _merge_unique,
        'annotator':            'first',
        'comments':             'first',
        'candidate_urls':       _merge_unique,
    })

    # Convert URL lists back to comma-separated strings
    deduped['url (ground truth)'] = deduped['url (ground truth)'].apply(','.join)
    deduped['candidate_urls'] = deduped['candidate_urls'].apply(','.join)

    # Re-fill `id` sequentially
    deduped.insert(0, 'id', range(1, len(deduped) + 1))

    # Reorder columns exactly as requested
    final_cols = [
        'id',
        'name',
        'doi',
        'paragraph',
        'authors_oa',
        'authors',
        'field/topic/keywords',
        'url (ground truth)',
        'annotator',
        'comments',
        'candidate_urls',
    ]
    return deduped[final_cols]

In [73]:
def update_corpus(corpus_df: pd.DataFrame,
                  new_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Merge several new source DataFrames into the corpus.

    Every frame in `new_dfs` is passed through `preprocess_new_dataframe`;
    the results are stacked into a single frame, which is then handed to
    `append_and_deduplicate` together with `corpus_df`. The frame returned
    by that call is the updated corpus.
    """
    prepared = []
    for source_df in new_dfs:
        prepared.append(preprocess_new_dataframe(source_df))

    stacked_new = pd.concat(prepared, sort=False, ignore_index=True)
    return append_and_deduplicate(corpus_df, stacked_new)


In [74]:
# Load the current corpus (Excel) and the newly sampled source tables (CSV).
corpus = pd.read_excel('../corpus_v1.xlsx')
cran_top_10 = pd.read_csv('../cran_sampled_top_10.csv')
cran = pd.read_csv('../cran_sampled.csv')
pypi_top_10 = pd.read_csv('../pypi_sampled_top_10.csv')
pypi = pd.read_csv('../pypi_sampled.csv')
# The GitHub sample is read with ';' as the field separator.
github = pd.read_csv('../github_sample_cleaned.csv', sep=';')

# Merge all new sources into the corpus, then save the result as an Excel file.
new_sources = [cran_top_10, cran, pypi_top_10, pypi, github]
new_df = update_corpus(corpus, new_sources)
new_df.to_excel('../corpus_v2.xlsx', index=False)
