This notebook will be used for sampling large dataset to get smaller one that will then be incorporated into the corpus, piece by piece. IDs will be chosen randomly and we can specify how much IDs we want. Then for each ID we fetch a few rows (we can specify how much) from preferably different papers. If there isn't enough different papers, some can be repeated, but the combination of the software mention (ID), doi and paragraph text in which the software is mentioned need to be unique. There is also an option to sample exact software mentions we ask for. 

In [1]:
import pandas as pd
import numpy as np
from typing import List,Optional


In [19]:
github = pd.read_csv('D:/MASTER/TMF/Software-Disambiguation/corpus/github_from_CZI.csv')
github = github[github['exact_match']]
print(github.shape)
print(github.columns)
print(github.head())

(1327602, 11)
Index(['ID', 'software_mention', 'mapped_to', 'platform', 'package_url',
       'homepage_url', 'other_urls', 'github_repo', 'exact_match', 'doi',
       'paragraph'],
      dtype='object')
      ID software_mention mapped_to  platform                    package_url  \
0  SM900              XDS       XDS       NaN  https://github.com/ichfly/XDS   
1  SM900              XDS       XDS       NaN  https://github.com/ichfly/XDS   
2  SM900              XDS       XDS       NaN  https://github.com/ichfly/XDS   
3  SM900              XDS       XDS       NaN  https://github.com/ichfly/XDS   
4  SM900              XDS       XDS       NaN  https://github.com/ichfly/XDS   

   homepage_url  other_urls                    github_repo  exact_match  \
0           NaN         NaN  https://github.com/ichfly/XDS         True   
1           NaN         NaN  https://github.com/ichfly/XDS         True   
2           NaN         NaN  https://github.com/ichfly/XDS         True   
3           NaN

In [2]:
def get_random_ids(df: pd.DataFrame, num_ids: int, seed: int = None) -> np.ndarray:
    """
    Sample `num_ids` unique IDs at random from df.ID.
    """
    uniq_ids = df['ID'].unique()
    if seed is not None:
        np.random.seed(seed)
    return np.random.choice(uniq_ids, size=num_ids, replace=False)

In [3]:

def get_ids_by_software(df: pd.DataFrame, software_names: List[str]) -> np.ndarray:
    """
    Return the unique IDs from df where the 'software_mention' column
    matches any of the names in software_names.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing at least the columns 'ID' and 'software_mention'.
    software_names : List[str]
        List of software names to look up in df['software_mention'].
        
    Returns
    -------
    np.ndarray
        Array of unique IDs corresponding to the given software names.
        If none match, returns an empty array.
    """
    # Filter rows where software_mention is in the provided list
    mask = df['software_mention'].isin(software_names)
    
    # Extract unique IDs
    uniq_ids = df.loc[mask, 'ID'].unique()
    
    return uniq_ids

In [4]:
def sample_for_id(group: pd.DataFrame, n_per_id: int = 5) -> pd.DataFrame:
    """
    For a single-ID group:
      - If there are >= n_per_id distinct DOIs, pick one row per DOI up to n_per_id.
      - Otherwise, take one per DOI and then fill up to n_per_id by sampling additional rows.
    Returns a DataFrame of up to n_per_id unique rows.
    """
    unique_dois = group['doi'].unique()
    picks = []

    if len(unique_dois) >= n_per_id:
        chosen = np.random.choice(unique_dois, size=n_per_id, replace=False)
        for doi in chosen:
            picks.append(group[group['doi'] == doi].sample(1))
    else:
        # one row per DOI
        for doi in unique_dois:
            picks.append(group[group['doi'] == doi].sample(1))
        # fill up the rest
        needed = n_per_id - len(unique_dois)
        remaining = group.drop(pd.concat(picks).index)
        if needed > 0 and len(remaining) > 0:
            picks.append(remaining.sample(min(needed, len(remaining))))

    return pd.concat(picks)


In [5]:
def sample_data(
    df: pd.DataFrame,
    num_ids: int = 100,
    n_per_id: int = 5,
    seed: Optional[int] = None,
    software_names: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Orchestrate the sampling:
      1. If `software_names` is a non-empty list, pick IDs for those software via get_ids_by_software.
         Otherwise sample `num_ids` IDs at random via get_random_ids.
      2. For each ID, sample up to `n_per_id` rows via sample_for_id.
      3. Return the concatenated sample.
    
    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns 'ID' and 'software_mention'.
    num_ids : int
        How many IDs to randomly sample if `software_names` is empty.
    n_per_id : int
        How many rows to sample per ID.
    seed : int or None
        Random seed for reproducibility (only used when sampling randomly).
    software_names : list of str, optional
        If provided and non-empty, select IDs for these software names instead of random.
    
    Returns
    -------
    pd.DataFrame
        Sampled subset of `df`.
    """
    # 1. choose IDs
    if software_names:
        # use the list of software names to pick IDs
        selected_ids = get_ids_by_software(df, software_names)
    else:
        # fall back to random sampling
        selected_ids = get_random_ids(df, num_ids, seed)

    # 2. filter and sample
    sub = df[df['ID'].isin(selected_ids)]
    sampled = (
        sub
        .groupby('ID', group_keys=False)
        .apply(sample_for_id, n_per_id=n_per_id)
        .reset_index(drop=True)
    )
    return sampled


In [23]:
github_sampled = sample_data(github, num_ids=100, n_per_id=5, seed=42)
github_sampled.to_csv('D:/MASTER/TMF/Software-Disambiguation/corpus/github_sampled.csv', index=False)

  .apply(sample_for_id, n_per_id=n_per_id)


In [8]:
pypi = pd.read_csv('D:/MASTER/TMF/Software-Disambiguation/corpus/pypi_from_CZI.csv')
#pypi = pypi[pypi['exact_match']]
print(pypi.shape)
print(pypi.columns)
print(pypi.head())

(292233, 11)
Index(['ID', 'software_mention', 'mapped_to', 'platform', 'package_url',
       'homepage_url', 'other_urls', 'github_repo', 'exact_match', 'doi',
       'paragraph'],
      dtype='object')
       ID software_mention mapped_to platform                 package_url  \
0  SM5081                0         0     Pypi  https://pypi.org/project/0   
1  SM5081                0         0     Pypi  https://pypi.org/project/0   
2  SM5081                0         0     Pypi  https://pypi.org/project/0   
3  SM5081                0         0     Pypi  https://pypi.org/project/0   
4  SM5081                0         0     Pypi  https://pypi.org/project/0   

  homepage_url  other_urls github_repo  exact_match  \
0       [None]         NaN      [None]         True   
1       [None]         NaN      [None]         True   
2       [None]         NaN      [None]         True   
3       [None]         NaN      [None]         True   
4       [None]         NaN      [None]         True   

   

In [25]:
pypi_sampled = sample_data(pypi, num_ids=100, n_per_id=5, seed=42)
pypi_sampled.to_csv('D:/MASTER/TMF/Software-Disambiguation/corpus/pypi_sampled.csv', index=False)

  .apply(sample_for_id, n_per_id=n_per_id)


In [None]:
pypi_sampled_top_10 = sample_data(pypi,
    n_per_id=10,
    software_names=["numpy", "tensorflow", "scikit-learn", "pandas", "matplotlib","requests",
    "beautifulsoup4", "flask", "django", "pytorch","beautifulsoup", "flask", "django","BeautifulSoup4", "Flask", "Django","BeautifulSoup", "Beautiful Soup"]
)
pypi_sampled_top_10.to_csv('D:/MASTER/TMF/Software-Disambiguation/corpus/pypi_sampled_top_10.csv', index=False)

  .apply(sample_for_id, n_per_id=n_per_id)
