# Training Data Cleaning and Split

In [1]:
import logging
import logging.config
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# TODO: remove this sys.path workaround; the `common` imports below should resolve without it
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))


from common.utils import collect_files, get_or_create_folder
from common.logger import get_logger_config
from common.constants import (
    BASE_RAW_DATA_DIR,
    BASE_LOGS_DIR,
    BASE_PLOTS_DIR,
    BASE_REPORTS_CSV_DIR,
)

In [61]:
# Configure logging for this notebook from the project's dictConfig-style settings.
# `dictConfig` lives in the `logging.config` submodule, which a plain `import logging`
# does not guarantee to load; it must be imported explicitly (`import logging.config`)
# for this cell to work on a fresh kernel.
# NOTE(review): the saved output of this cell is `AssertionError: None` - presumably
# `get_logger_config` rejects `subdir=None`; confirm the expected value in `common.logger`.
logger_config = get_logger_config(subdir=None)
logging.config.dictConfig(logger_config)
logger = logging.getLogger(__name__)

AssertionError: None

## Remove entries with `precursor_charge` less than 2

## Check how the peptides in this dataset overlap with the peptides from Kevin

In [11]:
# Gather the CSV reports found under BASE_REPORTS_CSV_DIR
# (presumably a list of file paths - confirm against `common.utils.collect_files`).
peptides_file_paths = collect_files(
    BASE_REPORTS_CSV_DIR,
    ext="csv",
)


In [17]:
# TODO: Improve the `collect_files` function so this filtering can happen there.
# Keep only the per-project "unique_peptides" reports. The aggregated
# "overall_projects_unique_peptides_counting_*.csv" file that this notebook later writes
# into BASE_REPORTS_CSV_DIR also contains that substring, so it is excluded explicitly;
# otherwise a re-run would read the notebook's own previous output back in as input.
# `str(path)` keeps the check valid whether the paths are strings or Path objects.
peptides_file_paths = [
    path
    for path in peptides_file_paths
    if "unique_peptides" in str(path)
    and not os.path.basename(str(path)).startswith("overall_projects_unique_peptides")
]

assert peptides_file_paths, f"No 'unique_peptides' CSV file found under {BASE_REPORTS_CSV_DIR}"

# Concatenate every selected report into a single frame.
# ATTENTION: load time grows with the number (and size) of CSV files read here.
df = pd.concat(
    [pd.read_csv(file) for file in peptides_file_paths],
    ignore_index=True,
)
df.head(20)

Unnamed: 0,Unique Peptides
0,HNGTGGR
1,SQNCHNSSSR
2,AAGMNHTK
3,ANASHDQPQK
4,HNDSGASECR
5,GGGGGGGGGGGGGSGSSSGSSTSR
6,RQQQQQQQQQQQQK
7,QQQQQQQQQQQQK
8,KNDSGAYR
9,KCLNHTTQK


In [41]:
# Summary of the peptide column (count, unique, top, freq).
df.loc[:, "Unique Peptides"].describe()

count                 73643
unique                44976
top       AVCMLSNTTAIAEAWAR
freq                      8
Name: Unique Peptides, dtype: object

In [55]:
# Distinct, non-missing peptides across all loaded reports.
# NOTE: despite the `_df` suffix this is a NumPy array, not a DataFrame; the name is
# kept because a later cell passes it to `compute_and_save_overlap`.
unique_peptides_df = df["Unique Peptides"].dropna().unique()

# The count embedded in the file name is taken from the very array that is written,
# so the name and the number of rows in the file cannot disagree
# (`.unique()` keeps NaN while `.nunique()` ignores it).
overall_unique_peptides_path = BASE_REPORTS_CSV_DIR / (
    f"overall_projects_unique_peptides_counting_{len(unique_peptides_df)}.csv"
)
pd.DataFrame({"Unique Peptides": unique_peptides_df}).to_csv(
    overall_unique_peptides_path, index=False
)

### Overlap with peptides from identity files from Kevin

In [3]:
def _read_unique_sequences(csv_path):
    """Return the set of distinct, non-missing values in the ``sequence`` column of ``csv_path``."""
    return set(pd.read_csv(csv_path)["sequence"].dropna().unique())


def compute_and_save_overlap(file_path, reference_peptides):
    """Find the peptides shared by a CSV file and a reference, and save them as a CSV.

    Parameters
    ----------
    file_path : str or os.PathLike
        Name of a CSV file, resolved against ``BASE_REPORTS_CSV_DIR``. Its peptides
        are read from the ``sequence`` column.
    reference_peptides : str, os.PathLike, or iterable
        Either the name of another CSV file (resolved and read the same way, and
        labelled with that file's stem), or an in-memory collection of peptides
        (labelled ``"glyco_projects"``).

    Returns
    -------
    None
        The overlap is written to ``BASE_REPORTS_CSV_DIR`` in a file named
        ``overlap_<file stem>_<n peptides>_with_<reference label>_<n reference>_found_<n overlap>.csv``
        with a single ``Overlapped peptides`` column. The overlap count and the
        output path are printed.

    Raises
    ------
    KeyError
        If a CSV that is read has no ``sequence`` column.
    """
    file_path = BASE_REPORTS_CSV_DIR / file_path
    unique_peptides = _read_unique_sequences(file_path)

    if isinstance(reference_peptides, (str, os.PathLike)):
        ref_path = BASE_REPORTS_CSV_DIR / reference_peptides
        reference_project_name = ref_path.stem
        reference_set = _read_unique_sequences(ref_path)
    else:
        reference_project_name = "glyco_projects"
        # De-duplicate and drop missing values so the count reported in the file
        # name is the number of distinct reference peptides.
        reference_set = {peptide for peptide in reference_peptides if not pd.isna(peptide)}

    # Sorted so the saved file is identical from run to run (set order is not).
    overlap_peptides = sorted(reference_set & unique_peptides)
    print(f"Overlap count = {len(overlap_peptides)}")

    overlap_df = pd.DataFrame({"Overlapped peptides": overlap_peptides})
    output_file = BASE_REPORTS_CSV_DIR / (
        f"overlap_{file_path.stem}_{len(unique_peptides)}"
        f"_with_{reference_project_name}_{len(reference_set)}"
        f"_found_{len(overlap_peptides)}.csv"
    )
    overlap_df.to_csv(output_file, index=False)

    print(f"Saved: {output_file}")

In [79]:
 # identity_splits_proteome_tools_from_kevin.csv
# Identity-split CSV files from Kevin; each name is resolved against
# BASE_REPORTS_CSV_DIR inside `compute_and_save_overlap`.
identity_files_from_kevin = [
    "identity_splits_proteome_tools_from_kevin.csv",
    "identity_splits_massivekb_from_kevin.csv",
    "identity_splits_blacklist_from_kevin.csv",
    "identity_splits_phospho_from_kevin.csv",
]

# Compare every identity file against the unique peptides gathered earlier in this notebook.
for identity_file_name in identity_files_from_kevin:
    compute_and_save_overlap(
        file_path=identity_file_name,
        reference_peptides=unique_peptides_df,
    )

                    sequence  split
0                  YLQNWSHVL  train
1                    GLPELRR  train
2                   LLYNLFHK  train
3                  CLQEEEDMR  train
4               QPRPQSTSLLAL  train
...                      ...    ...
753910           YVDQVLQLVYK   test
753911      GFSVVADTPELQRLKK   test
753912              TSLPTPLR   test
753913  SCTTDVRPLSGSRPVCPLCK   test
753914             YAFNSLQLK   test

[753915 rows x 2 columns]
Saved: /home/hjisaac/AI4Science/instanovo_instadeep/InstanovoGlyco/reports/csv_misc/overlap_identity_splits_proteome_tools_from_kevin_753915_with_projects_44976_found_5491.csv
                                   sequence  split
0                       NTKEPPLSLTLHLTSPVVR  train
1        MNLLPCNPHGNGLLYAGFNQDHGCFACGMENGFR  train
2                               LNLFPDGGLAR  train
3                                SGELAQEYDK  train
4                             HLVLPVQPENALK  train
...                                     ...    ...
1608962 

In [4]:
# Overlap between the proteome_tools identity split and the blacklist identity split.
compute_and_save_overlap(
    file_path="identity_splits_proteome_tools_from_kevin.csv",
    reference_peptides="identity_splits_blacklist_from_kevin.csv",
)

                    sequence  split
0                  YLQNWSHVL  train
1                    GLPELRR  train
2                   LLYNLFHK  train
3                  CLQEEEDMR  train
4               QPRPQSTSLLAL  train
...                      ...    ...
753910           YVDQVLQLVYK   test
753911      GFSVVADTPELQRLKK   test
753912              TSLPTPLR   test
753913  SCTTDVRPLSGSRPVCPLCK   test
753914             YAFNSLQLK   test

[753915 rows x 2 columns]
Overlap count = 75390
Saved: /home/hjisaac/AI4Science/instanovo_instadeep/InstanovoGlyco/reports/csv_misc/overlap_identity_splits_proteome_tools_from_kevin_753915_with_identity_splits_blacklist_from_kevin_248900_found_75390.csv


In [2]:
# Display the resolved reports directory that the cells above read from and write to.
BASE_REPORTS_CSV_DIR

PosixPath('/home/hjisaac/AI4Science/instanovo_instadeep/InstanovoGlyco/reports/csv_misc')