In [20]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [23]:
# Load raw data
enrichments = pd.read_csv("Testing_Data/101724_all_combined_counts_post_processing.csv")
samples_info = pd.read_csv("Testing_Data/VIR3_clean.csv", usecols = ["id", "peptide"])
print("P1 enriched peptides: ", sum(enrichments["r3_P1enriched"]))
print("P1 high confidence peptides: ", (enrichments['r3_P1CV'] < 30).sum())


# Parsing important data
data = pd.DataFrame()
data["id"] = enrichments["id"]
data["sequence"] = samples_info["peptide"]
data["enrichment"] = enrichments["r3_P1"]
data["coef_of_var"] = enrichments["r3_P1CV"]

# Filter out low confidence peptides
data = data[data["coef_of_var"] < 30]
print(sum(data["enrichment"] == 0))

print(sum(data["enrichment"] >= 1))

data['binds'] = (data['enrichment'] > 0).astype(int)


data.head()

P1 enriched peptides:  198
P1 high confidence peptides:  43548
29909
5498


Unnamed: 0,id,sequence,enrichment,coef_of_var,binds
0,1,MRSLLFVVGAWVAALVTNLTPDAALASGTTTTAAAGNTSATASPGD...,0.0,0.0,0
1,2,TTTTAAAGNTSATASPGDNATSIDAGSTITAAAPPGHSTPWPALPT...,0.905954,4.690751,1
2,3,ITAAAPPGHSTPWPALPTDLALPLVIGGLCALTLAAMGAGALLHRC...,0.0,0.0,0
3,4,LCALTLAAMGAGALLHRCCRRCARRRQNVSSVSA,0.0,0.0,0
5,6,MGFGAAAALLALAVALARVPAGGGAYVPVDRALTRVSPNRFRGSSL...,0.0,0.0,0


In [None]:
# Initialize percentiles and counts array
percentiles = np.arange(0, 1, 0.005)
counts = []

for percentile in percentiles:
    counts.append(data['count'].quantile(percentile))

percentile_80th = data['count'].quantile(0.85)
print(f"80th percentile cutoff: {percentile_80th}")
plt.plot(percentiles, counts)
plt.xlabel("Percentile")
plt.ylabel("Count")
plt.show()

plt.hist(counts, bins=350)
plt.show()


In [30]:
data['binds'] = np.where(data['count'] > 0, 1, 0)
data.head()

Unnamed: 0,id,sequence,count,binds
0,1,MRSLLFVVGAWVAALVTNLTPDAALASGTTTTAAAGNTSATASPGD...,0,0
1,2,TTTTAAAGNTSATASPGDNATSIDAGSTITAAAPPGHSTPWPALPT...,98,1
2,3,ITAAAPPGHSTPWPALPTDLALPLVIGGLCALTLAAMGAGALLHRC...,0,0
3,4,LCALTLAAMGAGALLHRCCRRCARRRQNVSSVSA,0,0
4,5,RDRGPSRSRVRYTRLAASEA,338,1


In [31]:
print("Data points before trimming: ", len(data))
# Remove samples with more than 100 count
data = data[data['count'] <= 60]
print("Data points after trimming: ", len(data))
zero_count = sum(data['binds'])
print(zero_count, "samples do not bind and", len(data)-zero_count, "samples do bind")

Data points before trimming:  115753
Data points after trimming:  98475
36999 samples do not bind and 61476 samples do bind


In [19]:
# Splitting into train, val, and test sets
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Reset indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save datasets
train_df.to_csv('Testing_Data/VS_train_data.csv', index=False)
val_df.to_csv('Testing_Data/VS_val_data.csv', index=False)
test_df.to_csv('Testing_Data/VS_test_data.csv', index=False)