In [19]:
import pandas as pd

# Lists to store the processed headers and sequences
headers = []
sequences = []
current_seq = ""

# Open and read the file
with open("Sequence_data.txt", "r") as file:
    for line in file:
        line = line.strip()
        if line.startswith(">"):
            # If there is an existing sequence, append it before starting a new one
            if current_seq:
                sequences.append(current_seq)
                current_seq = ""
            # Remove the ">" and extract the middle part from the header
            full_header = line[1:]
            parts = full_header.split("|")
            # Use the middle part if available; otherwise, use the full header
            middle = parts[1] if len(parts) > 1 else full_header
            headers.append(middle)
        else:
            # Concatenate sequence lines
            current_seq += line
    # Append the last collected sequence
    if current_seq:
        sequences.append(current_seq)

# Create a DataFrame with the extracted header parts and sequences
df = pd.DataFrame({
    "Header": headers,
    "Sequence": sequences
})

df.head()

Unnamed: 0,Header,Sequence
0,E2RU10,MPPVKAPGNVSDCYFVGRVSLLKWISELLNEPVKKVEDLASGHHYC...
1,A8BPK8,MSHSNAPELHPQIVDPFHNVTYRPGKLLGKGGFAYVYEFHDVNSDS...
2,E2RTQ7,MPQHLVPHTGTGKRTTIEDFEIGRFLGRGKYGLVYLAREQSSKLVV...
3,E2RU97,MAEAFTREDYVFMAQLNENAERYDEMVETMRKISGMEGELSDKERN...
4,E9BPW4,MTETFAFQAEINQLMSLIINTFYSNKEIFLRELISNASDACDKIRY...


In [20]:
df.shape

(7551, 2)

In [21]:
df_labels = pd.read_excel("labels.xlsx") 
print(df_labels.shape)
df_labels.head()

(31349, 3)


Unnamed: 0,UniProt ID,AA,Position
0,E2RU10,S,148
1,A8BPK8,T,179
2,A8BPK8,T,183
3,E2RTQ7,T,205
4,E2RU97,T,214


In [22]:
df_merged = pd.merge(
    df,
    df_labels,
    left_on="Header",
    right_on="UniProt ID",
    how="left"
)

df_merged["target"] = df_merged["UniProt ID"].notnull().astype(int)

print(df_merged.head())

   Header                                           Sequence UniProt ID AA  \
0  E2RU10  MPPVKAPGNVSDCYFVGRVSLLKWISELLNEPVKKVEDLASGHHYC...     E2RU10  S   
1  A8BPK8  MSHSNAPELHPQIVDPFHNVTYRPGKLLGKGGFAYVYEFHDVNSDS...     A8BPK8  T   
2  A8BPK8  MSHSNAPELHPQIVDPFHNVTYRPGKLLGKGGFAYVYEFHDVNSDS...     A8BPK8  T   
3  E2RTQ7  MPQHLVPHTGTGKRTTIEDFEIGRFLGRGKYGLVYLAREQSSKLVV...     E2RTQ7  T   
4  E2RU97  MAEAFTREDYVFMAQLNENAERYDEMVETMRKISGMEGELSDKERN...     E2RU97  T   

   Position  target  
0     148.0       1  
1     179.0       1  
2     183.0       1  
3     205.0       1  
4     214.0       1  


In [23]:
df_merged.shape

(31350, 6)

In [24]:
df_merged.isnull().sum()

Header        0
Sequence      0
UniProt ID    1
AA            1
Position      1
target        0
dtype: int64

In [25]:
missing_rows = df_merged[df_merged.isnull().any(axis=1)]
print(missing_rows)

      Header                                           Sequence UniProt ID  \
6124  Q12053  MAQSRQLFLFGDQTADFVPKLRSLLSVQDSPILAAFLDQSHYVVRA...        NaN   

       AA  Position  target  
6124  NaN       NaN       0  


In [26]:
# 1. Identify the row(s) with missing UniProt ID
missing_uniprot = df_merged[df_merged["UniProt ID"].isnull()]
print("Rows with missing UniProt ID:\n", missing_uniprot)

# 2. Drop those rows from df_merged
df_merged.dropna(subset=["UniProt ID"], inplace=True)

# Now df_merged no longer contains the row(s) with NaN in UniProt ID
print("DataFrame after dropping missing UniProt ID rows:")
print(df_merged.head())

Rows with missing UniProt ID:
       Header                                           Sequence UniProt ID  \
6124  Q12053  MAQSRQLFLFGDQTADFVPKLRSLLSVQDSPILAAFLDQSHYVVRA...        NaN   

       AA  Position  target  
6124  NaN       NaN       0  
DataFrame after dropping missing UniProt ID rows:
   Header                                           Sequence UniProt ID AA  \
0  E2RU10  MPPVKAPGNVSDCYFVGRVSLLKWISELLNEPVKKVEDLASGHHYC...     E2RU10  S   
1  A8BPK8  MSHSNAPELHPQIVDPFHNVTYRPGKLLGKGGFAYVYEFHDVNSDS...     A8BPK8  T   
2  A8BPK8  MSHSNAPELHPQIVDPFHNVTYRPGKLLGKGGFAYVYEFHDVNSDS...     A8BPK8  T   
3  E2RTQ7  MPQHLVPHTGTGKRTTIEDFEIGRFLGRGKYGLVYLAREQSSKLVV...     E2RTQ7  T   
4  E2RU97  MAEAFTREDYVFMAQLNENAERYDEMVETMRKISGMEGELSDKERN...     E2RU97  T   

   Position  target  
0     148.0       1  
1     179.0       1  
2     183.0       1  
3     205.0       1  
4     214.0       1  


In [27]:
df_merged.shape

(31349, 6)

In [28]:
# 1. Find all duplicates in the entire row
duplicates = df_merged[df_merged.duplicated()]
print("Duplicate rows based on the entire row:\n", duplicates)

# 2. Drop duplicates from the DataFrame (keeping the first occurrence)
df_merged.drop_duplicates(inplace=True)
print("\nDataFrame after dropping duplicates (entire row match):")
print(df_merged.head())

Duplicate rows based on the entire row:
 Empty DataFrame
Columns: [Header, Sequence, UniProt ID, AA, Position, target]
Index: []

DataFrame after dropping duplicates (entire row match):
   Header                                           Sequence UniProt ID AA  \
0  E2RU10  MPPVKAPGNVSDCYFVGRVSLLKWISELLNEPVKKVEDLASGHHYC...     E2RU10  S   
1  A8BPK8  MSHSNAPELHPQIVDPFHNVTYRPGKLLGKGGFAYVYEFHDVNSDS...     A8BPK8  T   
2  A8BPK8  MSHSNAPELHPQIVDPFHNVTYRPGKLLGKGGFAYVYEFHDVNSDS...     A8BPK8  T   
3  E2RTQ7  MPQHLVPHTGTGKRTTIEDFEIGRFLGRGKYGLVYLAREQSSKLVV...     E2RTQ7  T   
4  E2RU97  MAEAFTREDYVFMAQLNENAERYDEMVETMRKISGMEGELSDKERN...     E2RU97  T   

   Position  target  
0     148.0       1  
1     179.0       1  
2     183.0       1  
3     205.0       1  
4     214.0       1  


In [29]:
df_merged.shape

(31349, 6)

In [30]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31349 entries, 0 to 31349
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Header      31349 non-null  object 
 1   Sequence    31349 non-null  object 
 2   UniProt ID  31349 non-null  object 
 3   AA          31349 non-null  object 
 4   Position    31349 non-null  float64
 5   target      31349 non-null  int32  
dtypes: float64(1), int32(1), object(4)
memory usage: 1.6+ MB


In [31]:
for i in df_merged["Sequence"].unique():
    if len(i) > 5000:
        print("we found 1 here")

we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here
we found 1 here


In [32]:
# 1) Drop sequences with length > 5000
df_merged = df_merged[df_merged["Sequence"].str.len() <= 5000]

# 2) Now df_final contains only sequences of length <= 5000
print(df_merged.shape)

(31073, 6)


In [33]:
import pandas as pd
import random

# --------------------------
# 1) Convert 'Position' to integer
# --------------------------
df_merged["Position"] = df_merged["Position"].astype(int)

# Optional: set a seed for reproducible random sampling
random.seed(42)

# We'll store our final data in a list of DataFrames, then concatenate at the end
df_list = []

# --------------------------
# 2) Group by the sequence ID (assuming 'Header' is unique per sequence)
# --------------------------
for header_value, group in df_merged.groupby("Header"):
    # Extract the amino-acid sequence (assuming one sequence per Header)
    seq = group["Sequence"].iloc[0]
    
    # --------------------------
    # 3) Identify positives & find S/T/Y positions
    # --------------------------
    # (A) Positive positions from the DataFrame
    #     If each row in 'group' is one positive site, we get them all here:
    positive_positions = group["Position"].unique().tolist()  # unique() if duplicates are possible
    
    # (B) Find all S/T/Y positions in the sequence
    #     Enumerate gives (index, amino_acid), we do 1-based by adding +1
    st_y_positions = [i+1 for i, aa in enumerate(seq) if aa in ["S", "T", "Y"]]
    
    # (C) Exclude the positives → negative candidates
    negative_candidates = [pos for pos in st_y_positions if pos not in positive_positions]
    
    # (D) Number of positives for this sequence
    n_pos = len(positive_positions)
    
    # --------------------------
    # 4) Randomly sample negatives
    # --------------------------
    # If there aren't enough negative candidates, you can:
    #  - use them all,
    #  - skip adding negatives, or
    #  - raise an error.
    if len(negative_candidates) >= n_pos:
        sampled_negatives = random.sample(negative_candidates, n_pos)
    else:
        # For example, just use whatever is available (partial negative set)
        sampled_negatives = negative_candidates
    
    # --------------------------
    # 5) Create new rows for negative sites
    # --------------------------
    new_rows = []
    for neg_pos in sampled_negatives:
        new_rows.append({
            "Header": header_value,
            "Sequence": seq,
            "UniProt ID": group["UniProt ID"].iloc[0],  # if needed
            "AA": seq[neg_pos - 1],   # -1 because 'neg_pos' is 1-based, Python string index is 0-based
            "Position": neg_pos,
            "target": 0
        })
    
    # --------------------------
    # 6) Mark positives in the group with target=1
    # --------------------------
    group = group.copy()
    group["target"] = 1
    
    # --------------------------
    # 7) Combine positives & negatives, store in a list
    # --------------------------
    neg_df = pd.DataFrame(new_rows)
    combined_df = pd.concat([group, neg_df], ignore_index=True)
    df_list.append(combined_df)

# --------------------------
# 8) Combine all groups into final DataFrame
# --------------------------
df_final = pd.concat(df_list, ignore_index=True)

# Optionally drop duplicates if needed (e.g., if you suspect overlap)
# df_final.drop_duplicates(subset=["Header", "Position", "target"], inplace=True)

# Now 'df_final' contains:
# - Original positive rows (target=1)
# - Matched number of negative rows (target=0) for each sequence
print(df_final.head())

       Header                                           Sequence  UniProt ID  \
0  A0A0A0HR72  MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...  A0A0A0HR72   
1  A0A0A0HR72  MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...  A0A0A0HR72   
2  A0A0A0HR72  MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...  A0A0A0HR72   
3  A0A0A0HR72  MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...  A0A0A0HR72   
4  A0A0A0HS42  MADGGHPNLNLTPEEKRVFYKLFQAADKTNLGVITGEVAVSFFERS...  A0A0A0HS42   

  AA  Position  target  
0  S        58       1  
1  S        60       1  
2  S      1525       0  
3  Y       302       0  
4  S       256       1  


In [34]:
df_final.head()

Unnamed: 0,Header,Sequence,UniProt ID,AA,Position,target
0,A0A0A0HR72,MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...,A0A0A0HR72,S,58,1
1,A0A0A0HR72,MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...,A0A0A0HR72,S,60,1
2,A0A0A0HR72,MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...,A0A0A0HR72,S,1525,0
3,A0A0A0HR72,MWNDEDNNPYGSFDRHSEAVNDHFHGSPGSTTFDPPSTPQSSASTL...,A0A0A0HR72,Y,302,0
4,A0A0A0HS42,MADGGHPNLNLTPEEKRVFYKLFQAADKTNLGVITGEVAVSFFERS...,A0A0A0HS42,S,256,1


In [35]:
df_final.shape

(62120, 6)

In [36]:
# 1) Drop sequences with length > 5000
df_final = df_final[df_final["Sequence"].str.len() <= 5000]

# 2) Now df_final contains only sequences of length <= 5000
print(df_final.shape)

(62120, 6)


In [37]:
df_final.to_csv("raw_merged_with_negative_samples.csv", index=False)

In [38]:
df_final.columns

Index(['Header', 'Sequence', 'UniProt ID', 'AA', 'Position', 'target'], dtype='object')

In [40]:
def extract_aac(sequence):
    """
    Extract Amino Acid Composition (AAC) from a protein sequence.
    
    Parameters:
    sequence (str): Protein sequence
    
    Returns:
    dict: Dictionary with amino acids as keys and their frequencies as values
    """
    # List of 20 standard amino acids
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                   'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    
    # Initialize dictionary with zeros
    aac = {aa: 0 for aa in amino_acids}
    
    # Count amino acids
    seq_length = len(sequence)
    for aa in sequence:
        if aa in aac:
            aac[aa] += 1
    
    # Convert counts to frequencies
    for aa in aac:
        aac[aa] = aac[aa] / seq_length if seq_length > 0 else 0
        
    return aac


def extract_dpc(sequence):
    """
    Extract Dipeptide Composition (DPC) from a protein sequence.
    
    Parameters:
    sequence (str): Protein sequence
    
    Returns:
    dict: Dictionary with dipeptides as keys and their frequencies as values
    """
    # List of 20 standard amino acids
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                   'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    
    # Initialize dictionary with all possible dipeptides
    dpc = {}
    for aa1 in amino_acids:
        for aa2 in amino_acids:
            dpc[aa1 + aa2] = 0
    
    # Count dipeptides
    if len(sequence) < 2:
        return dpc
    
    for i in range(len(sequence) - 1):
        dipeptide = sequence[i:i+2]
        if dipeptide in dpc:
            dpc[dipeptide] += 1
    
    # Convert counts to frequencies
    total_dipeptides = len(sequence) - 1
    for dipeptide in dpc:
        dpc[dipeptide] = dpc[dipeptide] / total_dipeptides if total_dipeptides > 0 else 0
        
    return dpc

In [43]:
def extract_tpc(sequence):
    """
    Extract Tripeptide Composition (TPC) from a protein sequence.
    
    Parameters:
    sequence (str): Protein sequence
    
    Returns:
    dict: Dictionary with tripeptides as keys and their frequencies as values
    """
    # List of 20 standard amino acids
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                   'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    
    # Initialize dictionary with all possible tripeptides
    tpc = {}
    for aa1 in amino_acids:
        for aa2 in amino_acids:
            for aa3 in amino_acids:
                tpc[aa1 + aa2 + aa3] = 0
    
    # Count tripeptides
    if len(sequence) < 3:
        return tpc
    
    for i in range(len(sequence) - 2):
        tripeptide = sequence[i:i+3]
        if tripeptide in tpc:
            tpc[tripeptide] += 1
    
    # Convert counts to frequencies
    total_tripeptides = len(sequence) - 2
    for tripeptide in tpc:
        tpc[tripeptide] = tpc[tripeptide] / total_tripeptides if total_tripeptides > 0 else 0
        
    return tpc

In [41]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# Function to process TPC features in batches
def process_tpc_in_batches(df, batch_size=500, window_size=5, output_dir="tpc_batches"):
    """
    Process TPC features in batches to avoid memory errors
    
    Parameters:
    df (DataFrame): DataFrame containing sequences and positions
    batch_size (int): Number of samples to process in each batch
    window_size (int): Size of window around phosphorylation site
    output_dir (str): Directory to save batch files
    
    Returns:
    list: Paths to all batch files
    """
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Calculate number of batches
    n_samples = len(df)
    n_batches = (n_samples + batch_size - 1) // batch_size  # Ceiling division
    
    batch_files = []
    
    print(f"Processing {n_samples} samples in {n_batches} batches...")
    
    for batch_idx in range(n_batches):
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, n_samples)
        
        print(f"Processing batch {batch_idx+1}/{n_batches} (samples {start_idx}-{end_idx})")
        
        # Get batch data
        batch_df = df.iloc[start_idx:end_idx].copy()
        
        # Extract windows if not already done
        if 'Window' not in batch_df.columns:
            tqdm.pandas(desc="Extracting windows")
            batch_df['Window'] = batch_df.progress_apply(
                lambda row: extract_window(row['Sequence'], row['Position'], window_size=window_size), 
                axis=1
            )
        
        # Process TPC features for this batch
        tpc_batch = []
        for idx, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc="Extracting TPC"):
            window = row['Window']
            tpc_dict = extract_tpc(window)
            # Add identifier columns and target
            tpc_dict['Header'] = row['Header']
            tpc_dict['Position'] = row['Position']
            tpc_dict['target'] = row['target']
            tpc_batch.append(tpc_dict)
        
        # Convert to DataFrame and save this batch
        batch_output_file = os.path.join(output_dir, f"tpc_features_batch_{batch_idx+1}.csv")
        tpc_batch_df = pd.DataFrame(tpc_batch)
        tpc_batch_df.to_csv(batch_output_file, index=False)
        
        # Release memory
        del tpc_batch, tpc_batch_df
        
        # Add file to list of batch files
        batch_files.append(batch_output_file)
        
        print(f"Batch {batch_idx+1} saved to {batch_output_file}")
    
    return batch_files

# Function to combine all batch files
def combine_tpc_batches(batch_files, output_file="phosphorylation_tpc_features_window5.csv"):
    """
    Combine all TPC batch files into a single file
    
    Parameters:
    batch_files (list): List of batch file paths
    output_file (str): Output file path
    """
    print(f"Combining {len(batch_files)} batch files...")
    
    # Use pandas to combine batch files
    combined_df = pd.concat([pd.read_csv(file) for file in tqdm(batch_files, desc="Reading batches")])
    
    # Save combined file
    combined_df.to_csv(output_file, index=False)
    
    print(f"Combined TPC features saved to {output_file}")
    return output_file

In [None]:
from tqdm import tqdm

def extract_window(sequence, position, window_size=5):
    """Extract a window of amino acids around a position"""

    pos_idx = position - 1

    start = max(0, pos_idx - window_size)
    end = min(len(sequence), pos_idx + window_size + 1)

    window = sequence[start:end]
    return window


print("Creating sequence windows...")
tqdm.pandas(desc="Extracting windows")
df_final['Window'] = df_final.progress_apply(
    lambda row: extract_window(row['Sequence'], row['Position'], window_size=5), 
    axis=1
)


print("Extracting AAC features...")
aac_data = []
for index, row in tqdm(df_final.iterrows(), total=len(df_final), desc="AAC features"):
    window = row['Window']
    aac_dict = extract_aac(window)
    # Add identifier columns and class label
    aac_dict['Header'] = row['Header']
    aac_dict['Position'] = row['Position']
    aac_dict['target'] = row['target']
    aac_data.append(aac_dict)


aac_df = pd.DataFrame(aac_data)
aac_df.to_csv("phosphorylation_aac_features_window5.csv", index=False)
print("AAC features saved.")


print("Extracting DPC features...")
dpc_data = []
for index, row in tqdm(df_final.iterrows(), total=len(df_final), desc="DPC features"):
    window = row['Window']
    dpc_dict = extract_dpc(window)

    dpc_dict['Header'] = row['Header']
    dpc_dict['Position'] = row['Position']
    dpc_dict['target'] = row['target']
    dpc_data.append(dpc_dict)


dpc_df = pd.DataFrame(dpc_data)
dpc_df.to_csv("phosphorylation_dpc_features_window5.csv", index=False)
print("DPC features saved.")


print("Extracting TPC features...")
# Execute TPC batch processing
batch_files = process_tpc_in_batches(
    df_final,
    batch_size=2000,  # Adjust based on your available memory
    window_size=5,
    output_dir="tpc_batches"
)

# Optional: combine all batches into a single file
# Note: This might still cause memory issues if the final file is too large
try:
    combined_file = combine_tpc_batches(batch_files)
    print(f"Successfully combined all batches into {combined_file}")
except Exception as e:
    print(f"Error combining batches: {e}")
    print("You can still use the individual batch files for training.")

Creating sequence windows...


Extracting windows: 100%|████████████████████████████████████████████████████| 62120/62120 [00:00<00:00, 126311.49it/s]


Extracting AAC features...


AAC features: 100%|███████████████████████████████████████████████████████████| 62120/62120 [00:02<00:00, 21026.07it/s]


AAC features saved.
Extracting DPC features...


DPC features: 100%|████████████████████████████████████████████████████████████| 62120/62120 [00:09<00:00, 6502.46it/s]


DPC features saved.
Extracting TPC features...
Processing 62120 samples in 32 batches...
Processing batch 1/32 (samples 0-2000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 445.86it/s]


Batch 1 saved to tpc_batches\tpc_features_batch_1.csv
Processing batch 2/32 (samples 2000-4000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 454.46it/s]


Batch 2 saved to tpc_batches\tpc_features_batch_2.csv
Processing batch 3/32 (samples 4000-6000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 482.07it/s]


Batch 3 saved to tpc_batches\tpc_features_batch_3.csv
Processing batch 4/32 (samples 6000-8000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 455.13it/s]


Batch 4 saved to tpc_batches\tpc_features_batch_4.csv
Processing batch 5/32 (samples 8000-10000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:03<00:00, 508.94it/s]


Batch 5 saved to tpc_batches\tpc_features_batch_5.csv
Processing batch 6/32 (samples 10000-12000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 444.40it/s]


Batch 6 saved to tpc_batches\tpc_features_batch_6.csv
Processing batch 7/32 (samples 12000-14000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 489.00it/s]


Batch 7 saved to tpc_batches\tpc_features_batch_7.csv
Processing batch 8/32 (samples 14000-16000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 479.70it/s]


Batch 8 saved to tpc_batches\tpc_features_batch_8.csv
Processing batch 9/32 (samples 16000-18000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 432.11it/s]


Batch 9 saved to tpc_batches\tpc_features_batch_9.csv
Processing batch 10/32 (samples 18000-20000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 474.31it/s]


Batch 10 saved to tpc_batches\tpc_features_batch_10.csv
Processing batch 11/32 (samples 20000-22000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 445.95it/s]


Batch 11 saved to tpc_batches\tpc_features_batch_11.csv
Processing batch 12/32 (samples 22000-24000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:03<00:00, 501.44it/s]


Batch 12 saved to tpc_batches\tpc_features_batch_12.csv
Processing batch 13/32 (samples 24000-26000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 486.94it/s]


Batch 13 saved to tpc_batches\tpc_features_batch_13.csv
Processing batch 14/32 (samples 26000-28000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:03<00:00, 552.74it/s]


Batch 14 saved to tpc_batches\tpc_features_batch_14.csv
Processing batch 15/32 (samples 28000-30000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 443.89it/s]


Batch 15 saved to tpc_batches\tpc_features_batch_15.csv
Processing batch 16/32 (samples 30000-32000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 430.74it/s]


Batch 16 saved to tpc_batches\tpc_features_batch_16.csv
Processing batch 17/32 (samples 32000-34000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 430.78it/s]


Batch 17 saved to tpc_batches\tpc_features_batch_17.csv
Processing batch 18/32 (samples 34000-36000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 428.49it/s]


Batch 18 saved to tpc_batches\tpc_features_batch_18.csv
Processing batch 19/32 (samples 36000-38000)


Extracting TPC: 100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 434.35it/s]


Batch 19 saved to tpc_batches\tpc_features_batch_19.csv
Processing batch 20/32 (samples 38000-40000)


Extracting TPC:  48%|█████████████████████████████▋                                | 958/2000 [00:02<00:02, 422.28it/s]