In [None]:
pip install pandas biopython requests

In [None]:
## Store each FASTA result in a separate file (this cell is not used).

import pandas as pd
import re
import requests
from pathlib import Path

# === Step 1: Read the mutation feature table ===
# Later statements in this cell read its 'Mutation' and 'UniProt ID' columns.
csv_path = 'mutation_matrix_features.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Parse the residue position out of a mutation label formatted like 'L112P'
def extract_position(mutation):
    """Return the residue position encoded in a mutation label.

    Parameters
    ----------
    mutation : str
        Label containing <uppercase letter><digits><uppercase letter>,
        e.g. 'L112P'. The pattern is searched anywhere in the string.

    Returns
    -------
    int or None
        The digits as an integer, or None when the value is not a string
        (for example NaN from an empty CSV cell) or holds no such pattern.
    """
    if not isinstance(mutation, str):
        # re.search raises TypeError on non-strings such as float NaN.
        return None
    match = re.search(r'([A-Z])(\d+)([A-Z])', mutation)
    return int(match.group(2)) if match else None

# Derive the residue position of every row from its mutation label
df['Position'] = df['Mutation'].map(extract_position)

# === Step 2: Fetch FASTA sequences for each unique UniProt ID ===
def fetch_uniprot_fasta(uniprot_id, timeout=30):
    """Download the FASTA record of one UniProt entry.

    Parameters
    ----------
    uniprot_id : str
        Accession inserted into the download URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        The response body on success; None (after printing a message) when
        the request fails or the server answers with an error status.
    """
    # NOTE(review): this is the www.uniprot.org URL form; presumably it still
    # redirects to the current REST endpoint — confirm.
    url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network problems are reported like HTTP errors so that one bad
        # request does not abort the whole download loop.
        response = None
    if response is not None and response.ok:
        return response.text
    print(f" Failed to fetch {uniprot_id}")
    return None

# Output folder for the downloaded records (created on first run)
fasta_dir = Path('fasta_sequences')
fasta_dir.mkdir(exist_ok=True)

# Each distinct accession is downloaded once and written to <accession>.fasta
unique_ids = df['UniProt ID'].unique()

for uid in unique_ids:
    fasta = fetch_uniprot_fasta(uid)
    if not fasta:
        continue  # nothing to write for a failed download
    (fasta_dir / f"{uid}.fasta").write_text(fasta)

print(" FASTA files downloaded.")


In [None]:
## Store the FASTA results of every 1000 rows in one file per chunk (this was just a check that it works well).

import pandas as pd
import re
import requests
from pathlib import Path

# === Step 1: Read the mutation feature table ===
# Later statements in this cell read its 'Mutation' and 'UniProt ID' columns.
csv_path = 'mutation_matrix_features.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Parse the residue position out of a mutation label formatted like 'L112P'
def extract_position(mutation):
    """Return the residue position encoded in a mutation label.

    Parameters
    ----------
    mutation : str
        Label containing <uppercase letter><digits><uppercase letter>,
        e.g. 'L112P'. The pattern is searched anywhere in the string.

    Returns
    -------
    int or None
        The digits as an integer, or None when the value is not a string
        (for example NaN from an empty CSV cell) or holds no such pattern.
    """
    if not isinstance(mutation, str):
        # re.search raises TypeError on non-strings such as float NaN.
        return None
    match = re.search(r'([A-Z])(\d+)([A-Z])', mutation)
    return int(match.group(2)) if match else None

# Derive the residue position of every row from its mutation label
df['Position'] = df['Mutation'].map(extract_position)

# === Step 2: Fetch the FASTA sequence for a UniProt ID ===
def fetch_uniprot_fasta(uniprot_id, timeout=30):
    """Download the FASTA record of one UniProt entry.

    Parameters
    ----------
    uniprot_id : str
        Accession inserted into the download URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        The response body on success; None when the request fails or the
        server answers with an error status.
    """
    url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network problems like HTTP errors so the caller can count
        # the row as failed instead of the whole loop crashing.
        return None
    return response.text if response.ok else None

# Output folder for the per-chunk FASTA files (created on first run)
fasta_dir = Path('fasta_sequences')
fasta_dir.mkdir(exist_ok=True)

# === Step 3: Walk the table in 1000-row chunks, one output file per chunk ===
chunk_size = 1000

for chunk_num, start_index in enumerate(range(0, len(df), chunk_size), start=1):
    # The final slice is simply shorter when fewer than 1000 rows remain
    chunk = df.iloc[start_index:start_index + chunk_size]

    fetched = []     # downloaded records of this chunk, in row order
    failed_rows = 0  # rows whose download returned nothing
    for uniprot_id in chunk['UniProt ID']:
        fasta = fetch_uniprot_fasta(uniprot_id)
        if fasta:
            fetched.append(fasta + "\n")
        else:
            failed_rows += 1
    fasta_content = "".join(fetched)

    # Write a file only when at least one record was downloaded
    if fasta_content:
        with open(fasta_dir / f"fasta_{chunk_num}.fasta", 'w') as f:
            f.write(fasta_content)
        print(f" Chunk {chunk_num}: {len(chunk) - failed_rows}/{len(chunk)} FASTA sequences saved successfully.")
    else:
        print(f" Chunk {chunk_num}: No FASTA sequences were saved (all requests failed).")

print(" FASTA files downloaded and saved in chunks.")

In [None]:
import pandas as pd
import re
import requests
from pathlib import Path

# === Step 1: Read the mutation feature table ===
# Later statements in this cell read its 'Mutation' and 'UniProt ID' columns.
csv_path = 'mutation_matrix_features.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Parse the residue position out of a mutation label formatted like 'L112P'
def extract_position(mutation):
    """Return the residue position encoded in a mutation label.

    Parameters
    ----------
    mutation : str
        Label containing <uppercase letter><digits><uppercase letter>,
        e.g. 'L112P'. The pattern is searched anywhere in the string.

    Returns
    -------
    int or None
        The digits as an integer, or None when the value is not a string
        (for example NaN from an empty CSV cell) or holds no such pattern.
    """
    if not isinstance(mutation, str):
        # re.search raises TypeError on non-strings such as float NaN.
        return None
    match = re.search(r'([A-Z])(\d+)([A-Z])', mutation)
    return int(match.group(2)) if match else None

# Derive the residue position of every row from its mutation label
df['Position'] = df['Mutation'].map(extract_position)

# === Step 2: Fetch the FASTA sequence for a UniProt ID ===
def fetch_uniprot_fasta(uniprot_id, timeout=30):
    """Download the FASTA record of one UniProt entry.

    Parameters
    ----------
    uniprot_id : str
        Accession inserted into the download URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        The response body on success; None when the request fails or the
        server answers with an error status.
    """
    url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network problems like HTTP errors so the caller can count
        # the row as failed instead of the whole loop crashing.
        return None
    return response.text if response.ok else None

# === Step 3: Extract mutated region from FASTA sequence ===
def extract_mutated_region(fasta, mutation, position, region_size=10):
    """
    Return the sequence window around a mutation site with the mutant
    residue substituted in.

    Parameters
    ----------
    fasta : str
        FASTA text. The first line is treated as the header and every
        following line is joined into one sequence string.
    mutation : str
        Label such as 'L112P'; the letter after the digits is the residue
        written into the window.
    position : int
        1-based position of the mutated residue. Integer-valued floats
        (as produced when the column contains NaN elsewhere) are accepted.
    region_size : int, optional
        Number of residues kept on each side of the mutated one.

    Returns
    -------
    str
        Up to ``2 * region_size + 1`` residues; shorter when the site lies
        near either end of the sequence.
    """
    lines = fasta.splitlines()
    sequence = ''.join(lines[1:])  # Join sequence lines together

    # The substituted residue is the letter AFTER the digits. Indexing the
    # label with [1] would pick up the first digit of the position instead.
    parsed = re.search(r'([A-Z])(\d+)([A-Z])', mutation)
    mutant_residue = parsed.group(3) if parsed else mutation.strip()[-1]

    # Convert position to 0-based indexing; int() also accepts 112.0
    index = int(position) - 1

    # Extract a region around the mutation site
    # NOTE(review): positions outside the sequence are not validated here.
    start = max(0, index - region_size)
    end = min(len(sequence), index + region_size + 1)
    window = sequence[start:end]

    # Replace the residue at the mutation site inside the window
    offset = index - start
    return window[:offset] + mutant_residue + window[offset + 1:]

# Output folder for the per-chunk mutated FASTA files (created on first run)
fasta_dir = Path('fasta_sequences')
fasta_dir.mkdir(exist_ok=True)

# === Step 4: Process in chunks of 1000 rows ===
chunk_size = 1000
start_index = 0
chunk_num = 1

# Successful downloads keyed by UniProt ID. Many rows share one protein,
# so each record is requested once; failures are not cached and are
# retried on the next row that needs them.
fasta_cache = {}

while start_index < len(df):
    # Select the next 1000 rows (or the remaining rows if less than 1000)
    chunk = df.iloc[start_index:start_index + chunk_size]

    records = []     # FASTA entries of this chunk, in row order
    failed_rows = 0  # Counter to track failed rows for this chunk
    for _, row in chunk.iterrows():
        uniprot_id = row['UniProt ID']
        mutation = row['Mutation']
        position = row['Position']

        # A row with no parsed position cannot be windowed; count it as
        # failed instead of letting it crash the run.
        if pd.isna(position):
            failed_rows += 1
            continue

        # Fetch full FASTA sequence (from the cache when already downloaded)
        fasta = fasta_cache.get(uniprot_id)
        if fasta is None:
            fasta = fetch_uniprot_fasta(uniprot_id)
            if fasta:
                fasta_cache[uniprot_id] = fasta

        if fasta:
            # int(): the column is float when any row is missing a position
            mutated_sequence = extract_mutated_region(fasta, mutation, int(position))
            records.append(f">{uniprot_id}_{mutation}\n{mutated_sequence}\n")
        else:
            failed_rows += 1  # If fetching fails, increment the counter

    fasta_content = "".join(records)

    # If there are FASTA sequences, save to a file
    if fasta_content:
        with open(fasta_dir / f"fasta_mutated_{chunk_num}.fasta", 'w') as f:
            f.write(fasta_content)
        print(f" Chunk {chunk_num}: {len(chunk) - failed_rows}/{len(chunk)} FASTA mutated sequences saved successfully.")
    else:
        print(f" Chunk {chunk_num}: No FASTA sequences were saved (all requests failed).")

    # Move to the next chunk
    start_index += chunk_size
    chunk_num += 1

print(" Mutated FASTA files downloaded and saved in chunks.")


In [None]:
# 18/5/2025

In [None]:
# Merge the NetSurfP results into one file; that result (the new features) will then be merged
# with the first feature set and the original data.
# During the merge, any mutation row that has no NetSurfP result will get NaN as its value.


## This code merges the NetSurfP files together, but I will find a better way, because each file contains 19000
## rows.
## I WON'T USE THIS, IT'S JUST A TRIAL.
import pandas as pd
from pathlib import Path

# Define the directory containing NetSurfP CSV files
netsurfp_dir = Path("netsurfp_outputs")


def _festa_sort_key(path):
    """Order 'festa<N>.csv' files by N; any other CSV sorts after them by name."""
    suffix = path.stem.replace("festa", "")
    if suffix.isdigit():
        return (0, int(suffix), path.name)
    # A stray CSV in the folder must not abort the sort with ValueError.
    return (1, 0, path.name)


# List and sort CSV files by their festa number
csv_files = sorted(netsurfp_dir.glob("*.csv"), key=_festa_sort_key)

# Prepare list to hold valid DataFrames
dataframes = []

# Process each file safely: an unreadable file is reported and skipped
for file in csv_files:
    try:
        part_df = pd.read_csv(file)
        print(f" Loaded: {file.name} | Rows: {len(part_df)}")
        dataframes.append(part_df)
    except Exception as e:
        print(f" Skipped {file.name} due to error: {e}")

# pd.concat fails with an unhelpful message on an empty list, so state the
# actual problem instead.
if not dataframes:
    raise FileNotFoundError(f"No readable NetSurfP CSV files found in '{netsurfp_dir}'")

# Concatenate all DataFrames
netsurfp_df = pd.concat(dataframes, ignore_index=True)

# Final output
print(f"\n Total merged NetSurfP rows: {len(netsurfp_df)}")
print(netsurfp_df.head())


# Save the merged DataFrame to a CSV file
netsurfp_df.to_csv("merged_netsurfp_results.csv", index=False)
print("Merged result saved to: merged_netsurfp_results.csv")

In [None]:
## This code flattens the NetSurfP result files. I tested it on one file; next I will apply it to all 19 files
## and group the results together.
import pandas as pd
import numpy as np

# Read one NetSurfP result file and trim whitespace around the column names
df_netsurf = pd.read_csv("festa8.csv", sep=",").rename(columns=str.strip)

# Per-residue columns copied from the NetSurfP table into the flattened row
features_per_residue = [
    "rsa", "asa", 
    "p[q3_H]", "p[q3_E]", "p[q3_C]",
    "p[q8_G]", "p[q8_H]", "p[q8_I]", "p[q8_B]", "p[q8_E]", "p[q8_S]", "p[q8_T]", "p[q8_C]",
    "phi", "psi", "disorder"
]

# Offsets relative to the mutated residue: -10..-1 and 1..10.
# Offset 0 (the mutated residue itself) is deliberately left out.
rel_positions = list(range(-10, 0)) + list(range(1, 11))  # 20 positions total

def flatten_window(group):
    """Flatten the per-residue rows of one sequence window into one wide row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one ``id``, with an ``n`` column giving the residue
        order and one column per entry of ``features_per_residue``.

    Returns
    -------
    pandas.Series
        One value per ``<feature>_<offset>`` for every offset in
        ``rel_positions`` (NaN where the window has no residue at that
        offset), followed by ``id``.

    Notes
    -----
    Offsets are counted from the mutated residue, not from the first row.
    Labelling rows by their running index would store the mutated residue
    under offset 1 and shift every residue after it. The index of the
    mutated residue is derived from the position in the id.
    Assumes ids look like '>A0AVI4_R195W' and that each window was cut
    with max(|rel_positions|) residues on each side, clipped at the
    sequence start — TODO confirm against the FASTA generation step.
    """
    import re  # local import: this cell does not import re itself

    # Some pandas versions do not hand the grouping column to apply(); the
    # group key is then only reachable through the `.name` attribute, which
    # must be read before the frame is re-created below.
    group_key = getattr(group, "name", None)
    group = group.sort_values("n").reset_index(drop=True)
    window_id = group.iloc[0]["id"] if "id" in group.columns else group_key

    # Row index of the mutated residue inside the window.
    flank = max(abs(offset) for offset in rel_positions)
    parsed = re.search(r'_[A-Z](\d+)[A-Z]', str(window_id))
    if parsed:
        # Near the sequence start the window has fewer residues on the left.
        center = max(0, min(int(parsed.group(1)) - 1, flank))
    else:
        center = flank  # unparseable id: assume an untruncated left flank

    feature_dict = {}
    for pos in rel_positions:
        row_index = center + pos
        residue = group.iloc[row_index] if 0 <= row_index < len(group) else None
        for feat in features_per_residue:
            # Missing residue: fill with NaN
            feature_dict[f"{feat}_{pos}"] = np.nan if residue is None else residue[feat]

    feature_dict["id"] = window_id
    return pd.Series(feature_dict)

# Build one flattened row per 'id' group and drop the group index
windows_by_id = df_netsurf.groupby("id")
df_flattened = windows_by_id.apply(flatten_window).reset_index(drop=True)

# Write the flattened table next to the notebook
df_flattened.to_csv("festa8_flattened_full.csv", index=False)


In [None]:
# Flatten all netsurfp files, for each mutation.
import pandas as pd
import numpy as np
import os

# Folder containing all netsurfp output CSV files
folder_path = "netsurfp_outputs"

# Features to extract per residue
features_per_residue = [
    "rsa", "asa", 
    "p[q3_H]", "p[q3_E]", "p[q3_C]",
    "p[q8_G]", "p[q8_H]", "p[q8_I]", "p[q8_B]", "p[q8_E]", "p[q8_S]", "p[q8_T]", "p[q8_C]",
    "phi", "psi", "disorder"
]

# Relative positions: -10 to -1 and 1 to 10 (excluding 0, the mutated residue)
rel_positions = list(range(-10, 0)) + list(range(1, 11))

# Flattening function for one group
def flatten_window(group):
    """Flatten the per-residue rows of one sequence window into one wide row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one ``id``, with an ``n`` column giving the residue
        order and one column per entry of ``features_per_residue``.

    Returns
    -------
    pandas.Series
        One value per ``<feature>_<offset>`` for every offset in
        ``rel_positions`` (NaN where the window has no residue at that
        offset), followed by ``id``.

    Notes
    -----
    Offsets are counted from the mutated residue, not from the first row.
    Labelling rows by their running index would store the mutated residue
    under offset 1 and shift every residue after it. The index of the
    mutated residue is derived from the position in the id.
    Assumes ids look like '>A0AVI4_R195W' and that each window was cut
    with max(|rel_positions|) residues on each side, clipped at the
    sequence start — TODO confirm against the FASTA generation step.
    """
    import re  # local import: this cell does not import re itself

    # Some pandas versions do not hand the grouping column to apply(); the
    # group key is then only reachable through the `.name` attribute, which
    # must be read before the frame is re-created below.
    group_key = getattr(group, "name", None)
    group = group.sort_values("n").reset_index(drop=True)
    window_id = group.iloc[0]["id"] if "id" in group.columns else group_key

    # Row index of the mutated residue inside the window.
    flank = max(abs(offset) for offset in rel_positions)
    parsed = re.search(r'_[A-Z](\d+)[A-Z]', str(window_id))
    if parsed:
        # Near the sequence start the window has fewer residues on the left.
        center = max(0, min(int(parsed.group(1)) - 1, flank))
    else:
        center = flank  # unparseable id: assume an untruncated left flank

    feature_dict = {}
    for pos in rel_positions:
        row_index = center + pos
        residue = group.iloc[row_index] if 0 <= row_index < len(group) else None
        for feat in features_per_residue:
            feature_dict[f"{feat}_{pos}"] = np.nan if residue is None else residue[feat]

    feature_dict["id"] = window_id
    return pd.Series(feature_dict)

# Collect all results
all_flattened = []

# Iterate over all CSV files in the folder. os.listdir returns names in
# arbitrary order, so sort them to make the output row order reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()

    # Apply flattening per mutation group
    flattened = (
        df.groupby("id")
        .apply(flatten_window)
        .reset_index(drop=True)
    )
    all_flattened.append(flattened)

# pd.concat fails with an unhelpful message on an empty list, so state the
# actual problem instead.
if not all_flattened:
    raise FileNotFoundError(f"No .csv files found in '{folder_path}'")

# Combine all data
combined_df = pd.concat(all_flattened, ignore_index=True)

# Save the combined flattened output
combined_df.to_csv("all_flattened_netsurf.csv", index=False)

In [None]:
# Reduce the number of columns (the neighbors surrounding the mutated amino acid)
# because of limited memory and the huge file size.
import pandas as pd

# Read the flattened NetSurfP table
df = pd.read_csv("all_flattened_netsurf.csv")

# Keep 'id' plus every column whose trailing '_<number>' suffix lies in [-5, 5]
columns_to_keep = []

for col in df.columns:
    if col == 'id':
        columns_to_keep.append(col)
        continue
    # The text after the last underscore is the relative position
    suffix = col.split('_')[-1]
    try:
        pos = int(suffix)
    except ValueError:
        continue  # not a positional feature column
    if -5 <= pos <= 5:
        columns_to_keep.append(col)

# Restrict the table to the selected columns
filtered_df = df.loc[:, columns_to_keep]

# Write the reduced table to a new file
filtered_df.to_csv("smaller_all_flattened_netsurf.csv", index=False)

# Report what was kept
print(f"Columns kept: {len(filtered_df.columns)}")
print("Columns:")
for i, col in enumerate(filtered_df.columns, 1):
    print(f"{i}. {col}")


In [None]:
# Merging the 2 files of the 2 features based on the uniprot id and the mutation together.
import pandas as pd

# Read the reduced NetSurfP features and the mutation feature table
netsurf_df = pd.read_csv("smaller_all_flattened_netsurf.csv")
mutation_df = pd.read_csv("mutation_matrix_features.csv")

# Split the NetSurfP 'id' (format: >A0AVI4_R195W) into the two merge keys
id_parts = netsurf_df['id'].str.extract(r'>([^_]+)_([A-Z]\d+[A-Z])')
netsurf_df['UniProt ID'] = id_parts[0]
netsurf_df['Mutation'] = id_parts[1]

# Trim stray whitespace from the key columns of both tables
merge_keys = ["UniProt ID", "Mutation"]
for frame in (netsurf_df, mutation_df):
    for key in merge_keys:
        frame[key] = frame[key].str.strip()

# Join on both keys; only rows present in both tables survive.
# NOTE(review): an earlier notebook comment says mutations without NetSurfP
# results should be kept with NaN values — confirm whether an inner join is
# intended here.
merged_df = netsurf_df.merge(mutation_df, on=merge_keys, how="inner")

# Write the merged table
merged_df.to_csv("D:/faraah_my_final_merged_featuress.csv", index=False)

print("Merging complete. Output saved as 'faraah_my_final_merged_featuress.csv'")
