In [1]:
import pandas as pd
import numpy as np

# Preprocess FAR data

In [2]:
orig_df = pd.read_csv("../data/FAR_Z11/FAR_scores.csv")
orig_df.head()

Unnamed: 0,id,mutant,performance,activity,mutated_sequence,Score
0,0.0,;,,1.0,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.592529
1,1.0,G410S;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.985107
2,2.0,S283V;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1906.831177
3,3.0,D198Q;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.872253
4,4.0,S283K;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.92804


In [3]:
# Rename some columns
far_df = orig_df.rename(columns={"id": "PID", "activity": "log_fitness", "mutated_sequence":"seq"})

# Convert PID to int
far_df["PID"] = far_df["PID"].astype(int)

# Replace the WT row with the correct fitness value
wt_row = far_df[far_df["mutant"] == ";"]
far_df.loc[wt_row.index, "log_fitness"] = 1
far_df.loc[wt_row.index, "performance"] = "-"
far_df.head()


Unnamed: 0,PID,mutant,performance,log_fitness,seq,Score
0,0,;,-,1.0,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.592529
1,1,G410S;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.985107
2,2,S283V;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1906.831177
3,3,D198Q;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.872253
4,4,S283K;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.92804


In [4]:
## Remove the rows that do not have "+" in the performance column or is nan for the first row
print(far_df.shape)
far_df = far_df[far_df["performance"].str.contains("\+") | far_df["performance"].str.contains("-")]
print(far_df.shape)
far_df.head()

(630, 6)
(424, 6)


Unnamed: 0,PID,mutant,performance,log_fitness,seq,Score
0,0,;,-,1.0,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.592529
1,1,G410S;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.985107
2,2,S283V;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1906.831177
3,3,D198Q;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.872253
4,4,S283K;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.92804


# Add other columns

In [5]:
# Find the number of mutations compared to the first sequence
far_df["n_mut"] = far_df["seq"].apply(lambda x: sum(1 for a, b in zip(far_df["seq"][0], x) if a != b))
far_df.head()

# Get mutated positions
far_df["mutated_position"] = far_df.apply(lambda x: [i for i in range(len(x["seq"])) if x["seq"][i] != far_df["seq"][0][i]], axis=1)
# Convert to string separated by commas
far_df["mutated_position"] = far_df["mutated_position"].apply(lambda x: ",".join(map(str, x)))
far_df.head()

# Assert if the number of mutations from the "mutant"  column is the same as the number of mutations
# calculated from the sequence
far_df["n_mut_2"] = far_df["mutant"].apply(lambda x: len(x.split(";"))-1)
# assert (far_df["n_mut"] == far_df["n_mut_2"]).all()
far_df.head()

Unnamed: 0,PID,mutant,performance,log_fitness,seq,Score,n_mut,mutated_position,n_mut_2
0,0,;,-,1.0,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.592529,0,,1
1,1,G410S;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1902.985107,1,409.0,1
2,2,S283V;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1906.831177,1,282.0,1
3,3,D198Q;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.872253,1,197.0,1
4,4,S283K;,+,1.25,MATQQQQNGASASGVLEQLRGKHVLITGTTGFLGKVVLEKLIRTVP...,-1907.92804,1,282.0,1


In [6]:
# Find row with no muatations and write the seq to a fasta file named "wt.fasta"
wt_seq = far_df[far_df["n_mut"] == 0]["seq"].values[0]
with open("../data/FAR_Z11/wt.fasta", "w") as f:
    f.write(">wt\n")
    f.write(wt_seq)

# Remove rows with no mutations
far_df = far_df[far_df["n_mut"] > 0]

# Write to file
far_df.to_csv("../data/FAR_Z11/data.csv", index=False)