In [1]:
import os
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pickle
from itertools import product
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import LinearRegression, Ridge
import logomaker

#from Bio.Seq import Seq
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
#from seqtools import *

font = {'family' : 'sans-serif',
        'size'   : 16}
mpl.rc('font', **font)
mpl.rc('lines', linewidth=2)
mpl.rcParams['axes.linewidth'] = 2
mpl.rcParams['xtick.major.width'] = 2
mpl.rcParams['ytick.major.width'] = 2

ALL_AAS = ("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")

codon2protein_ = {'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S',
                      'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*', 'TGT': 'C', 'TGC': 'C',
                      'TGA': '*', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
                      'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
                      'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I',
                      'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
                      'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V',
                      'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
                      'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
                      'GGG': 'G'}

def get_protein_seq(seq):
    if seq == '#PARENT#':
        return 'WYLQF'
    else:
        protein_seq = ''
        for i in range(0, len(seq), 3):
            codon = seq[i:i+3]
            protein_seq += codon2protein_[codon]
        return protein_seq

def shorten_seq(seq):
    if seq == '#PARENT#':
        return '#PARENT#'
    else:
        return seq[55*3:57*3] + seq[58*3:60*3] + seq[88*3:89*3]

def variant2codons(variant):    
    seq = 'TGGTACCTGCAGTTC'
    if variant == '#PARENT#':
        return seq
    else:
        dict = {166: 0, 167: 1, 168: 2, 169:3, 170:4, 171:5, 175:6, 176:7, 177:8, 178:9, 179:10, 180:11, 265:12, 266: 13, 267:14}
        split = variant.split('_')
        positions = []
        for mut in split:
            position = int(mut[1:-1])
            if position < 166:
                print('issue')
            elif position > 180 and position < 265:
                print('issue')
            elif position not in dict.keys(): #if it's outside of the range
                pass
            else:
                index = dict[position]
                seq = seq[:index] + mut[-1] + seq[index+1:]
        return seq

        

#check for deletion before 89*3
def check_deletion(variant):
    if variant == '#PARENT#':
        return True
    else:
        split = variant.split('_')
        values = []
        #print(split)
        for mut in split:
            if 'DEL' in mut:
                values.append(int(mut[1:-3]))
        if len(values) == 0:
            #print('no deletion')
            return True
        else:
            min = np.min(values)
            #print('deletion')
            return min > 89*3

def reformat(x):
    if (len(x) == 2):
        x = x[0] + '0' + x[1]
    return x


In [2]:
#cleaned_df = merged[merged['VariantsFound'] == 5]
#manually drop well D06
#cleaned_df = cleaned_df.drop(cleaned_df[cleaned_df['Well'] == 'D06'].index)

def process(cleaned_df, activity_col):
    #cleaned_df = cleaned_df[cleaned_df['WellSeqDepth'] > 10]
    #cleaned_df = cleaned_df[cleaned_df['AlignmentFrequency'] > 0.7]


    #there's a simpler way to normalize to parents on the plate but i forgot how
    # for plate in [1, 2, 3, 4]:
    #     plate_df =  cleaned_df[cleaned_df['Plate'] == plate]
    #     mean = plate_df[plate_df['SimpleCombo'] == 'WYLQF'][activity_col].mean()
    #     norm_std = plate_df[plate_df['SimpleCombo'] == 'WYLQF'][activity_col].std()/mean
    #     #print(norm_std)
    #     #divide rows in cleaned_df by mean for that plate
    #     cleaned_df.loc[cleaned_df['Plate'] == plate, activity_col] = cleaned_df.loc[cleaned_df['Plate'] == plate, activity_col]/mean

    expanded_df = cleaned_df.copy()

    indices = cleaned_df[cleaned_df['Combo'] == 'WYLQF'][~cleaned_df['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])].index
    cleaned_df = cleaned_df.drop(indices)
    #parent_df = cleaned_df[cleaned_df['Combo'] == 'WYLQF'][cleaned_df['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])]
    expanded_df['Control'] = expanded_df['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])
    
    #cleaned_df = cleaned_df[cleaned_df['Plate'] == 4]
    counts_df = cleaned_df.groupby(['Combo']).size().reset_index()
    cleaned_df = cleaned_df.groupby(['Combo']).mean()
    cleaned_df['num_wells'] = counts_df.iloc[:, 1].values
    cleaned_df = cleaned_df.reset_index().drop('Plate', axis=1)
    cleaned_df[activity_col] = cleaned_df[activity_col]
    #cleaned_df['Fitness'] = cleaned_df['Integration ']/cleaned_df['Integration '].max()

    #take all rows with Combo containing *
    stop_df = cleaned_df[cleaned_df['Combo'].str.contains('\*')]
    mean = stop_df[activity_col].mean()
    std = stop_df[activity_col].std()
    print(mean, std)
    print(stop_df[activity_col].max())

    cleaned_df = cleaned_df[~cleaned_df['Combo'].str.contains('\*')]

    cleaned_df = cleaned_df.sort_values(by=[activity_col], ascending=False)
    return stop_df, expanded_df, cleaned_df

### For processing Yueming's output

In [3]:
seq_df2 = pd.read_csv('initial/ParPgb_minION2.csv')
seq_df2['Well'] = seq_df2['FBC']
seq_df2['Plate'] = seq_df2['RBC'].apply(lambda x: int(x[-2:])- 4)
seq_df2 = seq_df2[seq_df2['Plate'] < 5] #first four plates are 89X, not sure what the rest are
seq_df2.dropna(subset=['Variant'], inplace=True)
seq_df2 = seq_df2[seq_df2['Variant'].apply(check_deletion)]
seq_df2['CodonCombo'] = seq_df2['Variant'].apply(variant2codons)

#seq_df2['SimpleCombo_MinION'] = seq_df2['CodonCombo'].apply(get_protein_seq)
seq_df2['Combo'] = seq_df2['CodonCombo'].apply(get_protein_seq)
seq_df2.drop(['Position', 'RBC', 'FBC'], axis=1, inplace=True)
seq_df2


issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue
issue


Unnamed: 0.1,Unnamed: 0,Variant,Alignment Probability,Alignment Count,Row,Column,Well,Plate,CodonCombo,Combo
0,263,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.798821,248.0,A,1,A01,1,ACTAATATGCCTGAG,TNMPE
1,262,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.717116,300.0,A,2,A02,1,ACTAATATGCCTGAG,TNMPE
2,186,G167C_G168T_T169A_A170T_C171G_C175A_T176G_G177...,0.572425,410.0,A,3,A03,1,TCTATGAGTCATTGT,SMSHC
3,190,T166G_G167T_A170G_C171T_C175A_T176G_C178G_A179...,0.476841,297.0,A,4,A04,1,GTGTGTAGGGCGTCG,VCRAS
4,277,T166A_G168T_T169A_A170T_C171T_C175G_G177T_C178...,0.610185,424.0,A,5,A05,1,AGTATTGTTAAGACG,SIVKT
...,...,...,...,...,...,...,...,...,...,...
369,129,T166A_G167C_A170C_C171T_C175G_T176G_G177T_C178...,0.235028,156.0,H,8,H08,4,ACGTCTGGTATGTGT,TSGMC
370,120,T166A_G167T_A170G_C171G_T176G_C178A_A179C_C267...,0.363358,455.0,H,9,H09,4,ATGTGGCGGACGTTT,MWRTF
371,163,G167C_G168T_T169A_A170T_C171G_C175A_T176G_G177...,0.545897,270.0,H,10,H10,4,TCTATGAGTCATACT,SMSHT
372,136,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.783072,208.0,H,11,H11,4,ACTAATATGCCTCAG,TNMPQ


In [4]:
parent_df = seq_df2[seq_df2['Combo'] == 'WYLQF'][seq_df2['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])]
#parent_df.to_csv('parents_sanger.csv', index=False)

  parent_df = seq_df2[seq_df2['Combo'] == 'WYLQF'][seq_df2['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])]


In [5]:
# index = parent_df.index
# #remove these indices from seq_df2
# seq_df2.drop(index, inplace=True)
# seq_df2

### Now process fitness

In [6]:
#seq_df = pd.read_csv('merged_seqs_outer.csv')
seq_df = seq_df2
seq_df

Unnamed: 0.1,Unnamed: 0,Variant,Alignment Probability,Alignment Count,Row,Column,Well,Plate,CodonCombo,Combo
0,263,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.798821,248.0,A,1,A01,1,ACTAATATGCCTGAG,TNMPE
1,262,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.717116,300.0,A,2,A02,1,ACTAATATGCCTGAG,TNMPE
2,186,G167C_G168T_T169A_A170T_C171G_C175A_T176G_G177...,0.572425,410.0,A,3,A03,1,TCTATGAGTCATTGT,SMSHC
3,190,T166G_G167T_A170G_C171T_C175A_T176G_C178G_A179...,0.476841,297.0,A,4,A04,1,GTGTGTAGGGCGTCG,VCRAS
4,277,T166A_G168T_T169A_A170T_C171T_C175G_G177T_C178...,0.610185,424.0,A,5,A05,1,AGTATTGTTAAGACG,SIVKT
...,...,...,...,...,...,...,...,...,...,...
369,129,T166A_G167C_A170C_C171T_C175G_T176G_G177T_C178...,0.235028,156.0,H,8,H08,4,ACGTCTGGTATGTGT,TSGMC
370,120,T166A_G167T_A170G_C171G_T176G_C178A_A179C_C267...,0.363358,455.0,H,9,H09,4,ATGTGGCGGACGTTT,MWRTF
371,163,G167C_G168T_T169A_A170T_C171G_C175A_T176G_G177...,0.545897,270.0,H,10,H10,4,TCTATGAGTCATACT,SMSHT
372,136,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.783072,208.0,H,11,H11,4,ACTAATATGCCTCAG,TNMPQ


In [7]:
df = pd.DataFrame()
for i in range(4):
    fitness_df = pd.read_excel('initial/RL-7-49_Integrations.xlsx', sheet_name=i)
    fitness_df['Plate'] = i + 1
    fitness_df['Well'] = fitness_df['Well'].apply(reformat)
    normalizer = fitness_df[fitness_df['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])]['NormIso1'].mean()
    fitness_df['PlateNormIso1'] = fitness_df['NormIso1']/normalizer
    fitness_df['PlateNormIso2'] = fitness_df['NormIso2']/normalizer
    df = df.append(fitness_df)

#move plate column to the front
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
combined = pd.merge(seq_df, df, on=['Plate', 'Well'], how = 'left')
#rename SimpleCombo_evSeq to SimpleCombo
#merged.rename(columns={'SimpleCombo_evSeq': 'SimpleCombo'}, inplace=True)
combined 

  df = df.append(fitness_df)
  df = df.append(fitness_df)
  df = df.append(fitness_df)
  df = df.append(fitness_df)


Unnamed: 0.1,Unnamed: 0,Variant,Alignment Probability,Alignment Count,Row,Column,Well,Plate,CodonCombo,Combo,PlateNormIso2,StdArea,Iso1Area,Iso2Area,NormIso1,NormIso2,PlateNormIso1
0,263,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.798821,248.0,A,1,A01,1,ACTAATATGCCTGAG,TNMPE,4.949078,501,149.0,3585.0,0.297405,7.155689,0.205694
1,262,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.717116,300.0,A,2,A02,1,ACTAATATGCCTGAG,TNMPE,4.435962,493,224.0,3162.0,0.454361,6.413793,0.314249
2,186,G167C_G168T_T169A_A170T_C171G_C175A_T176G_G177...,0.572425,410.0,A,3,A03,1,TCTATGAGTCATTGT,SMSHC,3.912760,499,588.0,2823.0,1.178357,5.657315,0.814985
3,190,T166G_G167T_A170G_C171T_C175A_T176G_C178G_A179...,0.476841,297.0,A,4,A04,1,GTGTGTAGGGCGTCG,VCRAS,4.714394,501,200.0,3415.0,0.399202,6.816367,0.276099
4,277,T166A_G168T_T169A_A170T_C171T_C175G_G177T_C178...,0.610185,424.0,A,5,A05,1,AGTATTGTTAAGACG,SIVKT,4.623207,504,356.0,3369.0,0.706349,6.684524,0.488531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,129,T166A_G167C_A170C_C171T_C175G_T176G_G177T_C178...,0.235028,156.0,H,8,H08,4,ACGTCTGGTATGTGT,TSGMC,2.943782,514,326.0,2176.0,0.634241,4.233463,0.441026
339,120,T166A_G167T_A170G_C171G_T176G_C178A_A179C_C267...,0.363358,455.0,H,9,H09,4,ATGTGGCGGACGTTT,MWRTF,2.053864,518,640.0,1530.0,1.235521,2.953668,0.859132
340,163,G167C_G168T_T169A_A170T_C171G_C175A_T176G_G177...,0.545897,270.0,H,10,H10,4,TCTATGAGTCATACT,SMSHT,3.390388,515,439.0,2511.0,0.852427,4.875728,0.592744
341,136,T166A_G167C_G168T_T169A_C171T_C175A_A179C_G180...,0.783072,208.0,H,11,H11,4,ACTAATATGCCTCAG,TNMPQ,3.761516,508,217.0,2748.0,0.427165,5.409449,0.297034


In [8]:
combined.to_csv('inital_well2seq.csv', index=False)

In [82]:
stop_df1, expanded_df1, merged_cleaned = process(combined, 'NormIso1')
stop_df2, expanded_df2, cleaned_df2 = process(combined, 'NormIso2')

0.29303384586021164 0.1735153518479471
0.8470824949698189
2.7894086501111777 1.7789757084717752
6.390243902439025


  indices = cleaned_df[cleaned_df['Combo'] == 'WYLQF'][~cleaned_df['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])].index
  cleaned_df = cleaned_df.groupby(['Combo']).mean()
  indices = cleaned_df[cleaned_df['Combo'] == 'WYLQF'][~cleaned_df['Well'].isin(['B04', 'C05', 'D06', 'E07', 'F08', 'G09'])].index
  cleaned_df = cleaned_df.groupby(['Combo']).mean()


In [83]:
merged_cleaned = pd.merge(merged_cleaned.drop('PlateNormIso2', axis=1), cleaned_df2[['Combo', 'PlateNormIso2']])
merged_cleaned['Diff'] = merged_cleaned['PlateNormIso1'] - merged_cleaned['PlateNormIso2']
merged_cleaned = merged_cleaned.sort_values(by=['Diff'], ascending=False)

In [84]:
merged_cleaned = merged_cleaned[['Combo', 'NormIso1', 'NormIso2', 'PlateNormIso1', 'PlateNormIso2', 'Diff']].reset_index(drop=True)
merged_cleaned['round'] = 'initial'
merged_cleaned

Unnamed: 0,Combo,NormIso1,NormIso2,PlateNormIso1,PlateNormIso2,Diff,round
0,FRMNY,7.020661,1.171488,4.586441,0.765307,3.821134,initial
1,SAFRY,7.757700,1.977413,5.067932,1.291800,3.776133,initial
2,GIDLY,4.960080,0.854291,3.449043,0.594040,2.855002,initial
3,TSGMY,6.152918,2.136821,4.019563,1.395937,2.623626,initial
4,TNMPY,3.694215,0.758264,3.260248,0.669190,2.591058,initial
...,...,...,...,...,...,...,...
211,LQSGA,0.582278,5.748945,0.513877,5.073604,-4.559727,initial
212,QYKGD,0.242081,5.511312,0.213644,4.863887,-4.650243,initial
213,PCLTD,0.173913,6.942688,0.120283,4.801761,-4.681477,initial
214,NNVER,0.208251,5.593320,0.183788,4.936261,-4.752473,initial


In [85]:
merged_cleaned.to_csv('fitness_initial.csv', index=False)

In [86]:
inital = merged_cleaned

### Process round 1

In [36]:
df = pd.read_csv('round1/RL-7-65_Elegen1_Integrations.csv')
dfs = []
for plate in df['Replicate'].unique():
    sub_df = df[df['Replicate'] == plate]
    normalizer = sub_df[sub_df['Type'] == 'Parent']['NormIso1'].mean()
    print(normalizer)
    sub_df['PlateNormIso1'] = sub_df['NormIso1'] / normalizer
    sub_df['PlateNormIso2'] = sub_df['NormIso2'] / normalizer
    dfs.append(sub_df)

1.9502449585000001
1.73073007175
2.0723383265


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['PlateNormIso1'] = sub_df['NormIso1'] / normalizer
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['PlateNormIso2'] = sub_df['NormIso2'] / normalizer
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['PlateNormIso1'] = sub_df['NormIso1'] / normalizer
A value is trying to be set on 

In [41]:
round2  = pd.concat(dfs).groupby('Variant', as_index=False).mean()
round2['Diff'] = round2['PlateNormIso1'] - round2['PlateNormIso2']
round2 = round2.sort_values('Diff', ascending=False)
round2 = round2[round2['Variant'] != 'Sterile']
round2 = round2[round2['Variant'] != 'WYLQF'] #drop parent from this round (just use old)
round2
round2.rename(columns={'Variant': 'Combo'}, inplace=True)
round2['round'] = 'round1'

round2 = round2[['Combo', 'NormIso1', 'NormIso2', 'PlateNormIso1', 'PlateNormIso2', 'Diff', 'round']]
round1 = round2
round2

  round2  = pd.concat(dfs).groupby('Variant', as_index=False).mean()


Unnamed: 0,Combo,NormIso1,NormIso2,PlateNormIso1,PlateNormIso2,Diff,round
74,MKFNY,9.432613,0.856375,4.931448,0.446740,4.484708,round1
50,FYFNY,9.008440,1.135131,4.721267,0.595250,4.126016,round1
31,FKFNY,8.780988,1.111163,4.580466,0.578125,4.002340,round1
33,FKTNY,8.959874,1.541843,4.697543,0.808106,3.889436,round1
59,GKFNY,8.463420,1.033273,4.415658,0.538264,3.877394,round1
...,...,...,...,...,...,...,...
47,FTFLY,3.053174,1.948142,1.597961,1.021748,0.576213,round1
25,FIFLY,2.244332,1.722342,1.175614,0.904734,0.270880,round1
20,FFFLY,1.711009,1.487548,0.901995,0.784286,0.117709,round1
17,FCFLY,2.087210,2.290904,1.093142,1.201747,-0.108605,round1


### Now process the second round results

In [42]:
def normalize_plate(sub_df):
    normalizer = sub_df[sub_df['Type'] == 'Parent']['NormIso1'].mean()
    print(normalizer)
    sub_df['PlateNormIso1'] = sub_df['NormIso1'] / normalizer
    sub_df['PlateNormIso2'] = sub_df['NormIso2'] / normalizer
    return sub_df

df1 = pd.read_excel('round2/RL-7-77_Elegen2_Trip1_Data.xls')
df2 = pd.read_excel('round2/RL-7-77_Elegen2_Trip2_Data.xls')
df3 = pd.read_excel('round2/RL-7-77_Elegen2_Trip3_Data.xls')

df1 = normalize_plate(df1)
df2 = normalize_plate(df2)
df3 = normalize_plate(df3)

df = df1[['Well', 'Type', 'Variant']]

#concatenate the normiso1 columns from the three dataframes
for feature in ['NormIso1', 'NormIso2', 'PlateNormIso1', 'PlateNormIso2']:
    df[feature] = pd.concat([df1[feature], df2[feature], df3[feature]], axis=1).mean(axis=1)

round2 = df
round2['Diff'] = round2['PlateNormIso1'] - round2['PlateNormIso2']
round2 = round2.sort_values('Diff', ascending=False)
round2 = round2[round2['Type'] != 'Sterile']
round2 = round2[round2['Type'] != 'Parent']
round2
round2.rename(columns={'Variant': 'Combo'}, inplace=True)
round2['round'] = 'round2'

round2 = round2[['Combo', 'NormIso1', 'NormIso2', 'PlateNormIso1', 'PlateNormIso2', 'Diff', 'round']]
round2

round2

1.7348956060437493
0.9837005214918702
1.6163460949461421


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = pd.concat([df1[feature], df2[feature], df3[feature]], axis=1).mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = pd.concat([df1[feature], df2[feature], df3[feature]], axis=1).mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = pd.concat([df

Unnamed: 0,Combo,NormIso1,NormIso2,PlateNormIso1,PlateNormIso2,Diff,round
57,MPFDY,10.180141,0.805376,7.494145,0.589870,6.904275,round2
74,MGFDY,9.195301,0.542126,6.632599,0.394564,6.238035,round2
11,FKMDY,9.531129,1.417157,7.092939,1.043002,6.049937,round2
25,HKFNY,8.940092,0.819603,6.571582,0.602854,5.968727,round2
44,FKMAY,8.958074,0.899219,6.432142,0.646406,5.785736,round2
...,...,...,...,...,...,...,...
64,MYFAW,2.728117,1.853367,1.975724,1.355883,0.619842,round2
90,MEMDW,3.325736,2.541647,2.408365,1.840167,0.568198,round2
65,MYMDW,2.098371,2.363978,1.466306,1.693301,-0.226994,round2
66,GPFAW,0.218307,1.211705,0.154663,0.866176,-0.711513,round2


### Now put it all together

In [96]:
total = pd.concat([inital, round1, round2])
total

Unnamed: 0,Combo,NormIso1,NormIso2,PlateNormIso1,PlateNormIso2,Diff,round
0,FRMNY,7.020661,1.171488,4.586441,0.765307,3.821134,initial
1,SAFRY,7.757700,1.977413,5.067932,1.291800,3.776133,initial
2,GIDLY,4.960080,0.854291,3.449043,0.594040,2.855002,initial
3,TSGMY,6.152918,2.136821,4.019563,1.395937,2.623626,initial
4,TNMPY,3.694215,0.758264,3.260248,0.669190,2.591058,initial
...,...,...,...,...,...,...,...
64,MYFAW,2.728117,1.853367,1.975724,1.355883,0.619842,round2
90,MEMDW,3.325736,2.541647,2.408365,1.840167,0.568198,round2
65,MYMDW,2.098371,2.363978,1.466306,1.693301,-0.226994,round2
66,GPFAW,0.218307,1.211705,0.154663,0.866176,-0.711513,round2


In [97]:
#from ravi's benchling calibration curve
def normiso1toyield(x):
    return (x - 0.0738)/1.6361 * 10 * 1.5

def normiso2toyield(x):
    return (x - 0.0978)/1.6067 * 10 * 1.5

In [98]:
total['yield1'] = total['NormIso1'].apply(normiso1toyield)
total['yield2'] = total['NormIso2'].apply(normiso2toyield)
total

Unnamed: 0,Combo,NormIso1,NormIso2,PlateNormIso1,PlateNormIso2,Diff,round,yield1,yield2
0,FRMNY,7.020661,1.171488,4.586441,0.765307,3.821134,initial,63.689822,10.023846
1,SAFRY,7.757700,1.977413,5.067932,1.291800,3.776133,initial,70.447102,17.547888
2,GIDLY,4.960080,0.854291,3.449043,0.594040,2.855002,initial,44.798116,7.062533
3,TSGMY,6.152918,2.136821,4.019563,1.395937,2.623626,initial,55.734223,19.036107
4,TNMPY,3.694215,0.758264,3.260248,0.669190,2.591058,initial,33.192484,6.166034
...,...,...,...,...,...,...,...,...,...
64,MYFAW,2.728117,1.853367,1.975724,1.355883,0.619842,round2,24.335155,16.389812
90,MEMDW,3.325736,2.541647,2.408365,1.840167,0.568198,round2,29.814219,22.815524
65,MYMDW,2.098371,2.363978,1.466306,1.693301,-0.226994,round2,18.561559,21.156829
66,GPFAW,0.218307,1.211705,0.154663,0.866176,-0.711513,round2,1.324859,10.399309


In [99]:
total.loc[total['Combo'] == 'WYLQF', 'round'] = 'parent'
#move row to top
#total = pd.concat([total[total['round'] == 'parent'], total[total['round'] != 'parent']])

In [94]:
total.to_csv('fitness_all.csv', index=False)

### some old stuff to consider

In [14]:
old_df = pd.read_csv('/disk1/jyang4/repos/data/Pgb_fitness_unnormalized.csv')
new = pd.merge(old_df.drop(['NormIso1', 'NormIso2', 'Diff'], axis = 1), merged_cleaned[['Combo', 'NormIso1', 'NormIso2', 'Diff']], on=['Combo'], how='left')
new = new.sort_values(by=['Diff'], ascending=False)
new

Unnamed: 0.1,Combo,Unnamed: 0,Alignment Probability,Alignment Count,Column,StdArea,Iso1Area,Iso2Area,num_wells,EVMutation,EVMutation Rank,Triad Score,Triad Rank,NormIso1_recomb,NormIso2_recomb,NormIso1_recomb_rank,NormIso2_recomb_rank,NormIso1,NormIso2,Diff
0,FRMNY,7.000000,0.624151,339.00,2.0,484.000000,3398.000000,567.000000,1,-28.982647,69.0,-648.97722,33,,,,,4.586441,0.765307,3.821134
1,SAFRY,2.000000,0.755508,128.00,8.0,487.000000,3778.000000,963.000000,1,-31.842763,118.0,-646.67160,48,0.835324,1.329356,20.0,70.0,5.067932,1.291800,3.776133
2,GIDLY,165.000000,0.086459,197.00,5.0,501.000000,2485.000000,428.000000,1,-31.299863,112.0,-642.32678,97,0.000000,0.000000,155.5,155.5,3.449043,0.594040,2.855002
3,TSGMY,28.000000,0.244922,154.00,3.0,497.000000,3058.000000,1062.000000,1,-29.967584,90.0,-645.07324,60,,,,,4.019563,1.395937,2.623626
5,TNMPY,331.000000,0.867393,241.00,11.0,484.000000,1788.000000,367.000000,1,-33.636411,146.0,-633.74115,162,0.262932,0.495441,52.0,132.0,3.260248,0.669190,2.591058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,QYKGD,286.000000,0.001800,177.00,4.0,442.000000,107.000000,2436.000000,1,-29.577335,80.0,-641.25164,113,0.390436,4.028886,38.0,2.0,0.213644,4.863887,-4.650243
213,PCLTD,251.000000,0.717479,513.00,1.0,506.000000,88.000000,3513.000000,1,-38.626022,216.0,-629.35627,187,0.075245,2.350554,87.0,19.0,0.120283,4.801761,-4.681477
190,NNVER,306.000000,0.687294,230.00,7.0,509.000000,106.000000,2847.000000,1,-34.952975,176.0,-640.64290,120,,,,,0.183788,4.936261,-4.752473
215,TSGRE,193.000000,0.001915,556.00,2.0,490.000000,107.000000,3496.000000,1,-35.417786,185.0,-639.24294,135,0.093254,1.108917,79.0,83.0,0.151029,4.934558,-4.783529


### Add on the triad scores

In [16]:
def load_input(triad_output_file, WT_combo, num_seqs):
    
    # Load the output file
    with open(triad_output_file) as f:

        # Set some flags for starting analysis
        solutions_started = False
        record_start = False

        # Begin looping over the file
        summary_lines = []
        for line in f:

            # Start looking at data once we hit "solution"
            #if "Solution" in line:
            if "All sequences:" in line:
                solutions_started = True

            # Once we have "Index" we can start recording the rest
            if solutions_started and "Index" in line:
                record_start = True

            # Record appropriate data
            if record_start:

                # Strip the newline and split on whitespace
                summary_line = line.strip().split()

                # If we have hit "Average" stop the whole look
                #if summary_line[0] == "Average":
                #    break

                # Otherwise, append the line
                summary_lines.append(summary_line)

                #if summary_line[0] == '7999':
                if summary_line[0] == str(num_seqs):
                    break

    # Build the dataframe
    all_results = pd.DataFrame(summary_lines[1:], columns = summary_lines[0])
    all_results["Triad Score"] = all_results.Score.astype(float)
    
    wt_chars = WT_combo
    reconstructed_combos = ["".join([char if char != "-" else wt_chars[i] for i, char in enumerate(seq)])
                            for seq in all_results.Seq.values]
    all_results["Combo"] = reconstructed_combos

    # Attach fitness
    #all_results["Fitness"] = all_results.Fitness.values / all_results.Fitness.values.max()
    
    #all_results.loc[all_results['Muts'] == 'WT', 'Fitness'] = 1
    #all_results = all_results[all_results['Fitness'].notna()]
    # Get the order
    all_results["Triad Rank"] = np.arange(1, len(all_results) + 1)
    
    # Split aas into different columns
    #all_results[["AA1", "AA2", "AA3", "AA4"]] = all_results.Combo.apply(lambda x: pd.Series(list(x)))
    
    return all_results

all_df = load_input('ParPgb_5site_triad_fixed.txt', 'WYWVF', 216)

In [17]:
all_df

Unnamed: 0,Index,Tags,Score,Seq,Muts,Triad Score,Combo,Triad Rank
0,0,WT,-666.30091,-----,WT,-666.30091,WYWVF,1
1,1,"A_59L+A_60Q,59",-663.60981,--LQ-,A_59L+A_60Q,-663.60981,WYLQF,2
2,2,"A_59Q+A_60Q,181",-661.23234,--QQ-,A_59Q+A_60Q,-661.23234,WYQQF,3
3,3,"A_56H+A_57L+A_59V+A_60L+A_89W,121",-656.70887,HLVLW,A_56H+A_57L+A_59V+A_60L+A_89W,-656.70887,HLVLW,4
4,4,"A_56S+A_57W+A_60S+A_89W,138",-656.62676,SW-SW,A_56S+A_57W+A_60S+A_89W,-656.62676,SWWSW,5
...,...,...,...,...,...,...,...,...
212,212,"A_56P+A_57P+A_59H+A_60N+A_89G,63",-615.69023,PPHNG,A_56P+A_57P+A_59H+A_60N+A_89G,-615.69023,PPHNG,213
213,213,"A_56P+A_57A+A_59S+A_60L+A_89P,130",-615.60940,PASLP,A_56P+A_57A+A_59S+A_60L+A_89P,-615.60940,PASLP,214
214,214,"A_56T+A_57N+A_59M+A_60P+A_89P,186",-615.12883,TNMPP,A_56T+A_57N+A_59M+A_60P+A_89P,-615.12883,TNMPP,215
215,215,"A_56A+A_57K+A_59P+A_60P+A_89R,164",-614.46968,AKPPR,A_56A+A_57K+A_59P+A_60P+A_89R,-614.46968,AKPPR,216


In [7]:
merged = pd.read_csv('/disk1/jyang4/repos/data/Pgb_fitness.csv')
merged

Unnamed: 0.1,Combo,Unnamed: 0,Alignment Probability,Alignment Count,Column,StdArea,Iso1Area,Iso2Area,NormIso1,num_wells,NormIso2,Diff,EVMutation,EVMutation Rank
0,FRMNY,7.0,0.624151,339.0,2.0,484.0,3398.0,567.0,7.020661,1,1.171488,5.849174,-28.982647,69.0
1,SAFRY,2.0,0.755508,128.0,8.0,487.0,3778.0,963.0,7.757700,1,1.977413,5.780287,-31.842763,118.0
2,GIDLY,165.0,0.086459,197.0,5.0,501.0,2485.0,428.0,4.960080,1,0.854291,4.105788,-31.299863,112.0
3,TSGMY,28.0,0.244922,154.0,3.0,497.0,3058.0,1062.0,6.152918,1,2.136821,4.016097,-29.967584,90.0
4,AKPPY,75.0,0.774387,175.0,7.0,497.0,2692.0,1163.0,5.416499,1,2.340040,3.076459,-35.509484,187.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,VCRAS,190.0,0.476841,297.0,4.0,501.0,200.0,3415.0,0.399202,1,6.816367,-6.417166,-29.500047,78.0
212,HDNMN,173.0,0.155533,277.0,4.0,501.0,148.0,3378.0,0.295409,1,6.742515,-6.447106,-29.217710,72.0
213,PCLTD,251.0,0.717479,513.0,1.0,506.0,88.0,3513.0,0.173913,1,6.942688,-6.768775,-38.626022,216.0
214,ACSEK,60.0,0.655179,483.0,8.0,490.0,243.0,3579.0,0.495918,1,7.304082,-6.808163,-34.614352,172.0


In [8]:
merged_new = merged.merge(all_df[['Combo', 'Triad Score', 'Triad Rank']],on='Combo')
merged_new

Unnamed: 0.1,Combo,Unnamed: 0,Alignment Probability,Alignment Count,Column,StdArea,Iso1Area,Iso2Area,NormIso1,num_wells,NormIso2,Diff,EVMutation,EVMutation Rank,Triad Score,Triad Rank
0,FRMNY,7.0,0.624151,339.0,2.0,484.0,3398.0,567.0,7.020661,1,1.171488,5.849174,-28.982647,69.0,-648.97722,33
1,SAFRY,2.0,0.755508,128.0,8.0,487.0,3778.0,963.0,7.757700,1,1.977413,5.780287,-31.842763,118.0,-646.67160,48
2,GIDLY,165.0,0.086459,197.0,5.0,501.0,2485.0,428.0,4.960080,1,0.854291,4.105788,-31.299863,112.0,-642.32678,97
3,TSGMY,28.0,0.244922,154.0,3.0,497.0,3058.0,1062.0,6.152918,1,2.136821,4.016097,-29.967584,90.0,-645.07324,60
4,AKPPY,75.0,0.774387,175.0,7.0,497.0,2692.0,1163.0,5.416499,1,2.340040,3.076459,-35.509484,187.0,-618.95309,207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,VCRAS,190.0,0.476841,297.0,4.0,501.0,200.0,3415.0,0.399202,1,6.816367,-6.417166,-29.500047,78.0,-641.07042,115
212,HDNMN,173.0,0.155533,277.0,4.0,501.0,148.0,3378.0,0.295409,1,6.742515,-6.447106,-29.217710,72.0,-642.37494,94
213,PCLTD,251.0,0.717479,513.0,1.0,506.0,88.0,3513.0,0.173913,1,6.942688,-6.768775,-38.626022,216.0,-629.35627,187
214,ACSEK,60.0,0.655179,483.0,8.0,490.0,243.0,3579.0,0.495918,1,7.304082,-6.808163,-34.614352,172.0,-639.54535,132


In [9]:
merged_new.to_csv('ParPgb_finess_all.csv', index=None)

In [9]:
# df = pd.read_csv('fitness_all.csv')
# initial = df[df['round'].isin(['initial', 'parent'])]
# initial.to_csv('fitness_round1_training.csv', index=False)