## Location-Based Data Frame: Reshape data frame such that location and mutations are both represented in feature columns

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('amino_acid_genotypes_to_brightness.tsv', sep = '\t', engine = 'python')
df.head()

Unnamed: 0,aaMutations,uniqueBarcodes,medianBrightness,std
0,,3645,3.719212,0.106992
1,SA108D,1,1.30103,
2,SA108D:SN144D:SI186V:SM231T:SL234P,1,1.301031,
3,SA108D:SY141N:SH146L:SE220G,1,1.301189,
4,SA108G,2,3.708478,0.020613


In [3]:
mutants = df['aaMutations']       #split to list of aminoacids
mutants = mutants.str.split(':')
mutants = mutants[1:]
mutants.head()

1                                    [SA108D]
2    [SA108D, SN144D, SI186V, SM231T, SL234P]
3            [SA108D, SY141N, SH146L, SE220G]
4                                    [SA108G]
5            [SA108G, SK138M, SR166C, SL234P]
Name: aaMutations, dtype: object

Separating the 2ond entry to aaMut and position 

In [5]:
pd.Series(mutants[2])

0    SA108D
1    SN144D
2    SI186V
3    SM231T
4    SL234P
dtype: object

In [7]:
mut_separated = pd.Series(mutants[2]).str.extract('(?P<position>^[A-Z]{2}\d+)(?P<mutation>[A-Z]$)')

mut_separated

  if __name__ == '__main__':


Unnamed: 0,position,mutation
0,SA108,D
1,SN144,D
2,SI186,V
3,SM231,T
4,SL234,P


In [8]:
mut_position = []
for i in xrange(1,len(mutants)):
    mut_position.append(pd.Series(mutants[i]).str.extract('(^[A-Z]{2}\d+)'))

  app.launch_new_instance()


In [10]:
df_mut_position = pd.DataFrame(mut_position)
df_mut_position.head()

In [18]:
uniq_mut_pos=np.unique(df_mut_position.values.ravel()) #number of columns 

uniq_mut_pos = uniq_mut_pos[1:] #droping the NaN entry 
len(uniq_mut_pos)

233

In [23]:
df_dummies = pd.get_dummies(df_mut_position, prefix='',prefix_sep='')

In [24]:
df_dummies.head()

Unnamed: 0,SA108,SA152,SA177,SA204,SA224,SA225,SA35,SA85,SC46,SC68,...,SY235,SD208,SD214,SQ202,ST223,ST228,SY198,SY235.1,SH229,SL205
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
np.unique(df_dummies.columns)==uniq_mut_pos

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [34]:
uniq_mut_pos

array(['SA108', 'SA152', 'SA177', 'SA204', 'SA224', 'SA225', 'SA35',
       'SA85', 'SC46', 'SC68', 'SD100', 'SD101', 'SD115', 'SD127', 'SD131',
       'SD153', 'SD17', 'SD171', 'SD178', 'SD188', 'SD19', 'SD195',
       'SD208', 'SD214', 'SD232', 'SD34', 'SD74', 'SD80', 'SE109', 'SE113',
       'SE122', 'SE130', 'SE140', 'SE15', 'SE170', 'SE211', 'SE220',
       'SE233', 'SE3', 'SE30', 'SE32', 'SE4', 'SE88', 'SE93', 'SF112',
       'SF128', 'SF163', 'SF221', 'SF25', 'SF44', 'SF6', 'SF69', 'SF81',
       'SF82', 'SF97', 'SF98', 'SG102', 'SG114', 'SG125', 'SG132', 'SG136',
       'SG158', 'SG172', 'SG18', 'SG187', 'SG189', 'SG2', 'SG22', 'SG226',
       'SG230', 'SG29', 'SG31', 'SG33', 'SG38', 'SG49', 'SG65', 'SG8',
       'SG89', 'SH137', 'SH146', 'SH167', 'SH179', 'SH197', 'SH215',
       'SH229', 'SH23', 'SH75', 'SH79', 'SI12', 'SI121', 'SI126', 'SI134',
       'SI150', 'SI159', 'SI165', 'SI169', 'SI186', 'SI227', 'SI45',
       'SI96', 'SK1', 'SK105', 'SK111', 'SK124', 'SK129', 'SK13

In [38]:
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []
    for t, v in groups.items():
        dcols = frame[v].to_dict(orient="list")

        vs = dcols.values()
        ks = dcols.keys()
        lvs = len(vs)

        for i in range(lvs):
            for j in range(i+1,lvs):
                if vs[i] == vs[j]: 
                    dups.append(ks[i])
                    break

    return dups       

In [40]:
dups = duplicate_columns(df_dummies)
df_dummies = df_dummies.drop(dups, axis=1)



In [41]:
df_dummies

Unnamed: 0,SA108,SA152,SA177,SA204,SA224,SA225,SA35,SA85,SC46,SD100,...,SL234,SM231,SQ202,SV222,SY235,SD208,SQ202.1,ST228,SY198,SY235.1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
drop