# Clean up the SAXS data
This is an ad-hoc generated notebook to ensure scattering data all conforms to the same format in terms of:

1. q being in Å
2. Files being *bona fide* TSV files (e.g. separated using a tab character, not a comma)
3. Remove comment lines (e.g. lines that start with a `#` character)

Note that there should be no need to re-run this code, but we include it for completeness.

In [5]:
import protfasta

# get all the names for the sequences we're gonna look at
names = list(protfasta.read_fasta('experiment/sequences.fasta').keys())

In [6]:
# one type of fixing - sequences where q is n nanometers

converter = ['a1_lcd', 'alpha_syn', 'ghr_icd', 'anac046', 
             'hst5', 'nurs_red1','eif4f_p150', 'tir_ctd', 'tau_504_758', 'n_fatz1', 'fatz1_delta91', 'bmal1_530_625', 'n_cornid',
             'atcp12', 'mbp','ANAC013_161_274']
for n in converter:
  
    multiplier = 0.1

    # read in old file
    with open(f'experiment/{n}/{n}.dat','r') as fh:
        content = fh.readlines()

    # write out new file
    with open(f'experiment/{n}/{n}_clean.dat','w') as fh:

        # for each line in the original file
        for line in content:        
            sline = line.strip()
            sline = sline.split()            

            # skip any comment lines
            if len(sline) == 0 or sline[0] == '#':
                continue

            # if a line doesn't break down into 3 values
            if len(sline) == 3:
                fh.write(f"{float(sline[0])*multiplier}\t{sline[1]}\t{sline[2]}"+"\n")
            else:
                print(line)    

In [7]:
# Another type of fixing for files with four columns instead of three (we skip 4th column).
converter = ['ebna1_381_455']
for n in converter:
  
    multiplier = 1
    
    with open(f'experiment/{n}/{n}.dat','r') as fh:
        content = fh.readlines()
    
    with open(f'experiment/{n}/{n}_clean.dat','w') as fh:
          
        for line in content:        
            sline = line.strip()
            sline = sline.split()            
            if len(sline) == 0 or sline[0] == '#':
                continue
            if len(sline) == 4:
                try:
                    fh.write(f"{float(sline[0])*multiplier}\t{sline[1]}\t{sline[2]}"+"\n")
                except ValueError as e:
                    print(f'ERROR WITH {n}')
                    raise(e)
                    
            else:
                print(line)    

In [8]:
# a third type of fixing, where we are just removing comment lines but not converting
# nm to angstroms
converter = ['sfafp','prota','sic1','nhE6cmdd','dss1','ash1','pol2_ctd', 
             'serf','hev_pnt3', 'hev_pnt3_yyy_aaa', 'hev_pnt3_200_314',
             'e1a_36_146','syndecan3_ed','syndecan4_ed','ul11','trf2_ntd',
             'laf1_rgg_ysg2max']
for n in converter:
  
    multiplier = 1
    
    with open(f'experiment/{n}/{n}.dat','r') as fh:
        content = fh.readlines()
    
    with open(f'experiment/{n}/{n}_clean.dat','w') as fh:
          
        for line in content:        
            sline = line.strip()
            sline = sline.split()            
            if len(sline) == 0 or sline[0] == '#':
                continue
            if len(sline) == 3:
                try:
                    fh.write(f"{float(sline[0])*multiplier}\t{sline[1]}\t{sline[2]}"+"\n")
                except ValueError as e:
                    print(f'ERROR WITH {n}')
                    raise(e)
                    
            else:
                print(line)    

### HEADER:

#{

#}

### DATA:

#471

