# Data cleanup

Sometimes MATLAB writes individual entries into the localization files that are not easily parsed (or impossible to parse in ThunderSTORM). For this reason, I need to find a way to clean up the data files.

### Clean up of data

In [1]:
%pylab
import DataSTORM.processors as ds
import pandas as pd
from pathlib import Path

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Resolve the relative location of the localization results to an absolute
# path, then parse the file into a DataFrame with pandas' C parser.
filename = Path('../test-data/MicroTubules_LargeFOV/FOV1_1500_10ms_1_MMStack_locResults.dat')
absolutePath = str(filename.resolve())
with open(absolutePath, 'r') as datFile:
    df = pd.read_csv(datFile, engine='c')

  interactivity=interactivity, compiler=compiler, result=result)


Unfortunately, the data in the uncertainty column was saved such that most of the numbers are floats, but some are strings representing floats and some are in a strange complex exponential form. Let's filter out these rows to make working with the data frame easier and to prototype a cleanup routine.

First, I'll generate a mask to pick out the rows containing strings.

In [3]:
# Boolean mask marking the rows whose uncertainty entry was read in as a
# Python string instead of a float.
# `.values` is used in place of `Series.as_matrix()`: as_matrix() was
# deprecated in pandas 0.23 and removed in pandas 1.0, while `.values`
# returns the same NumPy array on both old and new pandas versions.
stringMask = df['uncertainty [nm]'].map(lambda x: isinstance(x, str)).values

Let's see what the strings look like:

In [None]:
# Display only the uncertainty entries selected by the string mask.
df.loc[stringMask, 'uncertainty [nm]']

Strangely, about 11,000 localizations in this data set were interpreted as strings. Let's cast them to a numeric data type. Some of the strings cannot be recognized by the parser, so we'll convert those to NaNs by using the `errors='coerce'` argument.

In [4]:
# Convert the uncertainty column to a numeric dtype; entries the parser cannot
# interpret become NaN because of errors='coerce'.
uncertaintyColumn = 'uncertainty [nm]'
df[uncertaintyColumn] = pd.to_numeric(df[uncertaintyColumn], errors='coerce')

Finally, we need to replace any Infs with NaNs and then drop the rows containing NaNs. We'll reindex the final result.

In [5]:
# Turn +/-Inf into NaN and then remove every row that contains a NaN.
# The cleaned frame is assigned back to `df`: previously the result of
# dropna() was only passed to describe() and then discarded, so the NaN rows
# silently remained in `df` for all later cells.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Summary statistics of the cleaned data (same table as before the fix, since
# describe() was already being computed on the NaN-free rows).
df.describe()

Unnamed: 0,x [nm],y [nm],z [nm],frame,uncertainty [nm],intensity [photon],offset [photon],loglikelihood,sigma [nm]
count,8667397.0,8667397.0,8667397,8667397.0,8667397.0,8667397.0,8667397.0,8667397.0,8667397.0
mean,34426.251067,34906.420891,0,20088.62312,805529200000.0,3642.044266,344.474718,203.382508,134.408034
std,22145.547453,13027.020305,0,14416.319868,1344216000000000.0,2569.207387,69.663961,824.384742,18.041903
min,7.4708,2.1061,0,100.0,0.44138,1.0,98.01,-29.974,54.0
25%,12235.0,24933.0,0,6720.0,4.9379,2256.8,289.83,96.946,123.7
50%,36195.0,35067.0,0,19463.0,6.6686,3023.2,336.66,124.96,131.56
75%,55460.0,45292.0,0,31084.0,8.4971,4289.3,391.2,174.81,141.41
max,67067.0,67033.0,0,49999.0,3.5605e+18,86565.0,2294.8,454360.0,378.0


In [6]:
# Drop any rows that still contain NaN and renumber the remaining rows 0..N-1.
# `df.reindex()` with no arguments returns a new frame that was thrown away,
# so the original cell had no effect; reset_index(drop=True) assigned back to
# `df` is what actually produces a contiguous index. dropna() is a no-op if
# the NaN rows have already been removed.
df = df.dropna().reset_index(drop=True)
print('Done.')

Unnamed: 0,x [nm],y [nm],z [nm],frame,uncertainty [nm],intensity [photon],offset [photon],loglikelihood,sigma [nm]
0,151.05,19343.0,0,100,8.6886,4111.8,472.33,137.820,170.48
1,367.18,21417.0,0,100,6.6719,3815.7,394.68,146.180,150.83
2,422.42,28225.0,0,100,8.6193,1847.1,388.10,190.480,113.18
3,519.29,15155.0,0,100,10.2410,2570.1,372.53,79.677,162.12
4,590.09,24756.0,0,100,8.2979,2400.6,365.81,92.517,133.72
5,685.17,2734.3,0,100,3.3740,7808.7,379.33,243.630,142.11
6,607.30,7347.8,0,100,6.1314,3491.5,332.58,120.590,140.65
7,701.78,29090.0,0,100,9.5573,1899.1,357.28,135.030,127.62
8,822.21,31915.0,0,100,7.6322,5612.2,354.24,575.820,210.79
9,745.04,56281.0,0,100,7.0961,1887.1,324.88,180.240,104.79


## Normal localization processing
Now that the data is cleaned up a bit, we'll proceed with our normal processing.

In [None]:
# Summary statistics of `df` before any filters are applied.
df.describe()

In [7]:
# Build three ds.Filter processors and apply them one after another to df.
# NOTE(review): presumably each filter keeps the rows for which
# `<column> <operator> <value>` holds -- confirm against DataSTORM.processors.
FilterLLR = ds.Filter('loglikelihood', '<', 400)
FilterSig1 = ds.Filter('sigma [nm]', '>', 100)
FilterSig2 = ds.Filter('sigma [nm]', '<', 180)

# Same order as the nested call FilterSig2(FilterSig1(FilterLLR(df))).
df2 = FilterLLR(df)
df2 = FilterSig1(df2)
df2 = FilterSig2(df2)

In [None]:
# Summary statistics of the filtered localizations in `df2`.
df2.describe()

## Display the 2D histogram to visually identify fiducials

Now we need to make a 2D histogram to see whether the fiducial localizations are apparent.

In [76]:
# Development aid: re-import the already loaded `ds` module so that edits made
# to its source file are picked up without restarting the kernel.
import importlib
importlib.reload(ds)

<module 'DataSTORM.processors' from '/home/douglass/src/DataSTORM/DataSTORM/processors.py'>

In [66]:
# 2D histogram of the filtered localizations on a square pixel grid.
pixelSize = 100 # nm

# Find the extreme x or y coordinates so one set of edges covers both axes.
maxPos = np.max([df2['x [nm]'].max(), df2['y [nm]'].max()])
minPos = np.min([df2['x [nm]'].min(), df2['y [nm]'].min()])

# Explicit, shared bin edges. Passing only a bin *count* makes hist2d stretch
# that many bins over each axis' own [min, max] range, so the bins were
# neither exactly pixelSize wide nor guaranteed to be square.
binStart = np.floor(minPos / pixelSize) * pixelSize
binEdges = np.arange(binStart, maxPos + pixelSize, pixelSize)
numBins  = len(binEdges) - 1

plt.hist2d(df2['x [nm]'], df2['y [nm]'], bins = binEdges)
plt.xlabel('x [nm]')
plt.ylabel('y [nm]')
plt.show()

In [67]:
# Close the current matplotlib figure.
plt.close()

In [77]:
# Settings for the fiducial-based drift corrector, collected in one place and
# passed through as keyword arguments.
# NOTE(review): units and exact meaning of each setting are defined by
# ds.FiducialDriftCorrect and are not visible here; searchRegions appears to
# give the x and y coordinate ranges in which to look for a fiducial -- confirm.
driftCorrectionSettings = dict(
    mergeRadius           = 50,
    offTime               = 1,
    minSegmentLength      = 20,
    minFracFiducialLength = 0.4,
    neighborRadius        = 500,
    smoothingWindowSize   = 625,
    smoothingFilterSize   = 500,
    searchRegions         = {'x' : [(2200, 2800)], 'y' : [(33200, 33700)]},
)
corrector = ds.FiducialDriftCorrect(**driftCorrectionSettings)

In [78]:
# Run the drift corrector on the filtered localizations; the result is kept
# separately as `df3` so that `df2` remains available for comparison.
df3 = corrector(df2)

Frame 49999: 1 trajectories present


In [83]:
# 2D histogram of the drift-corrected localizations on a square pixel grid.
pixelSize = 100 # nm

# Find the extreme x or y coordinates so one set of edges covers both axes.
maxPos = np.max([df3['x [nm]'].max(), df3['y [nm]'].max()])
minPos = np.min([df3['x [nm]'].min(), df3['y [nm]'].min()])

# Explicit, shared bin edges. Passing only a bin *count* makes hist2d stretch
# that many bins over each axis' own [min, max] range, so the bins were
# neither exactly pixelSize wide nor guaranteed to be square.
binStart = np.floor(minPos / pixelSize) * pixelSize
binEdges = np.arange(binStart, maxPos + pixelSize, pixelSize)
numBins  = len(binEdges) - 1

plt.hist2d(df3['x [nm]'], df3['y [nm]'], bins = binEdges)
plt.xlabel('x [nm]')
plt.ylabel('y [nm]')
plt.show()

In [82]:
# Plot the x data of the first fiducial trajectory together with the
# corrector's spline curves, with the trajectory and the per-fiducial spline
# shifted by the trajectory's first x value.
# NOTE(review): presumably avgSpline is the averaged drift curve and
# splines['xS'][0] the fit to this fiducial alone -- confirm in DataSTORM.
fiducial = corrector.fiducialTrajectories[0]

# First x position as a plain scalar. This replaces
# `.iloc[[0]].as_matrix()`: as_matrix() was removed in pandas 1.0, and a
# scalar offset subtracts the same value without relying on broadcasting a
# one-element array against the whole column.
x0 = fiducial['x'].iloc[0]

# Frame numbers at which the per-fiducial spline is evaluated.
frames = np.arange(100,50000)

plt.plot(fiducial['frame'], fiducial['x'] - x0)
plt.plot(corrector.avgSpline.index,corrector.avgSpline['xS'])
plt.plot(frames, corrector.splines['xS'][0](frames) - x0)
plt.show()

In [84]:
# BE SURE TO DROP NULLS FIRST
# Rows containing missing values are removed from df3 in place, and only then
# is the frame written to fullData.csv (without the index column).
df3.dropna(inplace = True)
df3.to_csv('fullData.csv', index = False)

In [None]:
# Keep only the filtered localizations whose frame number is greater than 4000.
isLateFrame = df2['frame'] > 4000
df5 = df2[isLateFrame]

In [None]:
# Write the partial data set without the index column, dropping rows with
# missing values first. The full export drops nulls before writing, but this
# one did not; the dropna on df2 in the following cell runs after df5 has
# already been derived and therefore never reaches this file.
df5.dropna().to_csv('partialData.csv', index = False)

In [None]:
# Remove rows with missing values from df2 in place.
# NOTE(review): this cell sits after df5 was derived from df2 and after
# partialData.csv was written, so it does not change either of them --
# confirm whether it was meant to run before the df5 selection.
df2.dropna(inplace = True)