Let's locate the data we want and then export it to a .csv file.

In [2]:
import os
import csv
import pandas as pd
import numpy as np
#from io import StringIO

In [3]:
# Top-level folder holding the sample data (not referenced again in the cells shown here).
file_path = "Sample_Data"
# The raw .t2t scan this notebook converts. The path is spelled with Windows
# "\\" separators; the "s15-3" folder name is parsed further down to obtain
# the Sample and Scan labels.
data_path = "Sample_Data\\s15-3\\s15-3\\_TC2340ms_resX100um_resY100um.t2t"

Target CSV format:\
Sample,Scan,Location,AC,X,Y,Z,1,2,...,n-1,n\
The input we are looking for is the ".t2t" file.

Here is an example of the opened .t2t file as raw data. We need to remove the first four rows, which are not data, and then reformat the remaining rows.

In [4]:
# Load the entire .t2t file into memory as a single string.
with open(data_path, "r") as t2t_file:
    content = t2t_file.read()

# Handy while exploring the raw text:
#   print(type(content))
#   print(len(content))
#   print(content)

Let's process our input and remove the first few (non-data) rows.

In [5]:
# Break the raw text into one string per row, discarding the 4 leading
# lines, which are not data.
lines = content.strip().split("\n")[4:]
# Debug helpers: print(type(lines)), print(len(lines)), print(lines[0])

# Tokenise every row on commas, giving a list of lists of strings.
# NOTE: this is the slow part of the cell; it could probably be optimised
# by letting a vectorised parser handle the string splitting.
list_of_lists = [row.split(",") for row in lines]

# Parse the string tokens as float32 values in one 2-D NumPy array.
numpy_array = np.asarray(list_of_lists, dtype=np.float32)
# Debug helper: print(numpy_array)

# Wrap the numeric matrix in a DataFrame (default integer column labels).
df = pd.DataFrame(numpy_array)

Here is our dataframe, df. It has 1004 columns: 3 for XYZ followed by the 1001 features.

In [6]:
# Display the parsed frame; the notebook renders a truncated preview of its rows and columns.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,994,995,996,997,998,999,1000,1001,1002,1003
0,-165397.0,618601.0,-605104.0,-0.001,-0.006,-0.013,-0.008,0.028,-0.024,-0.005,...,0.061,0.058,0.034,0.079,0.039,0.057,0.063,0.066,0.060,0.091
1,-165397.0,618601.0,-605104.0,0.001,-0.009,0.003,0.005,0.001,0.002,0.002,...,0.002,-0.009,0.009,0.004,-0.007,0.001,0.003,-0.001,-0.000,0.007
2,-165397.0,618601.0,-184142.0,0.002,-0.007,0.017,-0.002,-0.010,-0.021,-0.004,...,0.023,0.041,0.037,0.022,0.029,0.028,0.046,0.026,0.066,0.008
3,-165397.0,618601.0,-184142.0,0.006,-0.005,0.002,-0.001,-0.001,0.001,0.005,...,0.007,0.001,0.001,0.002,0.003,0.004,0.006,0.000,0.003,0.003
4,-165397.0,618601.0,-439142.0,-0.012,0.016,-0.016,0.004,0.008,-0.017,0.021,...,0.063,0.047,0.072,0.073,0.033,0.056,0.059,0.036,0.058,0.062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21499,-167398.0,787588.0,-607623.0,-0.000,-0.006,-0.002,0.007,0.001,-0.002,-0.001,...,-0.001,-0.009,-0.003,0.001,-0.007,-0.001,0.002,0.004,-0.011,-0.005
21500,-166399.0,787588.0,-607461.0,0.016,-0.005,-0.012,-0.012,0.013,-0.006,0.012,...,0.065,0.094,0.070,0.085,0.052,0.091,0.068,0.035,0.033,0.071
21501,-166399.0,787588.0,-607461.0,-0.004,0.003,0.000,-0.000,0.001,0.003,-0.001,...,0.009,-0.006,-0.011,0.010,-0.001,-0.002,0.002,0.001,-0.010,-0.011
21502,-165399.0,787588.0,-607740.0,-0.015,0.003,-0.015,0.015,0.012,0.011,0.009,...,0.019,0.024,0.014,0.043,0.071,0.061,0.086,0.050,0.056,0.090


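Now we need to add the columns for Sample, Scan, Location, and AC:  
Sample : taken from the folder name in the file path (e.g. `s15-3` is sample 15)  
Scan : taken from the same folder name (e.g. `s15-3` is scan 3)  
Location : 1-based row count  
AC : Counterfeit or Authentic label. TODO: figure this out later; set to 0 for now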

In [8]:
# Extract Sample and Scan from the file path.
# The folder of interest is the third path component from the end
# (e.g. "s15-3"). Separators are normalised first so that both Windows ("\\")
# and POSIX ("/") style paths work: splitting on "\\" alone leaves a
# "/"-separated path as a single component and the [-3] lookup raises
# IndexError.
path = data_path.replace("\\", "/").split("/")[-3]

# The folder name has the form "s<sample>-<scan>", e.g. "s15-3" gives
# sample "15" and scan "3". Both values are kept as strings.
name_parts = path.split("-")
scan = name_parts[1]
sample = name_parts[0][1:]  # drop the leading "s"

#print(scan)
#print(sample)

# Work on a copy so the parsed frame `df` stays untouched and this cell can be
# re-run safely.
df2 = df.copy()

# Add the label columns in front of the measurement columns.
df2.insert(0, 'Sample', sample)
df2.insert(1, 'Scan', scan)
df2.insert(2, 'Location', range(1, len(df) + 1))  # 1-based row counter
df2.insert(3, 'AC', 0)  # placeholder label, always 0 for now

# Name the first three measurement columns 'X', 'Y', 'Z' and number the
# remaining feature columns starting from 1.
n_features = df.shape[1] - 3
df2.columns = (
    ['Sample', 'Scan', 'Location', 'AC']
    + ['X', 'Y', 'Z']
    + list(range(1, n_features + 1))
)

df2.head()

Unnamed: 0,Sample,Scan,Location,AC,X,Y,Z,1,2,3,...,992,993,994,995,996,997,998,999,1000,1001
0,15,3,1,0,-165397.0,618601.0,-605104.0,-0.001,-0.006,-0.013,...,0.061,0.058,0.034,0.079,0.039,0.057,0.063,0.066,0.06,0.091
1,15,3,2,0,-165397.0,618601.0,-605104.0,0.001,-0.009,0.003,...,0.002,-0.009,0.009,0.004,-0.007,0.001,0.003,-0.001,-0.0,0.007
2,15,3,3,0,-165397.0,618601.0,-184142.0,0.002,-0.007,0.017,...,0.023,0.041,0.037,0.022,0.029,0.028,0.046,0.026,0.066,0.008
3,15,3,4,0,-165397.0,618601.0,-184142.0,0.006,-0.005,0.002,...,0.007,0.001,0.001,0.002,0.003,0.004,0.006,0.0,0.003,0.003
4,15,3,5,0,-165397.0,618601.0,-439142.0,-0.012,0.016,-0.016,...,0.063,0.047,0.072,0.073,0.033,0.056,0.059,0.036,0.058,0.062


In [11]:
"""
output_folder = "Outputs"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

df2.to_csv(os.path.join(output_folder, "output.csv"), index=False)
"""
df2.to_csv("output.csv", index=False)