## CS 155 Project 1 Training Extractor.
Extracts Training Data features into a panda data frame. Also saved as a CSV for reference.


In [5]:
import warnings
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import os

In [50]:
'''
Takes in lab, video, track specification for a training csv file.


returns a panda row with some features extracted from that csv file.
'''
def filepuller(lab,video,track):
    
    # get the specified csv
    file = "train_csvs/"
    if lab == 1:
        file += "lab_"
    else: 
        file += "sim_"
        
    file += str(video)
    file += "_"
    file += str(track)
    file += ".csv"
    
    data = pd.read_csv(file)
    data['dx'] = data['x'].diff()
    data['dy'] = data['y'].diff()
    data = data.drop(labels=0, axis=0)
    
    
    # feature engineering
    displacementX = np.sum(data['dx'])
    displacementY = np.sum(data['dy'])
    displacement = np.sqrt(displacementX**2+displacementY**2)
    
    totalTravelX = np.sum(abs(data['dx']))
    totalTravelY = np.sum(abs(data['dy']))
    totalTravel = np.sqrt(totalTravelX**2 + totalTravelY**2)
    
    timeElapsed = np.sum(data.tail(1)['t'])
    
    meanStepDisplacementX = np.mean(data['dx'])
    meanStepDisplacementY = np.mean(data['dy'])
    meanStepDisplacement = np.sqrt(meanStepDisplacementX**2 + meanStepDisplacementY**2)
    
    sdStepDisplacementX = np.std(data['dx'])
    sdStepDisplacementY = np.std(data['dy'])
    sdStepDisplacement = np.sqrt(sdStepDisplacementX**2 + sdStepDisplacementY**2)
    
    meanStepTravelX = np.mean(abs(data['dx']))
    meanStepTravelY = np.mean(abs(data['dy']))
    meanStepTravel = np.sqrt(meanStepTravelX**2 + meanStepTravelY**2)
    
    sdStepTravelX = np.std(abs(data['dx']))
    sdStepTravelY = np.std(abs(data['dy']))
    sdStepTravel = np.sqrt(sdStepTravelX**2 + sdStepTravelY**2)
    
    # Shapiro Wilk test of Normality on displacement; get p value of test
    # Remember if P is low then the null hypothesis must go! 
    # (Low p => not normal)
    # Expect a normal distribution centered on 0 for Brownian motion.
    # https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
    # https://en.wikipedia.org/wiki/Brownian_motion
    if len(data) >=3:
        shapiroDisplacementX = stats.shapiro(data['dx']).pvalue
        shapiroDisplacementY = stats.shapiro(data['dy']).pvalue
        shapiroDisplacement = stats.shapiro(np.sqrt(data['dx']**2+data['dy']**2)).pvalue
    else:
        # We can't collect Shapiro Wilk P-Value Data.
        # This is a problem. I will just assign these variables 0.05
        # on the cusp of 5% significance for normality vs non normality. perhaps a bad 
        # decision- to discuss.
        shapiroDisplacementX = 0.05
        shapiroDisplacementY = 0.05
        shapiroDisplacement = 0.05

    

    
    row = {
            'displacementX': displacementX, 
            'displacementY': displacementY,
            'displacement': displacement,
            
            'totalTravelX': totalTravelX,
            'totalTravelY': totalTravelY,
            'totalTravel': totalTravel,
        
            'timeElapsed': timeElapsed,
        
            'meanStepDisplacementX': meanStepDisplacementX,
            'meanStepDisplacementY': meanStepDisplacementY,
            'meanStepDisplacement': meanStepDisplacement,
            
            'meanStepTravelX': meanStepTravelX,
            'meanStepTravelY': meanStepTravelY,
            'meanStepTravel': meanStepTravel,
        
            'sdStepTravelX': sdStepTravelX,
            'sdStepTravelY': sdStepTravelY,
            'sdStepTravel': sdStepTravel,
            
            'shapiroDisplacementX': shapiroDisplacementX,
            'shapiroDisplacementY': shapiroDisplacementY,
            'shapiroDisplacement': shapiroDisplacement,
                
            
            'lab': lab}
    
    #return pd.DataFrame.from_dict(row)
    return row

In [51]:
# Getting training data into a csv.
directory = "train_csvs/"
video0 = "0"
trainingData = pd.DataFrame()
rows = pd.DataFrame()
warnings.filterwarnings("ignore")

for root, dirs, files in os.walk(directory):  
    for filename in tqdm(sorted(files)):
        parts = filename.split('_')
        lab = (parts[0] == 'lab') * 1.0
        video = parts[1]
        track = parts[2][:-4]
        
        # We should collect features on videos. (Track particles' neighbors behavior in same video)
        if video != video0:
            video0 = video
            meanVideoStepDisplacementX = np.mean(rows['meanStepDisplacementX'])
            meanVideoStepDisplacementY = np.mean(rows['meanStepDisplacementY'])
            meanVideoStepDisplacement = np.mean(rows['meanStepDisplacement'])

            sdVideoMeanStepDisplacementX = np.std(rows['meanStepDisplacementX'])
            sdVideoMeanStepDisplacementY = np.std(rows['meanStepDisplacementY'])
            sdVideoMeanStepDisplacement = np.std(rows['meanStepDisplacement'])
                
            # Maybe we should get standard deviation and mean of standard deviations but I am unsure
            # I have not done this. Could be a thing to add. 

            meanStepTravelX = np.mean(rows['meanStepTravelX'])
            meanStepTravelY = np.mean(rows['meanStepTravelY'])
            meanStepTravel = np.mean(rows['meanStepTravel'])

            sdVideoMeanStepTravelX = np.std(rows['meanStepTravelX'])
            sdVideoMeanStepTravelY = np.std(rows['meanStepTravelY'])
            sdVideoMeanStepTravel = np.std(rows['meanStepTravel'])
                
            
            rows['meanVideoStepDisplacementX'] = meanVideoStepDisplacementX
            rows['meanVideoStepDisplacementY'] = meanVideoStepDisplacementY
            rows['meanVideoStepDisplacement'] = meanVideoStepDisplacement
                
            rows['sdVideoMeanStepDisplacementX'] = sdVideoMeanStepDisplacementX
            rows['sdVideoMeanStepDisplacementY'] = sdVideoMeanStepDisplacementY
            rows['sdVideoMeanStepDisplacement'] = sdVideoMeanStepDisplacement
                
            rows['meanStepTravelX'] = meanStepTravelX
            rows['meanStepTravelY'] = meanStepTravelY
            rows['meanStepTravel'] = meanStepTravel
                
            rows['sdVideoMeanStepTravelX'] = sdVideoMeanStepTravelX
            rows['sdVideoMeanStepTravelY'] = sdVideoMeanStepTravelY
            rows['sdVideoMeanStepTravel'] = sdVideoMeanStepTravel
                
            trainingData = trainingData.append(rows)
                
            rows = pd.DataFrame()
            rows = rows.append(filepuller(lab, video, track), ignore_index = True)
                
        else: 
            rows = rows.append(filepuller(lab, video, track), ignore_index = True)
            
        

100%|████████████████████████████████████████████████████████████████████████████████████████████| 16080/16080 [00:34<00:00, 466.06it/s]


In [52]:
trainingData 

Unnamed: 0,displacementX,displacementY,displacement,totalTravelX,totalTravelY,totalTravel,timeElapsed,meanStepDisplacementX,meanStepDisplacementY,meanStepDisplacement,...,lab,meanVideoStepDisplacementX,meanVideoStepDisplacementY,meanVideoStepDisplacement,sdVideoMeanStepDisplacementX,sdVideoMeanStepDisplacementY,sdVideoMeanStepDisplacement,sdVideoMeanStepTravelX,sdVideoMeanStepTravelY,sdVideoMeanStepTravel
0,-37.353,-15.415,40.408772,40.603,24.497,47.420529,301.0,-0.124510,-0.051383,0.134696,...,1.0,0.018072,-0.297990,0.620333,0.285619,1.081449,0.977457,0.65031,0.958145,1.142472
1,-27.248,-18.540,32.957322,33.082,28.292,43.529944,301.0,-0.090827,-0.061800,0.109858,...,1.0,0.018072,-0.297990,0.620333,0.285619,1.081449,0.977457,0.65031,0.958145,1.142472
2,-29.438,-11.623,31.649486,38.020,26.041,46.083121,301.0,-0.098127,-0.038743,0.105498,...,1.0,0.018072,-0.297990,0.620333,0.285619,1.081449,0.977457,0.65031,0.958145,1.142472
3,-39.837,-14.879,42.524948,46.421,34.359,57.753356,301.0,-0.132790,-0.049597,0.141750,...,1.0,0.018072,-0.297990,0.620333,0.285619,1.081449,0.977457,0.65031,0.958145,1.142472
4,-1.616,0.055,1.616936,17.134,20.303,26.566629,301.0,-0.005387,0.000183,0.005390,...,1.0,0.018072,-0.297990,0.620333,0.285619,1.081449,0.977457,0.65031,0.958145,1.142472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,201.000,37.000,204.377102,557.000,509.000,754.539595,300.0,0.670000,0.123333,0.681257,...,0.0,0.891999,1.019608,1.552108,0.778576,0.596633,0.623197,0.49163,0.552429,0.672781
30,530.000,137.000,547.420314,548.000,311.000,630.099199,181.0,2.928177,0.756906,3.024422,...,0.0,0.891999,1.019608,1.552108,0.778576,0.596633,0.623197,0.49163,0.552429,0.672781
31,2.000,140.000,140.014285,564.000,602.000,824.924239,300.0,0.006667,0.466667,0.466714,...,0.0,0.891999,1.019608,1.552108,0.778576,0.596633,0.623197,0.49163,0.552429,0.672781
32,21.000,24.000,31.890437,21.000,24.000,31.890437,12.0,1.750000,2.000000,2.657536,...,0.0,0.891999,1.019608,1.552108,0.778576,0.596633,0.623197,0.49163,0.552429,0.672781


In [54]:
trainingData.to_csv('trainingFeatures.csv')