## CS 155 Project 1 Training Extractor.
Extracts Training Data features into a panda data frame. Also saved as a CSV for reference.


In [53]:
import warnings
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import os

In [54]:
'''
Takes in lab, video, track specification for a training csv file.


returns a panda row with some features extracted from that csv file.
'''
def filepuller(lab,video,track):
    
    # get the specified csv
    file = "train_csvs/"
    if lab == 1:
        file += "lab_"
    else: 
        file += "sim_"
        
    file += str(video)
    file += "_"
    file += str(track)
    file += ".csv"
    
    data = pd.read_csv(file)
    data['dx'] = data['x'].diff()
    data['dy'] = data['y'].diff()
    
    # data = data.drop(labels=0, axis=0) not sure this is safe because we might have 1 row test data 
    # going to set to 0 which may not be best decision- up for debate
    data = data.replace(np. nan,0)
    
    # feature engineering
    displacementX = np.sum(data['dx'])
    displacementY = np.sum(data['dy'])
    displacement = np.sqrt(displacementX**2+displacementY**2)
    
    totalTravelX = np.sum(abs(data['dx']))
    totalTravelY = np.sum(abs(data['dy']))
    totalTravel = np.sqrt(totalTravelX**2 + totalTravelY**2)
    
    timeElapsed = np.sum(data.tail(1)['t'])
    
    meanStepDisplacementX = np.mean(data['dx'])
    meanStepDisplacementY = np.mean(data['dy'])
    meanStepDisplacement = np.sqrt(meanStepDisplacementX**2 + meanStepDisplacementY**2)
    if(pd.isnull(meanStepDisplacementX)):
        print(lab)
        print(video)
        print(track)
    
    sdStepDisplacementX = np.std(data['dx'])
    sdStepDisplacementY = np.std(data['dy'])
    sdStepDisplacement = np.sqrt(sdStepDisplacementX**2 + sdStepDisplacementY**2)
    
    meanStepTravelX = np.mean(abs(data['dx']))
    meanStepTravelY = np.mean(abs(data['dy']))
    meanStepTravel = np.sqrt(meanStepTravelX**2 + meanStepTravelY**2)
    
    sdStepTravelX = np.std(abs(data['dx']))
    sdStepTravelY = np.std(abs(data['dy']))
    sdStepTravel = np.sqrt(sdStepTravelX**2 + sdStepTravelY**2)
    
    # Shapiro Wilk test of Normality on displacement; get p value of test
    # Remember if P is low then the null hypothesis must go! 
    # (Low p => not normal)
    # Expect a normal distribution centered on 0 for Brownian motion.
    # https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
    # https://en.wikipedia.org/wiki/Brownian_motion
    if len(data) >=3:
        shapiroDisplacementX = stats.shapiro(data['dx']).pvalue
        shapiroDisplacementY = stats.shapiro(data['dy']).pvalue
        shapiroDisplacement = stats.shapiro(np.sqrt(data['dx']**2+data['dy']**2)).pvalue
    else:
        # We can't collect Shapiro Wilk P-Value Data.
        # This is a problem. I will just assign these variables 0.05
        # on the cusp of 5% significance for normality vs non normality. perhaps a bad 
        # decision- to discuss.
        shapiroDisplacementX = 0.05
        shapiroDisplacementY = 0.05
        shapiroDisplacement = 0.05

    

    
    row = {
            'displacementX': displacementX, 
            'displacementY': displacementY,
            'displacement': displacement,
            
            'totalTravelX': totalTravelX,
            'totalTravelY': totalTravelY,
            'totalTravel': totalTravel,
        
            'timeElapsed': timeElapsed,
        
            'meanStepDisplacementX': meanStepDisplacementX,
            'meanStepDisplacementY': meanStepDisplacementY,
            'meanStepDisplacement': meanStepDisplacement,
            
            'meanStepTravelX': meanStepTravelX,
            'meanStepTravelY': meanStepTravelY,
            'meanStepTravel': meanStepTravel,
        
            'sdStepTravelX': sdStepTravelX,
            'sdStepTravelY': sdStepTravelY,
            'sdStepTravel': sdStepTravel,
            
            'shapiroDisplacementX': shapiroDisplacementX,
            'shapiroDisplacementY': shapiroDisplacementY,
            'shapiroDisplacement': shapiroDisplacement,
                
            
            'lab': lab,
            'label': np.sum(data.tail(1)['label'])
            }
    
    #return pd.DataFrame.from_dict(row)
    return row

In [59]:
# Getting training data into a csv.
directory = "train_csvs/"
video0 = "0"
trainingData = pd.DataFrame()
rows = pd.DataFrame()
warnings.filterwarnings("ignore")
count = 0

for root, dirs, files in os.walk(directory):  
    for filename in tqdm(sorted(files)):
        parts = filename.split('_')
        lab = (parts[0] == 'lab') * 1.0
        video = parts[1]
        track = parts[2][:-4]
        
        # last row; add to rows for mean evaluation. nb I make the assumption that the last file is 
        # part of the video of the penultimate file. This is valid, looking at the test and train csvs.
        if count == len(files)-1:
            rows = rows.append(filepuller(lab, video, track), ignore_index = True)
        
        # We should collect features on videos. (Track particles' neighbors behavior in same video)
        if video != video0 or count == len(files)-1:
            video0 = video
            meanVideoStepDisplacementX = np.mean(rows['meanStepDisplacementX'])
            meanVideoStepDisplacementY = np.mean(rows['meanStepDisplacementY'])
            meanVideoStepDisplacement = np.mean(rows['meanStepDisplacement'])

            sdVideoMeanStepDisplacementX = np.std(rows['meanStepDisplacementX'])
            sdVideoMeanStepDisplacementY = np.std(rows['meanStepDisplacementY'])
            sdVideoMeanStepDisplacement = np.std(rows['meanStepDisplacement'])
                
            # Maybe we should get standard deviation and mean of standard deviations but I am unsure
            # I have not done this. Could be a thing to add. 

            meanStepTravelX = np.mean(rows['meanStepTravelX'])
            meanStepTravelY = np.mean(rows['meanStepTravelY'])
            meanStepTravel = np.mean(rows['meanStepTravel'])

            sdVideoMeanStepTravelX = np.std(rows['meanStepTravelX'])
            sdVideoMeanStepTravelY = np.std(rows['meanStepTravelY'])
            sdVideoMeanStepTravel = np.std(rows['meanStepTravel'])
                
            
            rows['meanVideoStepDisplacementX'] = meanVideoStepDisplacementX
            rows['meanVideoStepDisplacementY'] = meanVideoStepDisplacementY
            rows['meanVideoStepDisplacement'] = meanVideoStepDisplacement
                
            rows['sdVideoMeanStepDisplacementX'] = sdVideoMeanStepDisplacementX
            rows['sdVideoMeanStepDisplacementY'] = sdVideoMeanStepDisplacementY
            rows['sdVideoMeanStepDisplacement'] = sdVideoMeanStepDisplacement
                
            rows['meanStepTravelX'] = meanStepTravelX
            rows['meanStepTravelY'] = meanStepTravelY
            rows['meanStepTravel'] = meanStepTravel
                
            rows['sdVideoMeanStepTravelX'] = sdVideoMeanStepTravelX
            rows['sdVideoMeanStepTravelY'] = sdVideoMeanStepTravelY
            rows['sdVideoMeanStepTravel'] = sdVideoMeanStepTravel
                
            trainingData = trainingData.append(rows)
                
            if(count != len(files)-1): # Don't create new rows for video if we are over
                rows = pd.DataFrame()
                rows = rows.append(filepuller(lab, video, track), ignore_index = True)
                
        else: 
            rows = rows.append(filepuller(lab, video, track), ignore_index = True)
        
        count += 1
            
        

100%|████████████████████████████████████████████████████████████████████████████████████████████| 16080/16080 [00:33<00:00, 475.55it/s]


In [60]:
trainingData

Unnamed: 0,displacementX,displacementY,displacement,totalTravelX,totalTravelY,totalTravel,timeElapsed,meanStepDisplacementX,meanStepDisplacementY,meanStepDisplacement,...,label,meanVideoStepDisplacementX,meanVideoStepDisplacementY,meanVideoStepDisplacement,sdVideoMeanStepDisplacementX,sdVideoMeanStepDisplacementY,sdVideoMeanStepDisplacement,sdVideoMeanStepTravelX,sdVideoMeanStepTravelY,sdVideoMeanStepTravel
0,-37.353,-15.415,40.408772,40.603,24.497,47.420529,301.0,-0.124096,-0.051213,0.134248,...,0.0,0.017546,-0.295327,0.615731,0.283373,1.072575,0.969082,0.645286,0.949995,1.132996
1,-27.248,-18.540,32.957322,33.082,28.292,43.529944,301.0,-0.090525,-0.061595,0.109493,...,0.0,0.017546,-0.295327,0.615731,0.283373,1.072575,0.969082,0.645286,0.949995,1.132996
2,-29.438,-11.623,31.649486,38.020,26.041,46.083121,301.0,-0.097801,-0.038615,0.105148,...,0.0,0.017546,-0.295327,0.615731,0.283373,1.072575,0.969082,0.645286,0.949995,1.132996
3,-39.837,-14.879,42.524948,46.421,34.359,57.753356,301.0,-0.132349,-0.049432,0.141279,...,0.0,0.017546,-0.295327,0.615731,0.283373,1.072575,0.969082,0.645286,0.949995,1.132996
4,-1.616,0.055,1.616936,17.134,20.303,26.566629,301.0,-0.005369,0.000183,0.005372,...,0.0,0.017546,-0.295327,0.615731,0.283373,1.072575,0.969082,0.645286,0.949995,1.132996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,601.000,22.000,601.402527,767.000,374.000,853.325846,300.0,1.996678,0.073090,1.998015,...,1.0,1.076571,0.957998,1.572443,0.471308,0.492598,0.262624,0.588527,0.399557,0.646518
7,143.000,407.000,431.390774,427.000,529.000,679.830861,300.0,0.475083,1.352159,1.433192,...,1.0,1.076571,0.957998,1.572443,0.471308,0.492598,0.262624,0.588527,0.399557,0.646518
8,69.000,63.000,93.434469,69.000,63.000,93.434469,300.0,1.029851,0.940299,1.394544,...,0.0,1.076571,0.957998,1.572443,0.471308,0.492598,0.262624,0.588527,0.399557,0.646518
9,141.000,138.000,197.294197,141.000,138.000,197.294197,300.0,1.007143,0.985714,1.409244,...,0.0,1.076571,0.957998,1.572443,0.471308,0.492598,0.262624,0.588527,0.399557,0.646518


In [63]:
trainingData.to_csv('trainingData.csv', index=False)

In [64]:
trainingData.isnull().values.any()

False