## CS 155 Project 1 Testing Extractor.
Extracts testing-data features into a pandas DataFrame, which is also saved as a CSV for reference.


In [10]:
import warnings
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import os

# Folder the per-track test CSVs are read from: the extraction loop walks it
# and filepuller builds each CSV path from it.
directory = "test_csvs/"

In [57]:
def filepuller(lab, video, track):
    """Load one track CSV and summarise it as a single feature row.

    The file read is ``<directory>/<prefix><video>_<track>.csv`` where the
    prefix is ``lab_`` or ``sim_``. The CSV must provide ``x``, ``y`` and
    ``t`` columns.

    Args:
        lab: ``1`` (or ``1.0``) selects the ``lab_`` filename prefix; any other
            value selects ``sim_``. Returned unchanged under the ``'lab'`` key.
        video: Video identifier as it appears in the filename.
        track: Track identifier as it appears in the filename.

    Returns:
        dict: feature name -> value. ``'UID'`` is the filename without the
        directory and the ``.csv`` extension; the remaining keys are the
        displacement / travel / Shapiro-Wilk features computed below.
    """
    # Build the UID from its parts instead of slicing the path at a fixed
    # offset, so it stays correct whatever `directory` is set to.
    prefix = "lab_" if lab == 1 else "sim_"
    uid = prefix + str(video) + "_" + str(track)
    file = os.path.join(directory, uid + ".csv")

    data = pd.read_csv(file)
    data['dx'] = data['x'].diff()
    data['dy'] = data['y'].diff()

    # diff() leaves NaN in the first row. Rows are not dropped because a track
    # may consist of a single row; every NaN in the frame is set to 0 instead
    # (acknowledged by the original author as a debatable choice).
    data = data.replace(np.nan, 0)

    dx = data['dx']
    dy = data['dy']
    absDx = abs(dx)
    absDy = abs(dy)

    # Net displacement: signed sum of the steps per axis, then the norm.
    displacementX = np.sum(dx)
    displacementY = np.sum(dy)
    displacement = np.sqrt(displacementX**2 + displacementY**2)

    # Total travel: sum of absolute steps per axis, then the norm of the sums.
    totalTravelX = np.sum(absDx)
    totalTravelY = np.sum(absDy)
    totalTravel = np.sqrt(totalTravelX**2 + totalTravelY**2)

    # Value of 't' in the last row; summing the one-row tail yields 0 for an
    # empty file instead of raising.
    timeElapsed = np.sum(data.tail(1)['t'])

    meanStepDisplacementX = np.mean(dx)
    meanStepDisplacementY = np.mean(dy)
    meanStepDisplacement = np.sqrt(meanStepDisplacementX**2 + meanStepDisplacementY**2)
    if pd.isnull(meanStepDisplacementX):
        # Diagnostic: identify the track whose mean step could not be computed.
        print(lab)
        print(video)
        print(track)

    # NOTE(review): the original also computed the standard deviation of the
    # signed steps (sdStepDisplacementX/Y and their norm) but never put them in
    # the returned row. The unused computation is dropped here; the emitted
    # feature set is unchanged.

    meanStepTravelX = np.mean(absDx)
    meanStepTravelY = np.mean(absDy)
    meanStepTravel = np.sqrt(meanStepTravelX**2 + meanStepTravelY**2)

    sdStepTravelX = np.std(absDx)
    sdStepTravelY = np.std(absDy)
    sdStepTravel = np.sqrt(sdStepTravelX**2 + sdStepTravelY**2)

    # Shapiro-Wilk test of normality on the steps; keep the p-value.
    # Low p => reject normality. Brownian motion is expected to give normally
    # distributed steps centred on 0.
    # https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
    # https://en.wikipedia.org/wiki/Brownian_motion
    if len(data) >= 3:
        shapiroDisplacementX = stats.shapiro(dx).pvalue
        shapiroDisplacementY = stats.shapiro(dy).pvalue
        shapiroDisplacement = stats.shapiro(np.sqrt(dx**2 + dy**2)).pvalue
    else:
        # Too few rows for the test. Fall back to 0.05, the cusp of 5%
        # significance (flagged by the original author as open to discussion).
        shapiroDisplacementX = 0.05
        shapiroDisplacementY = 0.05
        shapiroDisplacement = 0.05

    row = {
        'UID': uid,

        'displacementX': displacementX,
        'displacementY': displacementY,
        'displacement': displacement,

        'totalTravelX': totalTravelX,
        'totalTravelY': totalTravelY,
        'totalTravel': totalTravel,

        'timeElapsed': timeElapsed,

        'meanStepDisplacementX': meanStepDisplacementX,
        'meanStepDisplacementY': meanStepDisplacementY,
        'meanStepDisplacement': meanStepDisplacement,

        'meanStepTravelX': meanStepTravelX,
        'meanStepTravelY': meanStepTravelY,
        'meanStepTravel': meanStepTravel,

        'sdStepTravelX': sdStepTravelX,
        'sdStepTravelY': sdStepTravelY,
        'sdStepTravel': sdStepTravel,

        'shapiroDisplacementX': shapiroDisplacementX,
        'shapiroDisplacementY': shapiroDisplacementY,
        'shapiroDisplacement': shapiroDisplacement,

        'lab': lab,
    }

    return row

In [65]:
# Build the testing-data feature table: one row per track CSV (via filepuller),
# plus per-video summary statistics copied onto every track of that video.
warnings.filterwarnings("ignore")

# Only files directly inside `directory` are considered, because filepuller
# always opens its CSVs from there. A missing directory yields no files.
_, _, topLevelFiles = next(os.walk(directory), (directory, [], []))
trackFiles = sorted(name for name in topLevelFiles if name.endswith('.csv'))

# Collect the per-track rows grouped by video. Grouping on an explicit key
# removes the need for a hard-coded first video id and for special-casing the
# last file. The lab/sim prefix is part of the key so that a lab video and a
# simulated video sharing a number are not pooled together.
rowsByVideo = {}
for filename in tqdm(trackFiles):
    prefix, video, track = filename[:-4].split('_', 2)
    lab = (prefix == 'lab') * 1.0
    rowsByVideo.setdefault((prefix, video), []).append(filepuller(lab, video, track))

# We should collect features on videos. (Track particles' neighbors behavior in same video)
axes = ('X', 'Y', '')
videoFrames = []
for videoRows in rowsByVideo.values():
    rows = pd.DataFrame(videoRows)

    # Compute every statistic from the per-track values BEFORE assigning any
    # column, because one of the assignments below overwrites its own input.
    meanVideoStepDisplacement = {a: np.mean(rows['meanStepDisplacement' + a]) for a in axes}
    sdVideoMeanStepDisplacement = {a: np.std(rows['meanStepDisplacement' + a]) for a in axes}
    meanVideoStepTravel = {a: np.mean(rows['meanStepTravel' + a]) for a in axes}
    sdVideoMeanStepTravel = {a: np.std(rows['meanStepTravel' + a]) for a in axes}

    # Maybe we should also get the mean and standard deviation of the per-track
    # standard deviations; this has not been done. Could be a thing to add.

    for a in axes:
        rows['meanVideoStepDisplacement' + a] = meanVideoStepDisplacement[a]
    for a in axes:
        rows['sdVideoMeanStepDisplacement' + a] = sdVideoMeanStepDisplacement[a]
    # NOTE(review): this replaces the per-track meanStepTravel* columns with
    # the video-wide mean rather than adding meanVideoStepTravel* columns.
    # It looks unintended, but it is kept so the emitted columns and values
    # stay as before; if it is changed, any other extractor feeding the same
    # model needs the same change.
    for a in axes:
        rows['meanStepTravel' + a] = meanVideoStepTravel[a]
    for a in axes:
        rows['sdVideoMeanStepTravel' + a] = sdVideoMeanStepTravel[a]

    videoFrames.append(rows)

# DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
# Each video's frame keeps its own 0-based index, as before.
testingData = pd.concat(videoFrames) if videoFrames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 477/477 [00:01<00:00, 411.98it/s]


In [59]:
# Display the assembled feature table as the cell output.
testingData

Unnamed: 0,UID,displacementX,displacementY,displacement,totalTravelX,totalTravelY,totalTravel,timeElapsed,meanStepDisplacementX,meanStepDisplacementY,...,lab,meanVideoStepDisplacementX,meanVideoStepDisplacementY,meanVideoStepDisplacement,sdVideoMeanStepDisplacementX,sdVideoMeanStepDisplacementY,sdVideoMeanStepDisplacement,sdVideoMeanStepTravelX,sdVideoMeanStepTravelY,sdVideoMeanStepTravel
0,lab_19_0,3.000,-9.000,9.486833,3.000,9.000,9.486833,286,0.013699,-0.041096,...,1.0,0.078986,-0.117881,5.405276,4.922968,5.115829,4.605481,3.384295,3.400750,4.231420
1,lab_19_1,21.000,102.000,104.139330,662.334,537.334,852.885781,301,0.069767,0.338870,...,1.0,0.078986,-0.117881,5.405276,4.922968,5.115829,4.605481,3.384295,3.400750,4.231420
2,lab_19_10,5.000,-4.000,6.403124,5.000,6.218,7.978943,88,0.056818,-0.045455,...,1.0,0.078986,-0.117881,5.405276,4.922968,5.115829,4.605481,3.384295,3.400750,4.231420
3,lab_19_11,221.000,-95.000,240.553528,221.000,157.000,271.090391,62,3.564516,-1.532258,...,1.0,0.078986,-0.117881,5.405276,4.922968,5.115829,4.605481,3.384295,3.400750,4.231420
4,lab_19_12,45.000,-450.000,452.244403,153.000,450.000,475.298853,44,1.071429,-10.714286,...,1.0,0.078986,-0.117881,5.405276,4.922968,5.115829,4.605481,3.384295,3.400750,4.231420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,lab_42_5,-44.583,43.500,62.288794,44.583,43.500,62.288794,20,-2.229150,2.175000,...,1.0,-2.814509,2.023749,3.872715,2.515684,1.535091,2.388362,2.208094,1.074144,2.184554
10,lab_42_6,-331.125,286.833,438.083252,331.125,286.833,438.083252,104,-3.183894,2.758010,...,1.0,-2.814509,2.023749,3.872715,2.515684,1.535091,2.388362,2.208094,1.074144,2.184554
11,lab_42_7,-706.973,647.793,958.877778,914.389,848.127,1247.167451,300,-2.729625,2.501131,...,1.0,-2.814509,2.023749,3.872715,2.515684,1.535091,2.388362,2.208094,1.074144,2.184554
12,lab_42_8,-1017.141,290.638,1057.849829,1017.141,459.898,1116.280424,166,-10.378990,2.965694,...,1.0,-2.814509,2.023749,3.872715,2.515684,1.535091,2.388362,2.208094,1.074144,2.184554


In [62]:
# Save the feature table to testingData.csv; index=False leaves the DataFrame
# index out of the file.
testingData.to_csv('testingData.csv', index=False)

In [63]:
# Sanity check: evaluates to True if any cell of the feature table is missing.
testingData.isnull().values.any()

False