## CS 155 Project 1 Testing Extractor.
Extracts features from the testing data into a pandas DataFrame. The result is also saved as a CSV for reference.


In [3]:
import warnings
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import os

directory = "test_csvs/"

In [4]:
def filepuller(lab,video,track):
    """Load one track CSV and summarise it as a single feature row.

    The file read is ``<prefix><video>_<track>.csv`` inside the module-level
    ``directory`` folder, where ``<prefix>`` is ``lab_`` or ``sim_``.

    Parameters
    ----------
    lab : number
        ``1`` selects the ``lab_`` file prefix, any other value selects
        ``sim_``. The value is also copied unchanged into the returned row.
    video, track
        Identifiers inserted into the file name via ``str()``.

    Returns
    -------
    dict
        Keys, in order: ``UID`` (file name without folder and extension),
        ``meanStepDisplacement``, ``meanStepTravel``, ``sdStepDisplacement``,
        ``sdStepTravel``, ``shapiroDisplacement``, ``meanAngle``, ``sdAngle``,
        ``lab``.

    Raises
    ------
    Whatever ``pd.read_csv`` raises if the file is missing or unreadable, and
    ``KeyError`` if the CSV has no ``x`` or ``y`` column.
    """
    # Build the identifier once and derive both the path and the UID from it,
    # so the UID no longer depends on the length of the `directory` string.
    prefix = "lab_" if lab == 1 else "sim_"
    uid = prefix + str(video) + "_" + str(track)
    # os.path.join copes with `directory` given with or without a trailing slash.
    file = os.path.join(directory, uid + ".csv")

    data = pd.read_csv(file)
    # Step-to-step change in position; the first row has no predecessor, so
    # its diff is NaN.
    data['dx'] = data['x'].diff()
    data['dy'] = data['y'].diff()
    # NOTE(review): arctan of the ratio cannot tell opposite directions apart
    # (np.arctan2 could). Kept as-is so the feature stays comparable with
    # features that were already extracted this way - confirm before changing.
    data['angle'] = np.arctan(data['dy']/data['dx'])

    # Dropping the first row is not safe because a test track may have only
    # one row, so every NaN (first-row diffs, 0/0 angles) is set to 0 instead.
    # This may not be the best decision - up for debate.
    data = data.replace(np.nan, 0)

    # feature engineering
    dx = data['dx']
    dy = data['dy']
    absDx = abs(dx)
    absDy = abs(dy)

    # Signed steps: magnitude of the (mean dx, mean dy) vector and the
    # combined spread of the two axes.
    meanStepDisplacement = np.sqrt(np.mean(dx)**2 + np.mean(dy)**2)
    sdStepDisplacement = np.sqrt(np.std(dx)**2 + np.std(dy)**2)

    # Same two statistics on the absolute step sizes.
    meanStepTravel = np.sqrt(np.mean(absDx)**2 + np.mean(absDy)**2)
    sdStepTravel = np.sqrt(np.std(absDx)**2 + np.std(absDy)**2)

    meanAngle = np.mean(data['angle'])
    sdAngle = np.std(data['angle'])

    # Shapiro Wilk test of Normality on displacement; get p value of test
    # Remember if P is low then the null hypothesis must go!
    # (Low p => not normal)
    # Expect a normal distribution centered on 0 for Brownian motion.
    # https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
    # https://en.wikipedia.org/wiki/Brownian_motion
    if len(data) >= 3:
        shapiroDisplacement = stats.shapiro(np.sqrt(dx**2 + dy**2)).pvalue
    else:
        # Fewer than 3 rows: no Shapiro Wilk p-value is computed. Fall back to
        # 0.05, on the cusp of 5% significance for normality vs non
        # normality. Perhaps a bad decision - to discuss.
        shapiroDisplacement = 0.05

    row = {
        'UID': uid,
        'meanStepDisplacement': meanStepDisplacement,
        'meanStepTravel': meanStepTravel,
        'sdStepDisplacement': sdStepDisplacement,
        'sdStepTravel': sdStepTravel,
        'shapiroDisplacement': shapiroDisplacement,
        'meanAngle': meanAngle,
        'sdAngle': sdAngle,
        'lab': lab,
    }

    return row

In [6]:
# Getting testing data into a DataFrame: one row per track CSV, plus
# per-video aggregate columns shared by every track of the same video.
warnings.filterwarnings("ignore")


def _finishVideo(trackRows):
    """Turn one video's filepuller rows into a DataFrame with video-level columns."""
    rows = pd.DataFrame(trackRows)

    # We should collect features on videos. (Track particles' neighbors
    # behavior in same video.) Compute every statistic before adding columns.
    meanVideoStepDisplacement = np.mean(rows['meanStepDisplacement'])
    sdVideoMeanStepDisplacement = np.std(rows['meanStepDisplacement'])
    meanVideoStepTravel = np.mean(rows['meanStepTravel'])
    sdVideoMeanStepTravel = np.std(rows['meanStepTravel'])
    meanVideoAngle = np.mean(rows['meanAngle'])
    # NOTE(review): this takes the spread of the per-track 'sdAngle', while the
    # other sd* video features use the per-track mean column. Kept unchanged
    # so the feature matches what was produced before - confirm the intent.
    sdVideoAngle = np.std(rows['sdAngle'])

    # Scalars are broadcast to every track row of the video.
    rows['meanVideoStepDisplacement'] = meanVideoStepDisplacement
    rows['sdVideoMeanStepDisplacement'] = sdVideoMeanStepDisplacement
    rows['meanVideoStepTravel'] = meanVideoStepTravel
    rows['sdVideoMeanStepTravel'] = sdVideoMeanStepTravel
    rows['meanVideoAngle'] = meanVideoAngle
    rows['sdVideoAngle'] = sdVideoAngle
    return rows


# filepuller always reads from `directory` itself, so only the files directly
# inside it are considered; a missing folder simply yields no files.
_, _, topFiles = next(os.walk(directory), (directory, [], []))
# Ignore anything that is not a CSV (hidden files, checkpoints, ...).
csvFiles = sorted(name for name in topFiles if name.endswith('.csv'))

videoFrames = []  # finished per-video DataFrames, in file order
rows = []         # filepuller rows of the video currently being read
video0 = None     # video id that the entries in `rows` belong to

for filename in tqdm(csvFiles):
    # File names look like <lab|sim>_<video>_<track>.csv
    parts = filename.split('_')
    lab = (parts[0] == 'lab') * 1.0
    video = parts[1]
    track = parts[2][:-4]

    # A change of video id closes the current group. Sorted file order keeps
    # the tracks of one video next to each other.
    if rows and video != video0:
        videoFrames.append(_finishVideo(rows))
        rows = []

    video0 = video
    rows.append(filepuller(lab, video, track))

# Close the last video. No assumption about which video the final file
# belongs to is needed any more.
if rows:
    videoFrames.append(_finishVideo(rows))

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# The per-video row index (0..k-1) is kept, as before.
testingData = pd.concat(videoFrames) if videoFrames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 477/477 [00:01<00:00, 451.03it/s]


In [7]:
# Display the assembled feature table as the cell output.
testingData

Unnamed: 0,UID,meanStepDisplacement,meanStepTravel,sdStepDisplacement,sdStepTravel,shapiroDisplacement,meanAngle,sdAngle,lab,meanVideoStepDisplacement,sdVideoMeanStepDisplacement,meanVideoStepTravel,sdVideoMeanStepTravel,meanVideoAngle,sdVideoAngle
0,lab_19_0,0.043319,0.043319,0.317590,0.317590,8.837445e-31,-0.022814,0.167257,1.0,5.405276,4.605481,6.383753,4.231420,-0.005333,0.304107
1,lab_19_1,0.345978,2.833508,3.489177,2.065258,2.342739e-13,-0.022981,0.821515,1.0,5.405276,4.605481,6.383753,4.231420,-0.005333,0.304107
2,lab_19_10,0.072763,0.090670,0.085190,0.065808,3.146741e-13,-0.112391,0.754332,1.0,5.405276,4.605481,6.383753,4.231420,-0.005333,0.304107
3,lab_19_11,3.879896,4.372426,3.424395,2.768025,1.554308e-05,-0.402021,0.623850,1.0,5.405276,4.605481,6.383753,4.231420,-0.005333,0.304107
4,lab_19_12,10.767724,11.316639,5.872705,4.729293,3.189151e-04,-0.322240,1.194763,1.0,5.405276,4.605481,6.383753,4.231420,-0.005333,0.304107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,lab_42_5,3.114440,3.114440,0.767410,0.767410,2.805512e-08,-0.730387,0.176876,1.0,3.872715,2.388362,4.466578,2.184554,-0.346272,0.384723
10,lab_42_6,4.212339,4.212339,0.525152,0.525152,7.616786e-17,-0.704784,0.080784,1.0,3.872715,2.388362,4.466578,2.184554,-0.346272,0.384723
11,lab_42_7,3.702231,4.815318,4.745253,3.610630,1.541790e-11,-0.584118,0.701905,1.0,3.872715,2.388362,4.466578,2.184554,-0.346272,0.384723
12,lab_42_8,10.794386,11.390617,5.364682,3.943658,1.865706e-06,-0.253766,0.425931,1.0,3.872715,2.388362,4.466578,2.184554,-0.346272,0.384723


In [8]:
# Save the feature table for reference; index=False leaves the row index out of the file.
testingData.to_csv('testingData2.csv', index=False)

In [9]:
# Sanity check: evaluates to True if any cell of the feature table is missing.
testingData.isnull().values.any()

False