## CS 155 Project 1 Training Extractor.
Extracts training-data features into a pandas DataFrame. The result is also saved as a CSV for reference.


In [3]:
import warnings
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import os

In [6]:
def _quadrature(a, b):
    """Combine two per-axis quantities into one magnitude: sqrt(a**2 + b**2)."""
    return np.sqrt(a**2 + b**2)


def filepuller(lab, video, track):
    """Extract summary features for one track from its training CSV.

    The file read is ``train_csvs/<prefix><video>_<track>.csv`` where the
    prefix is ``lab_`` when ``lab == 1`` and ``sim_`` otherwise.  The CSV must
    contain ``x``, ``y`` and ``label`` columns.

    Args:
        lab: 1 (or 1.0) selects a ``lab_`` file; any other value selects a
            ``sim_`` file.  The value is copied unchanged into the result.
        video: Video identifier as it appears in the file name.
        track: Track identifier as it appears in the file name.

    Returns:
        dict: one feature row with the keys ``meanStepDisplacement``,
        ``meanStepTravel``, ``sdStepDisplacement``, ``sdStepTravel``,
        ``shapiroDisplacement``, ``meanAngle``, ``sdAngle``, ``lab`` and
        ``label`` (the ``label`` value of the last row of the file).
    """
    # get the specified csv
    prefix = "lab_" if lab == 1 else "sim_"
    data = pd.read_csv(f"train_csvs/{prefix}{video}_{track}.csv")

    # Per-step movement along each axis and the step's direction.
    data['dx'] = data['x'].diff()
    data['dy'] = data['y'].diff()
    # NOTE(review): arctan(dy/dx) folds every heading into [-pi/2, pi/2], so
    # opposite directions share an angle; np.arctan2(dy, dx) would keep the
    # quadrant.  Left as is so the features match previously extracted data.
    data['angle'] = np.arctan(data['dy'] / data['dx'])

    # diff() leaves NaN in the first row and 0/0 steps give NaN angles.  The
    # rows are zero-filled rather than dropped so that a one-row file still
    # produces a complete feature row (zero may not be the best choice - up
    # for debate).
    data = data.replace(np.nan, 0)

    # feature engineering
    # "Displacement" keeps the sign of each step; "Travel" uses absolute
    # values.  Per-axis statistics are merged with sqrt(x**2 + y**2).
    meanStepDisplacement = _quadrature(np.mean(data['dx']), np.mean(data['dy']))
    sdStepDisplacement = _quadrature(np.std(data['dx']), np.std(data['dy']))

    absDx = data['dx'].abs()
    absDy = data['dy'].abs()
    meanStepTravel = _quadrature(np.mean(absDx), np.mean(absDy))
    sdStepTravel = _quadrature(np.std(absDx), np.std(absDy))

    meanAngle = np.mean(data['angle'])
    sdAngle = np.std(data['angle'])

    # Shapiro Wilk test of Normality on displacement; get p value of test
    # Remember if P is low then the null hypothesis must go!
    # (Low p => not normal)
    # Expect a normal distribution centered on 0 for Brownian motion.
    # https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
    # https://en.wikipedia.org/wiki/Brownian_motion
    if len(data) >= 3:
        stepLength = _quadrature(data['dx'], data['dy'])
        shapiroDisplacement = stats.shapiro(stepLength).pvalue
    else:
        # Fewer than 3 rows: no Shapiro-Wilk p-value is computed.  Fall back
        # to 0.05, the cusp of 5% significance for normality vs non
        # normality (perhaps a bad decision - to discuss).
        shapiroDisplacement = 0.05

    return {
        'meanStepDisplacement': meanStepDisplacement,
        'meanStepTravel': meanStepTravel,
        'sdStepDisplacement': sdStepDisplacement,
        'sdStepTravel': sdStepTravel,
        'shapiroDisplacement': shapiroDisplacement,
        'meanAngle': meanAngle,
        'sdAngle': sdAngle,
        'lab': lab,
        # np.sum over a one-row tail yields the last label (0 for an empty file).
        'label': np.sum(data['label'].tail(1)),
    }

In [9]:
# Getting training data into a csv.
#
# Every file named <lab|sim>_<video>_<track>.csv directly inside `directory`
# is turned into one feature row by filepuller().  Rows are bucketed by
# (lab, video); each bucket then receives video-level columns that summarise
# all tracks of that video, and the buckets are stacked into `trainingData`.
directory = "train_csvs/"

# filepuller() divides by zero for stationary steps and scipy may warn on
# degenerate Shapiro-Wilk input; silence the noise for the whole run.
warnings.filterwarnings("ignore")

# Only names made of exactly three '_'-separated parts plus ".csv" are track
# files; anything else in the directory is skipped.
trackFiles = sorted(
    name for name in os.listdir(directory)
    if name.endswith(".csv") and len(name[:-len(".csv")].split('_')) == 3
)

# (lab, video) -> list of feature rows.  A dict keeps first-seen order, so
# videos come out in sorted-file order, and keying on `lab` as well keeps a
# lab video and a sim video with the same number apart.
rowsByVideo = {}
for filename in tqdm(trackFiles):
    source, video, track = filename[:-len(".csv")].split('_')
    lab = (source == 'lab') * 1.0
    rowsByVideo.setdefault((lab, video), []).append(filepuller(lab, video, track))

# We should collect features on videos. (Track particles' neighbors behavior in same video)
videoFrames = []
for videoRows in rowsByVideo.values():
    rows = pd.DataFrame(videoRows)

    rows['meanVideoStepDisplacement'] = np.mean(rows['meanStepDisplacement'])
    rows['sdVideoMeanStepDisplacement'] = np.std(rows['meanStepDisplacement'])

    rows['meanVideoStepTravel'] = np.mean(rows['meanStepTravel'])
    rows['sdVideoMeanStepTravel'] = np.std(rows['meanStepTravel'])

    rows['meanVideoAngle'] = np.mean(rows['meanAngle'])
    # NOTE(review): this is the spread of the per-track 'sdAngle' values, not
    # of 'meanAngle' as the displacement/travel pattern above would suggest.
    # Kept as is so the column matches previously extracted data - confirm.
    rows['sdVideoAngle'] = np.std(rows['sdAngle'])

    videoFrames.append(rows)

# One concat instead of repeated DataFrame.append (removed in pandas 2.0).
# Each video keeps its own 0..n-1 index, so the index restarts per video.
trainingData = pd.concat(videoFrames) if videoFrames else pd.DataFrame()
            
        

100%|████████████████████████████████████████████████████████████████████████████████████████████| 16080/16080 [00:50<00:00, 317.32it/s]


In [8]:
# Show the assembled feature table as this cell's output.
trainingData

Unnamed: 0,meanStepDisplacement,meanStepTravel,sdStepDisplacement,sdStepTravel,shapiroDisplacement,meanAngle,sdAngle,lab,label,meanVideoStepDisplacement,sdVideoMeanStepDisplacement,meanVideoStepTravel,sdVideoMeanStepTravel,meanVideoAngle,sdVideoAnle
0,0.134248,0.157543,0.147757,0.122617,1.458767e-17,0.303793,0.690325,1.0,0.0,0.615731,0.969082,0.827764,1.132996,0.189603,0.132144
1,0.109493,0.144618,0.150221,0.116793,7.008054e-08,0.084136,0.846034,1.0,0.0,0.615731,0.969082,0.827764,1.132996,0.189603,0.132144
2,0.105148,0.153100,0.163319,0.119539,2.602816e-11,0.291033,0.717287,1.0,0.0,0.615731,0.969082,0.827764,1.132996,0.189603,0.132144
3,0.141279,0.191872,0.196278,0.147208,6.498721e-13,0.077858,0.763897,1.0,0.0,0.615731,0.969082,0.827764,1.132996,0.189603,0.132144
4,0.005372,0.088261,0.137391,0.105428,4.951283e-21,0.438199,0.699752,1.0,0.0,0.615731,0.969082,0.827764,1.132996,0.189603,0.132144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,1.998015,2.834970,2.852631,2.022997,2.015427e-06,-0.003842,0.672144,0.0,1.0,1.572443,0.262624,2.207785,0.646518,0.390463,0.178406
7,1.433192,2.258574,2.528071,1.828668,7.901024e-06,0.289843,0.949987,0.0,1.0,1.572443,0.262624,2.207785,0.646518,0.390463,0.178406
8,1.394544,1.394544,0.878444,0.878444,3.115130e-06,0.754467,0.510014,0.0,0.0,1.572443,0.262624,2.207785,0.646518,0.390463,0.178406
9,1.409244,1.409244,0.898269,0.898269,2.868702e-08,0.704831,0.473525,0.0,0.0,1.572443,0.262624,2.207785,0.646518,0.390463,0.178406


In [10]:
# Save the feature table for reference; the DataFrame index is not written.
trainingData.to_csv(path_or_buf='trainingData2.csv', index=False)

In [11]:
# Sanity check: evaluates to True if any cell of the feature table is missing.
pd.isnull(trainingData).values.any()

False