# Data cleaning

These cells clean the `ctrj` CSV files (in particular those generated by the low-memory function) and save the cleaned result as `xtrj` CSV files in the same folder.

In [1]:
import os
import sys

import numpy as np
import pandas as pd

from tqdm import tqdm
from IPython.display import clear_output

import matplotlib as mpl 
import matplotlib.pyplot as plt

sys.path.insert(0, '../../icenumerics/')
sys.path.insert(0, './auxnumerics/')
import icenumerics as ice

import auxiliary as aux
import montecarlo_tools as mc
import chirality_tools as chir
# NOTE(review): presumably the unit registry shared by icenumerics; confirm against the package.
ureg = ice.ureg

%reload_ext autoreload
%autoreload 2

# Short alias for pandas' IndexSlice helper, used to write label slices over MultiIndex levels.
idx = pd.IndexSlice

In [2]:
# Load two sample realizations from one simulation folder, using the first two
# CSV columns as the row MultiIndex.
sim_path = '../data/sims/10'
nctrj, lmctrj = (
    pd.read_csv(os.path.join(sim_path, 'ctrj', f'ctrj{realization}.csv'), index_col=[0, 1])
    for realization in (2, 8)
)

In [3]:
def is_low_memory(df):
    """
        Tell whether the table comes from the low-memory routine.

        The only criterion is the presence of a column named 'mux'.
        Returns True or False.
    """
    return 'mux' in df.columns

def is_regular(df):
    """
        Tell whether the table was produced by the regular (non low-memory) routine.

        The only criterion is the presence of a column named 'type'.
        Returns True or False.
    """
    return 'type' in df.columns

def frames_dict(df):
    """
        New consecutive labels for the 'frame' index level.

        Returns the list [0, 1, ..., n-1], where n is the number of distinct
        values found in the 'frame' level of df's index. Despite the name,
        the result is a list, not a dict.
    """
    distinct_frames = df.index.get_level_values('frame').unique()
    return list(range(len(distinct_frames)))

def id_dict(df):
    """
        New consecutive labels for the 'id' index level.

        Returns the list [0, 1, ..., n-1], where n is the number of distinct
        values found in the 'id' level of df's index. Despite the name,
        the result is a list, not a dict.
    """
    distinct_ids = df.index.get_level_values('id').unique()
    return list(range(len(distinct_ids)))
    

def clean_data(sim_path,realization):
    """
        Clean one realization's `ctrj` file and save it as `xtrj{realization}.csv`.

        Reads `<sim_path>/ctrj/ctrj{realization}.csv`, using its first two
        columns as a ('frame', 'id') MultiIndex, and writes the cleaned table
        into the same folder:

        * low-memory files (see `is_low_memory`): every row of the last frame
          and the columns 'mux', 'muy', 'muz' are dropped; frames and ids are
          then relabelled 0, 1, 2, ... in order of first appearance. A
          low-memory file without any rows prints "Skip" and writes nothing.
        * regular files (see `is_regular`): only the 'type' column is dropped.
        * anything else: prints "Skip" and writes nothing.

        Parameters
        ----------
        sim_path : str
            Directory that contains the `ctrj` folder.
        realization : int or str
            Label interpolated into the input and output file names.

        Returns
        -------
        None
            In every case, including when the input file does not exist.
    """

    ctrj_dir = os.path.join(sim_path,'ctrj')
    filepath = os.path.join(ctrj_dir,f'ctrj{realization}.csv')
    print("Cleaning...",filepath)

    # If the file does not exist, just return
    if not os.path.isfile(filepath):
        return None

    outpath = os.path.join(ctrj_dir,f'xtrj{realization}.csv')
    df = pd.read_csv(filepath,index_col=[0,1])

    if is_low_memory(df):

        frames = df.index.get_level_values('frame')
        if len(frames) == 0:
            # Nothing to trim or relabel in a header-only file.
            print("Skip")
            return None

        # Drop every row of the last frame. A boolean mask avoids assuming
        # integer frame labels or a lexsorted index.
        # NOTE(review): why the last frame is discarded is not shown here
        # (presumably it is incomplete) -- confirm.
        last_frame = frames.unique()[-1]
        dfclean = df[frames != last_frame].drop(columns=['mux','muy','muz'])

        # Relabel each level from the labels the rows actually carry, so a row
        # keeps its own (frame, id) pair even if ids are ordered differently
        # across frames or a frame is missing an id.
        frame_codes, _ = pd.factorize(dfclean.index.get_level_values('frame'))
        id_codes, _ = pd.factorize(dfclean.index.get_level_values('id'))
        dfclean.index = pd.MultiIndex.from_arrays([frame_codes,id_codes], names = ['frame','id'])
        dfclean.to_csv(outpath)

    elif is_regular(df):
        dfclean = df.drop(columns=['type'])
        dfclean.to_csv(outpath)

    else:
        print("Skip")

In [4]:
# Root folder of the simulations (machine-specific absolute path); the loop
# below treats each immediate sub-directory as one simulation size.
drive = r'/mnt/e/stuckgs/data/sims'
# os.walk yields (dirpath, dirnames, filenames); the first item describes
# `drive` itself, so [1] is the list of its immediate sub-directory names.
sizes = next(os.walk(drive))[1]

In [5]:
# Clean realizations 1..10 of every size folder, skipping the ones whose
# cleaned xtrj file is already on disk.
for size in sizes:
    ctrjpath = os.path.join(drive, size)
    for r in range(1, 11):
        trypath = os.path.join(ctrjpath, 'ctrj', f'xtrj{r}.csv')
        if not os.path.isfile(trypath):
            clean_data(ctrjpath, r)
        

Cleaning... /mnt/e/stuckgs/data/sims/22/ctrj/ctrj6.csv
Cleaning... /mnt/e/stuckgs/data/sims/22/ctrj/ctrj7.csv
Cleaning... /mnt/e/stuckgs/data/sims/22/ctrj/ctrj8.csv
Cleaning... /mnt/e/stuckgs/data/sims/22/ctrj/ctrj9.csv
Cleaning... /mnt/e/stuckgs/data/sims/22/ctrj/ctrj10.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj1.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj2.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj3.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj4.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj5.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj6.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj7.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj8.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj9.csv
Cleaning... /mnt/e/stuckgs/data/sims/23/ctrj/ctrj10.csv
Cleaning... /mnt/e/stuckgs/data/sims/24/ctrj/ctrj1.csv
Cleaning... /mnt/e/stuckgs/data/sims/24/ctrj/ctrj2.csv
Cleaning... /mnt/e/stuckgs/data/sims/24/ctrj/ctrj3.csv
Cleaning