# Split large CSV files into smaller files

In [1]:
## Import dependencies
import os

import numpy as np
import pandas as pd

In [2]:
## Check system memory for Python
try:
  import psutil
except:
  %pip install psutil
  import psutil

memory = psutil.virtual_memory()
print(f" {'*' * 3} Memory used percentage - {memory.percent} \n {'*' * 4} Free Memory available - { round(memory.free / (1024.0 ** 3))} GB")


 *** Memory used percentage - 48.2 
 **** Free Memory available - 12 GB


In [6]:
# Split the first part of the raw waveform file into chunk-sized CSV files.
#
# Background:
# - Each event has 100000 samples; each sample is one row.
# - Some events contain errors. Row 174827928 raises an error: that sample
#   has an additional column. To inspect the rows starting there:
#   pd.read_csv(file, encoding='utf-8', sep=' ', skiprows=174827928, nrows=5)
# - Dropping only the faulty event would mean reading up to row 174800000 and
#   resuming after row 174900000. This cell instead stops after 170000000
#   rows, a whole number of chunks (presumably so that the cut falls on a
#   chunk boundary - TODO confirm).
# - Dask may be worth adopting later for large files and memory consumption.
file = "../data/1.Wfm.csv"
RESULTS_DIR = "../results"
CHUNK_ROWS = 10000000      # rows per output file
ROWS_TO_READ = 170000000   # stop before the chunk containing the faulty row

# to_csv does not create missing directories, so make sure the target exists
# before the (slow) parsing starts.
os.makedirs(RESULTS_DIR, exist_ok=True)

with pd.read_csv(
    file,
    encoding='utf-8',
    sep=' ',
    chunksize=CHUNK_ROWS,
    nrows=ROWS_TO_READ,
    low_memory=False,
    memory_map=True,
    index_col=False,
    header=None,
) as reader:
    # Each chunk is already a DataFrame; write it as test_<i>.csv, numbered
    # from 0, without the row index.
    for i, chunk in enumerate(reader):
        chunk.to_csv(f"{RESULTS_DIR}/test_{i}.csv", index=False)

In [7]:
# Split the remainder of the raw waveform file into chunk-sized CSV files,
# resuming after the chunk that contains the faulty row.
#
# Background:
# - Each event has 100000 samples; each sample is one row.
# - Some events contain errors. Row 174827928 raises an error: that sample
#   has an additional column. To inspect the rows starting there:
#   pd.read_csv(file, encoding='utf-8', sep=' ', skiprows=174827928, nrows=5)
# - Dropping only the faulty event would mean reading up to row 174800000 and
#   resuming after row 174900000. This cell instead skips the first 180000000
#   rows, a whole number of chunks (presumably so that the cut falls on a
#   chunk boundary - TODO confirm).
# - Dask may be worth adopting later for large files and memory consumption.
file = "../data/1.Wfm.csv"
RESULTS_DIR = "../results"
CHUNK_ROWS = 10000000      # rows per output file
ROWS_TO_SKIP = 180000000   # resume after the chunk containing the faulty row
# Output numbering continues at 17: reading 170000000 rows in chunks of
# 10000000 yields 17 files (test_0 .. test_16) for the first part of the file.
FIRST_PART = 17

# to_csv does not create missing directories, so make sure the target exists
# before the (slow) parsing starts.
os.makedirs(RESULTS_DIR, exist_ok=True)

with pd.read_csv(
    file,
    encoding='utf-8',
    sep=' ',
    skiprows=ROWS_TO_SKIP,
    chunksize=CHUNK_ROWS,
    low_memory=False,
    memory_map=True,
    index_col=False,
    header=None,
) as reader:
    # Each chunk is already a DataFrame; write it as test_<i>.csv without the
    # row index.
    for i, chunk in enumerate(reader, start=FIRST_PART):
        chunk.to_csv(f"{RESULTS_DIR}/test_{i}.csv", index=False)

Samples between rows 170000000 and 180000000 (one chunk of 10000000 rows) are lost.
Each event = 100000 samples.
Total events lost: 100.

The code below is for testing only.

In [None]:
## Test with 10 rows
# Round trip: read the first 10 rows of `file` (set in an earlier cell),
# write them to a CSV without the row index, read that CSV back and print it.
sample_csv = "test1_0.csv"
df = pd.read_csv(
    file,
    sep=' ',
    encoding='utf-8',
    header=None,
    index_col=False,
    nrows=10,
    low_memory=False,
    memory_map=True,
)
df.to_csv(sample_csv, index=False)
dffr = pd.read_csv(sample_csv)
print(dffr)

In [None]:
## Test with parquet
# Read up to 10 rows of `file` (set in an earlier cell) in chunks of 2 rows,
# write every chunk to its own parquet file, then read the files back.
channel_cols = ["time", "ch0", "ch1", "ch2", "ch3"]

i = 0
with pd.read_csv(
    file,
    sep=' ',
    encoding='utf-8',
    header=None,
    names=channel_cols,
    index_col=False,
    nrows=10,
    chunksize=2,
    low_memory=False,
    memory_map=True,
) as reader:
    for chunk in reader:
        fth_file = f"test_{i}.parquet"
        # reset_index() turns the chunk's row labels into a regular column
        # before the chunk is written.
        chunk.reset_index().to_parquet(fth_file, index=None)
        i += 1

# Read back only the named columns from each file written above.
for y in range(i):
    dff = pd.read_parquet(f"test_{y}.parquet", columns=channel_cols)
    print(dff)

In [None]:
## Test with csv
# Read up to 10 rows of `file` (set in an earlier cell) in chunks of 2 rows,
# write every chunk to its own CSV file, then read the files back.
i = 0
with pd.read_csv(
    file,
    sep=' ',
    encoding='utf-8',
    header=None,
    index_col=False,
    nrows=10,
    chunksize=2,
    low_memory=False,
    memory_map=True,
) as reader:
    for chunk in reader:
        fth_file = f"test_{i}.csv"
        chunk.to_csv(fth_file, index=False)
        i += 1

# Print each file written above.
for y in range(i):
    dff = pd.read_csv(f"test_{y}.csv")
    print(dff)