## Data Preparation

* Many of the columns just repeat the same few values. Changing these columns to the categorical data type means the rows hold only references to those values.

* Using the Parquet file type with PyArrow (or fastparquet). The Parquet file encodes our data types, and it is also lighter and much faster to load. Pandas will now use PyArrow as the backend, so the whole BHB data set can be loaded and manipulated easily.

PyArrow
```Shell
conda install -c conda-forge pyarrow
```

In [6]:
import os
import pandas as pd

# IPython shell capture: sed deletes every line except line 5 of /proc/cpuinfo,
# so cpu_model is a one-element list holding that line. Linux-only; line 5 is
# the "model name" entry on the machine that produced the output below --
# verify on other systems.
cpu_model = !sed '5!d' /proc/cpuinfo
# Number of logical CPUs reported by the operating system.
n_cpus=os.cpu_count()

# Record the hardware the notebook was run on.
print(cpu_model, '\n CPUS:', n_cpus)

# Baseline load: read every column of the CSV into memory.
df=pd.read_csv("BHBHm.csv")
df

['model name\t: Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz'] 
 CPUS: 4


Unnamed: 0,ID,name,BWorldtime,Mass_0,Radius_0,Zams_0,Phase_0,RemnantType_0,Mass_1,Radius_1,...,NSu,channel,subchannel,collisions,CE,CEb,CEa,SMTa,EventsSimple,alpha
0,40084,0_629754175943409,5.654169,29.804870,0.000127,32.84829,7,6,18.601870,0.000079,...,0,2,s,0,0,0,0,1,M:ehS:M:erS,0.5
1,40736,0_533923893120796,6.186252,20.484050,0.000087,22.21250,7,6,16.858310,0.000072,...,0,2,s,0,0,0,0,1,M:ehS:M:M:erS,0.5
2,40982,0_594107279352875,3.794108,36.202330,0.000154,42.04988,7,6,37.315660,0.000158,...,0,2,s,0,0,0,0,1,M:ehS:M:erS,0.5
3,41248,0_110242806030047,5.113947,20.964950,0.000089,22.97500,7,6,22.923960,0.000097,...,0,2,s,0,0,0,0,1,M:ehS:M:erS,0.5
4,41445,0_508796918105102,7.302836,15.490630,0.000066,16.74938,7,6,12.068660,0.000051,...,0,2,s,0,0,0,0,1,M:ehS:M:erS,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
801033,3911065,0_985999188379511,6.093106,9.356520,0.000040,19.95530,7,6,8.156447,0.000035,...,0,0,n,0,0,0,0,0,ehS:hrS,5.0
801034,3501890,0_539944566758815,5.900609,10.859560,0.000046,21.28041,7,6,8.572336,0.000036,...,0,0,n,0,0,0,0,0,ehS:hrS,5.0
801035,2450800,0_407755694057291,4.903003,10.656350,0.000045,21.12394,7,6,6.308559,0.000027,...,0,2,s,0,0,0,0,1,ehS:M:erS,5.0
801036,2642107,0_633656604516244,5.398514,9.010068,0.000038,19.63342,7,6,6.504719,0.000028,...,0,0,n,0,0,0,0,0,ehS:erS,5.0


In [4]:
# List the column labels of the loaded frame to see what is available.
df.columns

Index(['ID', 'name', 'BWorldtime', 'Mass_0', 'Radius_0', 'Zams_0', 'Phase_0',
       'RemnantType_0', 'Mass_1', 'Radius_1', 'Phase_1', 'Zams_1',
       'RemnantType_1', 'Semimajor', 'Eccentricity', 'GWtime', 'EventsAll',
       'Events', 'EventsPlus', 'Mzams_0', 'Mzams_1', 'Semimajor_ini',
       'Eccentricity_ini', 'Z', 'NSt', 'NSu', 'channel', 'subchannel',
       'collisions', 'CE', 'CEb', 'CEa', 'SMTa', 'EventsSimple', 'alpha'],
      dtype='object')

### This is inefficient as we don't actually need all of these columns

In [5]:
# Narrow the already-loaded frame down to the four columns of interest.
wanted_columns = ["Mass_0", "Mass_1", "Z", "alpha"]
df = df[wanted_columns]
df

Unnamed: 0,Mass_0,Mass_1,Z,alpha
0,10.231240,1.330314,0.0001,0.5
1,16.193230,1.264407,0.0001,0.5
2,11.472340,1.158055,0.0001,0.5
3,7.810100,1.262766,0.0001,0.5
4,9.672037,1.211787,0.0001,0.5
...,...,...,...,...
209965,1.520273,8.679057,0.0300,5.0
209966,1.339562,7.682926,0.0300,5.0
209967,10.638750,1.328837,0.0300,5.0
209968,1.443154,5.990815,0.0300,5.0


### Better yet, do not load the extra columns at all

In [7]:
# Ask the CSV parser for just the needed columns so the rest are never loaded.
wanted_columns = ['Mass_0', 'Mass_1', 'Z', 'alpha']
df = pd.read_csv("BHBHm.csv", usecols=wanted_columns)
df

Unnamed: 0,Mass_0,Mass_1,Z,alpha
0,29.804870,18.601870,0.0001,0.5
1,20.484050,16.858310,0.0001,0.5
2,36.202330,37.315660,0.0001,0.5
3,20.964950,22.923960,0.0001,0.5
4,15.490630,12.068660,0.0001,0.5
...,...,...,...,...
801033,9.356520,8.156447,0.0300,5.0
801034,10.859560,8.572336,0.0300,5.0
801035,10.656350,6.308559,0.0300,5.0
801036,9.010068,6.504719,0.0300,5.0


Make the repetitive columns categorical

In [8]:
# Convert the two repetitive columns to the categorical dtype (in place),
# then write the frame out as a Parquet file.
for column in ('alpha', 'Z'):
    df[column] = df[column].astype('category')
df.to_parquet('BHBHm.pq')

In [10]:
# Compare the on-disk size (in bytes) of the original CSV and the Parquet file.
old_size = os.path.getsize("BHBHm.csv")
new_size = os.path.getsize('BHBHm.pq')
# Parquet size expressed as a whole-number percentage of the CSV size.
percent_of_original = round((new_size / old_size) * 100)
print("file now", percent_of_original, '% of the original size')

file now 5 % of the original size


## Reading Parquet Files
```Python
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

pf = ParquetFile('BHNSm.pq')
# Pull only the first record batch; batch_size caps it at ten rows so the
# variable name matches what is actually read.
first_ten_rows = next(pf.iter_batches(batch_size=10))
df = pa.Table.from_batches([first_ten_rows]).to_pandas()
df
```


In [34]:
# Load the whole Parquet file back into a DataFrame.
# NOTE(review): this reads 'BHNSm.pq', whereas the Data Preparation section
# writes 'BHBHm.pq'; presumably BHNSm.pq was produced by an earlier run on a
# different CSV -- confirm the file exists before a fresh Restart & Run All.
pd.read_parquet('BHNSm.pq')

Unnamed: 0,Mass_0,Mass_1,Z,alpha
0,10.231240,1.330314,0.0001,0.5
1,16.193230,1.264407,0.0001,0.5
2,11.472340,1.158055,0.0001,0.5
3,7.810100,1.262766,0.0001,0.5
4,9.672037,1.211787,0.0001,0.5
...,...,...,...,...
209965,1.520273,8.679057,0.0300,5.0
209966,1.339562,7.682926,0.0300,5.0
209967,10.638750,1.328837,0.0300,5.0
209968,1.443154,5.990815,0.0300,5.0


In [62]:
def read_rows(file, nrows=10, skiprows=11):
    """Return one batch of rows from a Parquet file as a pandas DataFrame.

    The file is read in batches of ``nrows`` rows and the batch with index
    ``skiprows // nrows`` is returned, i.e. ``skiprows`` is rounded down to
    the nearest batch boundary rather than honoured exactly.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pyarrow.parquet.ParquetFile``.
    nrows : int, default 10
        Maximum number of rows per batch (and therefore in the result).
    skiprows : int, default 11
        Approximate number of leading rows to skip; must be non-negative.

    Returns
    -------
    pandas.DataFrame
        The rows of the selected batch.

    Raises
    ------
    ValueError
        If ``nrows`` is not positive or ``skiprows`` is negative.
    IndexError
        If ``skiprows`` points past the last batch in the file.
    """
    # Local imports keep the function usable even when the cell that imports
    # pyarrow at notebook level has not been executed yet.
    from itertools import islice
    from pyarrow.parquet import ParquetFile
    import pyarrow as pa

    if nrows <= 0:
        raise ValueError("nrows must be a positive integer")
    if skiprows < 0:
        raise ValueError("skiprows must be non-negative")

    # Index of the batch containing the requested offset (rounds down).
    nskip = int(skiprows // nrows)

    pf = ParquetFile(file)
    # Walk the batch iterator lazily and stop at the wanted batch instead of
    # materialising every batch of the file in a list first.
    batch = next(islice(pf.iter_batches(batch_size=nrows), nskip, None), None)
    if batch is None:
        raise IndexError(
            f"skiprows={skiprows} is beyond the end of the file "
            f"(batch index {nskip} does not exist)"
        )

    # TODO: could stitch together neighbouring batches to honour skiprows
    # exactly instead of rounding to a batch boundary.
    return pa.Table.from_batches([batch]).to_pandas()

# With the defaults (nrows=10, skiprows=11) the batch index is 11 divided by 10,
# rounded down, i.e. batch 1 -- rows 10-19, assuming the first batch is full.
read_rows('BHNSm.pq')

Unnamed: 0,Mass_0,Mass_1,Z,alpha
0,15.28055,1.473822,0.0001,0.5
1,12.36655,1.312276,0.0001,0.5
2,9.216427,1.431209,0.0001,0.5
3,8.339562,1.373469,0.0001,0.5
4,8.682196,1.381753,0.0001,0.5
5,10.24778,1.379576,0.0001,0.5
6,5.977589,1.496739,0.0001,0.5
7,8.983903,1.382455,0.0001,0.5
8,9.09712,1.306501,0.0001,0.5
9,18.68613,1.314753,0.0001,0.5


In [59]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

pf = ParquetFile('BHNSm.pq')
# Pull only the first record batch; batch_size caps it at ten rows so the
# variable name matches what is actually read.
first_ten_rows = next(pf.iter_batches(batch_size=10))
df = pa.Table.from_batches([first_ten_rows]).to_pandas()
df

Unnamed: 0,Mass_0,Mass_1,Z,alpha
0,10.23124,1.330314,0.0001,0.5
1,16.19323,1.264407,0.0001,0.5
2,11.47234,1.158055,0.0001,0.5
3,7.8101,1.262766,0.0001,0.5
4,9.672037,1.211787,0.0001,0.5
5,9.081004,1.396124,0.0001,0.5
6,9.64616,1.417451,0.0001,0.5
7,11.36924,1.491942,0.0001,0.5
8,9.386548,1.395518,0.0001,0.5
9,9.371522,1.298651,0.0001,0.5
