More about Vaex:
https://towardsdatascience.com/how-to-analyse-100s-of-gbs-of-data-on-your-laptop-with-python-f83363dda94

In [1]:
import pandas as pd
import numpy as np
import vaex
import glob
import os
import re

In [2]:
path="D:/ML/Project/NYC Taxi Trip/"

In [3]:
def del_hdf5(filepath):
    files = glob.glob(filepath+'*.hdf5')
    for f in files:
        os.remove(f)

In [4]:
def tryint(s):
    try:
        return int(s)
    except:
        return s

In [5]:
def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [ tryint(c) for c in re.split('([0-9]+)', s) ]

In [6]:
def csv2hdf5(filepath, filename, rownum):
    
    # Delete existing HDF5 files
    del_hdf5(filepath)
    
    # Initialize the counters
    count=0
    i=0
    rows=0
    
    # Count the number of rows in the csv file
    with open(filepath+filename+'.csv') as fp:
        for _ in fp:
            count += 1
            
    # Loop to convert csv to HDF5 file
    while count - rownum > rownum:
        df = pd.read_csv(filepath+filename+'.csv',
                         skiprows=range(1,rows+1),
                         nrows=rownum) #skip rows
        df = vaex.from_pandas(df) # Convert to vaex format
        df.export_hdf5(filepath+str(i)+filename+'.hdf5') # Export in HDF5 format
        count -= rownum
        i += 1
        rows += rownum
    else:
        df = pd.read_csv(filepath+filename+'.csv',
                         skiprows=range(1,rows+1),
                         nrows=count)
        df = vaex.from_pandas(df)
        df.export_hdf5(filepath+str(i)+filename+'.hdf5')
        
    # Return the list of HDF5 file
    hdf5_list = glob.glob(filepath+'*.hdf5')
    hdf5_list.sort(key=alphanum_key)
    hdf5_list = np.array(hdf5_list)
    
    # Open all HDF5 files into dataframe
    master_df = vaex.open_many(hdf5_list)
    
    # Export dataframe into one file with HDF extension
    master_df.export_hdf5(filepath+filename+'.hdf', progress=True)
    
    # Close open file handles
    for df in master_df.dfs:
        df.close_files()
    
    # Delete all file with HDF5 extension
    del_hdf5(filepath)
    
    # Correct the master DF extension
    os.rename(filepath+filename+'.hdf', filepath+filename+'.hdf5')
    
    return filename+'.hdf5'

In [7]:
%%time
file = csv2hdf5(path,'train',10000)

[#######################################-]:  98.52% estimated time:        0s =  0.0m =  0.0h         Wall time: 1min 51s


In [9]:
data = vaex.open(path+file)
data.sample(5)

#,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,index
0,id2945727,2,2016-02-03 23:40:17,2016-02-03 23:43:07,1,-73.9941,40.7268,-73.9897,40.7342,N,170,1266
1,id2187626,2,2016-04-22 14:35:53,2016-04-22 14:42:08,1,-74.0065,40.7322,-73.9976,40.7336,N,375,5847
2,id1225230,2,2016-01-03 02:00:40,2016-01-03 02:13:36,1,-73.973,40.7852,-73.9802,40.7306,N,776,5304
3,id2578994,1,2016-03-28 19:27:08,2016-03-28 19:37:34,1,-73.9604,40.7786,-73.9787,40.7641,N,626,2338
4,id0369292,1,2016-04-20 18:11:17,2016-04-20 18:25:55,1,-73.9531,40.7724,-73.9892,40.7382,N,878,2810


Let's check the integrity of the data after conversion

In [10]:
data.describe()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,index
dtype,str,int64,str,str,int64,float64,float64,float64,float64,str,int64,int64
count,1458644,1458644,1458644,1458644,1458644,1458644,1458644,1458644,1458644,1458644,1458644,1458644
,0,0,0,0,0,0,0,0,0,0,0,0
mean,--,1.5349502688798637,--,--,1.6645295219395548,-73.97348630489282,40.750920908391734,-73.9734159469458,40.7517995149002,--,959.4922729603659,5054.7426555074435
std,--,0.498777,--,--,1.31424,0.0709018,0.0328812,0.0706433,0.0358905,--,5237.43,2972
min,--,1,--,--,0,-121.933,34.3597,-121.933,32.1811,--,1,0
max,--,2,--,--,9,-61.3355,51.8811,-61.3355,43.921,--,3526282,18643


In [11]:
%%time
pd.read_csv(path+'train.csv').describe()

Wall time: 4.82 s


Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0
