## Directly Read the File

In [1]:
import pandas as pd

# Path of the '^'-separated CSV file that the later cells read again in chunks.
searches = "big_searches.csv"

# Baseline: load the entire file into a single DataFrame in one call.
searches_df = pd.read_csv(filepath_or_buffer=searches, sep="^")
# Preview the first five rows (the default for head()).
searches_df.head()

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,...,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice
0,2013-01-01,20:25:57,MPT,624d8c3ac0b3a7ca03e3c167e0f48327,DE,TXL,AUH,1,2,TXL,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,FRA
1,2013-01-01,10:15:33,MPT,b0af35b31588dc4ab06d5cf2986e8e02,MD,ATH,MIL,0,1,ATH,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,KIV
2,2013-01-01,18:04:49,MPT,3561a60621de06ab1badc8ca55699ef3,US,ICT,SFO,1,2,ICT,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC
3,2013-01-01,17:42:40,FXP,1864e5e8013d9414150e91d26b6a558b,SE,RNB,ARN,0,1,RNB,...,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,STO
4,2013-01-01,17:48:29,MPT,1ec336348f44207d2e0027dc3a68c118,NO,OSL,MAD,1,2,OSL,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,OSL


In [9]:
# Total footprint of the fully loaded frame: per-column byte counts, index included.
df_memory = searches_df.memory_usage(index=True).sum()
# Report the same figure twice: raw bytes, then decimal gigabytes (10**9 bytes).
for amount, unit in ((df_memory, "bytes"), (df_memory / (10**9), "G")):
    print("The dataframe consume", amount, unit)

The dataframe consume 7202879720 bytes
The dataframe consume 7.20287972 G


In [14]:
def memory_usage(df, deep=False):
    """Print the memory footprint of ``df`` in bytes and in gigabytes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to measure; the index is included in the total.
    deep : bool, default False
        Forwarded to ``DataFrame.memory_usage``. The default keeps the shallow
        estimate, which for object (string) columns counts only the object
        pointers; pass ``True`` to also count the string payloads themselves.

    Returns
    -------
    None
        The two figures are printed, not returned.
    """
    df_memory = df.memory_usage(index=True, deep=deep).sum()
    print("The dataframe consume", df_memory, "bytes")
    # Decimal gigabytes (10**9 bytes), not GiB (2**30 bytes).
    print("The dataframe consume", df_memory / (10**9), "G")

## Reading the file in chunks
### Small chunk

In [16]:
# Number of rows per chunk for the "small" experiment.
chksize_small=1000
reader=pd.read_csv(searches,sep='^',chunksize=chksize_small)

# Only the first chunk is needed to measure its footprint. Pull it explicitly and
# close the reader, so the handle on the CSV is released instead of being left
# open by breaking out of a loop.
try:
    searches_chuck_df = next(reader)
finally:
    reader.close()
memory_usage(searches_chuck_df)

The dataframe consume 360080 bytes
The dataframe consume 0.00036008 G


### Large chunk

In [20]:
# Number of rows per chunk for the "large" experiment; reused by the HDF5 export below.
chksize_big=1000000
reader=pd.read_csv(searches,sep='^',chunksize=chksize_big)

# Only the first chunk is needed to measure its footprint. Pull it explicitly and
# close the reader, so the handle on the CSV is released instead of being left
# open by breaking out of a loop.
try:
    searches_chuck_df = next(reader)
finally:
    reader.close()
memory_usage(searches_chuck_df)
    

The dataframe consume 360000080 bytes
The dataframe consume 0.36000008 G


In [21]:
# Show the last five rows of the chunk (tail() defaults to n=5).
searches_chuck_df.tail()

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,...,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice
999995,2013-01-01,18:39:14,MPT,38a3abb0a28e3f00fa79a11f552a5052,FR,PMO,VRN,1,2,PMO,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,PAR
999996,2013-01-01,08:47:07,MPT,9aa037e8d4135205e54688c113ed7e80,DE,QYG,SFO,1,2,QYG,...,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,BER
999997,2013-01-01,21:07:29,MPT,38a3abb0a28e3f00fa79a11f552a5052,FR,MAH,PAR,0,1,MAH,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,PAR
999998,2013-01-01,17:17:21,MTP,236cbf458f79dbb9d5e9c9430438c3db,US,SFO,LGW,1,2,SFO,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,HPN
999999,2013-01-01,15:08:37,MPT,909e0b385888f1305839025d209a5a93,GB,LHR,LAX,1,2,LHR,...,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,LON


## Exercise 3

In [23]:
# Output HDF5 file for the reduced (Destination, Date, Day) table.
ffilter = 'ex3_large.h5'
# To handle numeric NaN values in the PAX column, missing values are mapped to 0.
def nan2zero(num):
    """Convert ``num`` to an ``int``, returning 0 when it cannot be converted.

    Parameters
    ----------
    num : object
        Value to convert, e.g. a number or a numeric string.

    Returns
    -------
    int
        ``int(num)``, or 0 for NaN, infinity, ``None``, empty or otherwise
        non-integer input.
    """
    try:
        # Builtin int instead of np.int: numpy is not imported in the cells shown,
        # and the np.int alias no longer exists in current NumPy releases.
        return int(num)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to 0; unrelated errors (such as a
        # missing name) are no longer silently swallowed.
        return 0

# Stream the CSV in chunks of `chksize_big` rows so the full file never has to fit
# in memory, and parse only the two columns this exercise needs.
reader = pd.read_csv(searches, sep='^', memory_map=True, chunksize=chksize_big,
                     usecols=['Destination', 'Date'])
# mode='w' recreates the store on every run, so re-running this cell does not
# append the same rows a second time.
with pd.HDFStore(ffilter, mode='w') as store:
    for nc, chunk in enumerate(reader, start=1):
        print("Reading chunk number %s"%nc)
        dest_dates = (
            # usecols returns columns in file order; select explicitly to keep
            # the (Destination, Date) order of the stored table.
            chunk[['Destination', 'Date']]
            # Drop rows where either column is missing.
            .dropna(axis=0, how='any')
            # Parse dates; unparseable values become NaT ...
            .assign(Date=lambda d: pd.to_datetime(d['Date'], errors='coerce', yearfirst=True))
            # ... and are dropped, so 'Day' stays an integer column in every chunk
            # (a NaT would turn it into float and no longer match the table
            # created by the first append).
            .dropna(subset=['Date'])
            .assign(
                Day=lambda d: d['Date'].dt.day,
                # Normalise the code: no surrounding whitespace, upper case.
                Destination=lambda d: d['Destination'].astype('str').str.strip().str.upper(),
            )
        )
        # Append this chunk to the on-disk table.
        store.append('searches', dest_dates)

Reading chunk number 1
Reading chunk number 2
Reading chunk number 3
Reading chunk number 4
Reading chunk number 5
Reading chunk number 6
Reading chunk number 7
Reading chunk number 8
Reading chunk number 9
Reading chunk number 10
Reading chunk number 11
Reading chunk number 12
Reading chunk number 13
Reading chunk number 14
Reading chunk number 15
Reading chunk number 16
Reading chunk number 17
Reading chunk number 18
Reading chunk number 19
Reading chunk number 20
Reading chunk number 21


In [24]:
# Load the table stored in the HDF5 file back into memory (opened read-only).
df = pd.read_hdf(path_or_buf=ffilter, mode='r')
# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,Destination,Date,Day
0,AUH,2013-01-01,1
1,MIL,2013-01-01,1
2,SFO,2013-01-01,1
3,ARN,2013-01-01,1
4,MAD,2013-01-01,1
5,BLR,2013-01-01,1
6,PAR,2013-01-01,1
7,DUB,2013-01-01,1
8,ACE,2013-01-01,1
9,BGW,2013-01-01,1


In [25]:
# Footprint of the reduced table read back from the HDF5 file.
memory_usage(df=df)

The dataframe consume 640255968 bytes
The dataframe consume 0.640255968 G


In [28]:
import pandas_profiling

# Build the profiling report through the explicit constructor rather than the
# DataFrame accessor; as the cell's last expression it is rendered inline.
pandas_profiling.ProfileReport(df, title='amadeus')

