# Exercise 4

**Match searches with bookings**

- For every search in the searches file, find out whether the search ended up in a booking or not (using the info in the bookings file). For instance, search and booking origin and destination should match.
- For the bookings file, origin and destination are the columns `dep_port` and `arr_port`, respectively.
- Generate a CSV file with the search data, and an additional field, containing 1 if the search ended up in a booking, and 0 otherwise.

In [7]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

pd.options.display.max_columns = None
#pd.options.display.max_rows = 100

In [11]:
def searches_data_preprocessing(df, columns_to_clean): 
    '''
    
    '''
    df.drop_duplicates(inplace=True)
    
    for column in columns_to_clean:
        df[column] = df[column].str.strip()
    
        if 'Date' not in column:
            df[column] = df[column].str.upper()
            # Para saber si hay lengths superiores a 3 sin que nos de error por los missing values    
            cell_lengths = df[column].map(lambda x: len(x) if not isinstance(x, float) else x)
            cell_lengths_notna = cell_lengths[cell_lengths.notna()]
            cell_lengths_sum = (cell_lengths_notna != 3.0).sum()
            if cell_lengths_sum != 0:
                print(f'Alert! There are `{column}` values with length different than 3. Dropping them..')

                mask = cell_lengths_notna != 3.0
                wrong_data = cell_lengths_notna[mask]
                print(wrong_data)
                df.drop(index=wrong_data.index, inplace=True)
    return df      

In [9]:
dtypes = {'Date':'object',
          'Time':'object',
          'TxnCode':'object',
          'OfficeID':'object',
          'Country':'object',
          'Origin':'object',
          'Destination':'object',
          'RoundTrip':'int64',
          'NbSegments':'int64',
          'Seg1Departure':'object',
          'Seg1Arrival':'object',
          'Seg1Date':'object',
          'Seg1Carrier':'object',
          'Seg1BookingCode':'object',
          'Seg2Departure':'object',
          'Seg2Arrival':'object',
          'Seg2Date':'object',
          'Seg2Carrier':'object',
          'Seg2BookingCode':'object',
          'Seg3Departure':'object',
          'Seg3Arrival':'object',
          'Seg3Date':'object',
          'Seg3Carrier':'object',
          'Seg3BookingCode':'object',
          'Seg4Departure':'object',
          'Seg4Arrival':'object',
          'Seg4Date':'object',
          'Seg4Carrier':'object',
          'Seg4BookingCode':'object',
          'Seg5Departure':'object',
          'Seg5Arrival':'object',
          'Seg5Date':'object',
          'Seg5Carrier':'object',
          'Seg5BookingCode':'object',
          'Seg6Departure':'object',
          'Seg6Arrival':'object',
          'Seg6Date':'object',
          'Seg6Carrier':'object',
          'Seg6BookingCode':'object',
          'From':'object',
          'IsPublishedForNeg':'int64',
          'IsFromInternet':'int64',
          'IsFromVista':'int64',
          'TerminalID':'object',
          'InternetOffice':'object'}

In [15]:
columns_to_clean = ['Date', 'Seg1Departure', 'Seg1Arrival', 'Seg1Date', 
                 'Seg2Departure', 'Seg2Arrival', 'Seg2Date', 
                 'Seg3Departure', 'Seg3Arrival', 'Seg3Date', 
                 'Seg4Departure', 'Seg4Arrival', 'Seg4Date', 
                 'Seg5Departure', 'Seg5Arrival', 'Seg5Date', 
                 'Seg6Departure', 'Seg6Arrival', 'Seg6Date',
                ]

data_iterator = pd.read_csv("../challenge/searches.csv.bz2", 
                            chunksize=10**5, 
                            sep='^', 
                            compression='bz2',
                            nrows=10**6,
                            low_memory=False,
                            #dtype=dtypes,
                            #na_values=[np.nan],
                            memory_map=True
                           )

searches = pd.DataFrame()
for i, data_chunk in enumerate(data_iterator):
    data_chunk = searches_data_preprocessing(data_chunk, columns_to_clean)
    #data_chunk.reset_index(inplace=True)
    data_chunk = data_chunk
    searches = searches.append(data_chunk)
    print("Chunk: %d, size of chunk %d"%(i+1, data_chunk.shape[0]))

Chunk: 1, size of chunk 100000
Chunk: 2, size of chunk 99999
Chunk: 3, size of chunk 100000
Chunk: 4, size of chunk 100000
Chunk: 5, size of chunk 100000
Chunk: 6, size of chunk 99999
Chunk: 7, size of chunk 100000
Chunk: 8, size of chunk 100000
Chunk: 9, size of chunk 100000
Chunk: 10, size of chunk 99999


In [16]:
searches.shape

(999997, 45)

In [18]:
def bookings_data_preprocessing(df): 
    '''
    
    '''
    df.drop_duplicates(inplace=True)
    
    for column in df.columns:
        df[column] = df[column].str.strip()
    
        if 'port' in column:
            df[column] = df[column].str.upper()
            
            # Para saber si hay lengths superiores a 3 sin que nos de error por los missing values
            cell_lengths = df[column].map(lambda x: len(x) if not isinstance(x, float) else x)
            cell_lengths_notna = cell_lengths[cell_lengths.notna()]
            cell_lengths_sum = (cell_lengths_notna != 3.0).sum()
            if cell_lengths_sum != 0:
                print(f'Alert! There are `{column}` values with length different than 3. Dropping them..')

                mask = cell_lengths_notna != 3.0
                wrong_data = cell_lengths_notna[mask]
                print(wrong_data)
                df.drop(index=wrong_data.index, inplace=True)
        else:
            df[column] = df[column].str.split().str[0]
        
    return df      

In [20]:
data_iterator = pd.read_csv("../challenge/bookings.csv.bz2", 
                            chunksize = 10**5, 
                            sep = '^', 
                            compression = 'bz2',
                            nrows = 10**6,
                            usecols = ['dep_port', 
                                       'arr_port', 
                                       'brd_time           ',
                                       'cre_date           ',
                                      ],
                            low_memory=False
                           )

bookings = pd.DataFrame()
for i, data_chunk in enumerate(data_iterator):
    data_chunk = bookings_data_preprocessing(data_chunk)
    #data_chunk.reset_index(inplace=True)
    data_chunk['Booking'] = 1
    bookings = bookings.append(data_chunk)
    print("Chunk: %d, size of chunk %d"%(i+1, data_chunk.shape[0]))


Chunk: 1, size of chunk 71284
Chunk: 2, size of chunk 71096
Chunk: 3, size of chunk 70668
Chunk: 4, size of chunk 70846
Chunk: 5, size of chunk 71757
Chunk: 6, size of chunk 72056
Chunk: 7, size of chunk 71845
Chunk: 8, size of chunk 71992
Chunk: 9, size of chunk 72239
Chunk: 10, size of chunk 72088


In [21]:
bookings.shape

(715871, 5)

Comprobar de nuevo los duplicados y eliminarlos

In [22]:
bookings.duplicated().sum()

56873

In [23]:
searches.duplicated().sum()

540998

In [24]:
bookings.drop_duplicates(inplace=True)
bookings.duplicated().sum()

0

In [25]:
searches.drop_duplicates(inplace=True)
searches.duplicated().sum()

0

In [26]:
print(bookings.shape)
print(searches.shape)

(658998, 5)
(458999, 45)


Merge

In [200]:
new_columns_bookings = [['Date', 'Seg1Departure', 'Seg1Arrival', 'Seg1Date', 'booking'], 
                        ['Date', 'Seg2Departure', 'Seg2Arrival', 'Seg2Date', 'booking'],
                        ['Date', 'Seg3Departure', 'Seg3Arrival', 'Seg3Date', 'booking'],
                        ['Date', 'Seg4Departure', 'Seg4Arrival', 'Seg4Date', 'booking'],
                        ['Date', 'Seg5Departure', 'Seg5Arrival', 'Seg5Date', 'booking'],
                        ['Date', 'Seg6Departure', 'Seg6Arrival', 'Seg6Date', 'booking'],
                       ]

In [201]:
for i, lista in enumerate(new_columns_bookings):
    bookings.columns = lista

    searches = searches.merge(bookings, 
                              how = 'left', 
                              on = lista[:4], 
                              suffixes=(i, i+1),
                             )
    print(searches.shape)

(458999, 46)
(458999, 47)
(458999, 48)
(458999, 49)
(458999, 50)
(458999, 51)


In [202]:
searches.iloc[:, -6:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458999 entries, 0 to 458998
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   booking1  530 non-null    float64
 1   booking2  214 non-null    float64
 2   booking3  36 non-null     float64
 3   booking4  30 non-null     float64
 4   booking5  3 non-null      float64
 5   booking6  1 non-null      float64
dtypes: float64(6)
memory usage: 24.5 MB


In [203]:
searches.iloc[:, -6:] = searches.iloc[:, -6:].fillna(0)

In [204]:
searches.iloc[:, -6:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458999 entries, 0 to 458998
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   booking1  458999 non-null  float64
 1   booking2  458999 non-null  float64
 2   booking3  458999 non-null  float64
 3   booking4  458999 non-null  float64
 4   booking5  458999 non-null  float64
 5   booking6  458999 non-null  float64
dtypes: float64(6)
memory usage: 24.5 MB


In [205]:
searches.iloc[:, -6:].sum()

booking1    530.0
booking2    214.0
booking3     36.0
booking4     30.0
booking5      3.0
booking6      1.0
dtype: float64

Creamos la nueva columna

In [206]:
searches['Booking'] = 0
for column in searches.columns[-7:-1]:
    searches['Booking'] += searches[column]

In [207]:
searches['Booking'].sum()

814.0

In [208]:
searches.iloc[:, -7:-1].sum().sum() #comprobacion

814.0

Eliminamos las columnas sobrantes

In [209]:
searches.columns[-7:-1]

Index(['booking1', 'booking2', 'booking3', 'booking4', 'booking5', 'booking6'], dtype='object')

In [212]:
print(searches.shape)
searches.drop(columns=searches.columns[-7:-1], inplace=True)
print(searches.shape)

(458999, 52)
(458999, 46)


In [213]:
searches['Booking'].sum()

814.0

In [27]:
bookings.isna().sum()

cre_date               0
dep_port               0
arr_port               0
brd_time               0
Booking                0
dtype: int64

In [28]:
searches.isna().sum()

Date                      0
Time                      0
TxnCode                   0
OfficeID                  0
Country                  51
Origin                    0
Destination               0
RoundTrip                 0
NbSegments                0
Seg1Departure             0
Seg1Arrival               0
Seg1Date               1282
Seg1Carrier          284543
Seg1BookingCode      411897
Seg2Departure        132239
Seg2Arrival          132239
Seg2Date             133765
Seg2Carrier          337279
Seg2BookingCode      418449
Seg3Departure        434559
Seg3Arrival          434559
Seg3Date             434691
Seg3Carrier          435786
Seg3BookingCode      437078
Seg4Departure        439480
Seg4Arrival          439480
Seg4Date             439599
Seg4Carrier          439648
Seg4BookingCode      439945
Seg5Departure        455221
Seg5Arrival          455221
Seg5Date             455249
Seg5Carrier          455244
Seg5BookingCode      455309
Seg6Departure        456732
Seg6Arrival         

|Column|NaN|
|:---:|:---:|
|Date|0|
|Seg1Departure|0|
|Seg1Arrival|0|
|Seg1Date|1282|

|Column|NaN|
|:---:|:---:|
|Date|0|
|Seg2Departure|132239|
|Seg2Arrival|132239|
|Seg2Date|133765|

Pdemos observar el mismo patron en todos los segmentos, `SegnDeparture` y `SegnArribal` tienen la misma cantidad de missing values, mientras que `SegnDate` tiene unos pocos mas. 

Ello significa que si un cierto dia se reservan vuelos de una ciudad a otra en dias distintos, estos tendran los mismos datos en las columnas relevantes para hacer el join.

Esto lo vamos a dejar para el finde.