In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

pd.options.display.max_columns = None
#pd.options.display.max_rows = 100

# Exercise 4

**Match searches with bookings**

- For every search in the searches file, find out whether the search ended up in a booking or not (using the info in the bookings file). For instance, search and booking origin and destination should match.
- For the bookings file, origin and destination are the columns `dep_port` and `arr_port`, respectively.
- Generate a CSV file with the search data, and an additional field, containing 1 if the search ended up in a booking, and 0 otherwise.

###  Steps

These were the steps I followed to solve the problem, but in this notebook you can see only the last version. If you want to see the other steps you can find them in the notebook called "Data_Science_Challenge_full_notebook.ipynb".

1. Get familiar with data
2. Prepare the data for processing
3. Make action plan
4. Develop the code that works with a sample
5. Adjust the code to work with Big data
6. Test big data approach on a sample
7. Run program with big data

### Approach

We will assign two new columns to `searches` dataset. The first one will be called `Number_of_flights` and the second one will be called `Booking`. You can see an explanation and three tables that will help you understand what we will do.

- `Number_of_flights`: The column `Number_of_flights` will show a number representing the number of flights a search that ended in a booking has. If the search didn't end in a booking the number will be `0`.
- `Booking`: This column will show a `1` if the search ended in a booking. Otherwise, the number will be `0`.

In the notebook there is a picture below the text which shows that each `Search` can have several flights. Each flight is showed as a `Seg`. That is, `Seg1` may be a flight from `D` to `R`, or maybe from `A` to `B`. `Seg2` may be a flight from `R` to `B`, or maybe from `B` to `A`.

As you can't see the picture in GitHub, I have built three tables that show something similar. In the second table we represent the origin and the destination as `dep_port` and `arr_port`.

![](../images/diagram.png)

|Searche|Seg 1|Seg 2|Seg 3|Seg 4|
|:--:|:--:|:--:|:--:|:--:|
|Searche 1|Seg1: D-R|Seg2: R-B|Seg3: B-R|Seg4: R-D|
|Searche 2|Seg1: A-B|Seg2: B-A|||

|Booking||dep_port|arr_port|
|:--:|:--:|:--:|:--:|
|Booking|...|A|B|
|Booking|...|B|A|

|Searche||Number of flights|Booking|
|:--:|:--:|:--:|:--:|
|Searche 1|...|0|0|
|Searche 2|...|2|1|

### Dropping duplicated columns

#### Searches

In [2]:
path_searches = "../challenge/searches.csv.bz2"
path_bookings = "../challenge/bookings.csv.bz2"
path_searches_no_dup = "../challenge/searches_no_dups.csv"
path_bookings_no_dup = "../challenge/bookings_no_dups.csv"

In [5]:
data_iterator = pd.read_csv(path_searches, 
                            chunksize=5*10**5, 
                            sep='^', 
                            compression='bz2', 
                            low_memory=False, 
                            dtype=str)

searches = pd.DataFrame()
for i, data_chunk in enumerate(data_iterator):
    searches = searches.append(data_chunk)
    searches.drop_duplicates(inplace=True)
    searches.to_csv(path_searches_no_dup, sep='^', index=False)
    print('Chunk: %d, size of chunk %d' %(i+1, searches.shape[0]))

Chunk: 1, size of chunk 358999
Chunk: 2, size of chunk 358999
Chunk: 3, size of chunk 359003
Chunk: 4, size of chunk 359003
Chunk: 5, size of chunk 359003
Chunk: 6, size of chunk 359003
Chunk: 7, size of chunk 359003
Chunk: 8, size of chunk 359003
Chunk: 9, size of chunk 359003
Chunk: 10, size of chunk 359003
Chunk: 11, size of chunk 359003
Chunk: 12, size of chunk 359003
Chunk: 13, size of chunk 359003
Chunk: 14, size of chunk 359003
Chunk: 15, size of chunk 359003
Chunk: 16, size of chunk 359003
Chunk: 17, size of chunk 359003
Chunk: 18, size of chunk 359003
Chunk: 19, size of chunk 359003
Chunk: 20, size of chunk 359003
Chunk: 21, size of chunk 359003
Chunk: 22, size of chunk 359003
Chunk: 23, size of chunk 359003
Chunk: 24, size of chunk 359003
Chunk: 25, size of chunk 359003
Chunk: 26, size of chunk 359003
Chunk: 27, size of chunk 359003
Chunk: 28, size of chunk 359003
Chunk: 29, size of chunk 359003
Chunk: 30, size of chunk 359003
Chunk: 31, size of chunk 359003
Chunk: 32, size o

**Arreglar los valores con comas.**

In [6]:
searches.tail()

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice
1436000,2013-01-01,20:25:57,MPT,624d8c3ac0b3a7ca03e3c167e0f48327,DE,TXL,AUH,1.0,2.0,TXL,AUH,2013-01-26,D2,,AUH,TXL,2013-02-02,D2,,,,,,,,,,,,,,,,,,,1ASIWS,0,0.0,0,d41d8cd98f00b204e9800998ecf8427e,FRA,,,
1436001,"2013-01-01,10:15:33,MPT,b0af35b31588dc4ab06d5c...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1436002,2013-01-01,18:04:49,MPT,3561,US,ICT,SFO,1.0,2.0,ICT,SFO,2013-08-02,,,SFO,ICT,2013-08-09,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC
1436009,2013-01-01,19:57:57,MPT,28d7a8c95e4db88589d3d35b66920e78,DE,FRA,BGW,1.0,2.0,FRA,BGW,2013-02-26,,,BGW,FRA,2013-04-08,,,,,,,,,,,,,,,,,,,,,1ASI,0.0,0,0,d41d8cd98f00b204e9800998ecf8427e,BNJ,,
20390197,2013-10-13,18:57:54,MTP,e41c9d833aa74600552f2ed688b67d81,AT,VIE,HA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


#### Bookings

In [8]:
data_iterator = pd.read_csv(path_bookings,
                            chunksize = 5*10**5,
                            sep = '^',
                            compression = 'bz2',
                            #nrows = 10**6,
                            usecols = ['dep_port', 
                                       'arr_port', 
                                       'brd_time           ', 
                                       'cre_date           '],
                            low_memory=True, 
                            dtype=str)

bookings = pd.DataFrame()
for i, data_chunk in enumerate(data_iterator):
    #data_chunk = bookings_data_preprocessing(data_chunk)
    #data_chunk.reset_index(inplace=True)
    #data_chunk['Booking'] = 1
    bookings = bookings.append(data_chunk)
    bookings.drop_duplicates(inplace=True)
    print("Chunk: %d, size of chunk %d" %(i+1, bookings.shape[0]))

Chunk: 1, size of chunk 354095
Chunk: 2, size of chunk 708879
Chunk: 3, size of chunk 708879
Chunk: 4, size of chunk 708879
Chunk: 5, size of chunk 708879
Chunk: 6, size of chunk 708879
Chunk: 7, size of chunk 708879
Chunk: 8, size of chunk 708879
Chunk: 9, size of chunk 708879
Chunk: 10, size of chunk 708879
Chunk: 11, size of chunk 708882
Chunk: 12, size of chunk 708882
Chunk: 13, size of chunk 708882
Chunk: 14, size of chunk 708882
Chunk: 15, size of chunk 708882
Chunk: 16, size of chunk 708882
Chunk: 17, size of chunk 708882
Chunk: 18, size of chunk 708882
Chunk: 19, size of chunk 708882
Chunk: 20, size of chunk 708882
Chunk: 21, size of chunk 708882


The `Booking` dataset columns have blank spaces, so let's remove them before save the data.

In [9]:
bookings.columns = bookings.columns.str.strip()
bookings.columns.tolist()

['cre_date', 'dep_port', 'arr_port', 'brd_time']

In [10]:
bookings.to_csv(path_bookings_no_dup, sep='^', index=False)

### Loading the data

#### Searches

In [3]:
def searches_data_preprocessing(df, columns_to_clean):
    '''
    '''
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True, subset=['Date', 'Seg1Departure'])
    
    for column in columns_to_clean:
        df[column] = df[column].str.strip()
        
        if 'Date' not in column:
            df[column] = df[column].str.upper()
            # Para saber si hay lengths superiores a 3 sin que nos de error por los missing values
            cell_lengths = df[column].map(lambda x: len(x) if not isinstance(x, float) else x)
            cell_lengths_notna = cell_lengths[cell_lengths.notna()]
            cell_lengths_sum = (cell_lengths_notna != 3.0).sum()
            
            if cell_lengths_sum != 0:
                print(f'Alert! There are `{column}` values with length different than 3. Dropping them..')
                mask = cell_lengths_notna != 3.0
                wrong_data = cell_lengths_notna[mask]
                print(wrong_data)
                df.drop(index=wrong_data.index, inplace=True)
    return df

In [4]:
columns_to_clean = ['Date', 'Seg1Departure', 'Seg1Arrival', 'Seg1Date', 
                    'Seg2Departure', 'Seg2Arrival', 'Seg2Date', 
                    'Seg3Departure', 'Seg3Arrival', 'Seg3Date', 
                    'Seg4Departure', 'Seg4Arrival', 'Seg4Date', 
                    'Seg5Departure', 'Seg5Arrival', 'Seg5Date', 
                    'Seg6Departure', 'Seg6Arrival', 'Seg6Date']

In [5]:
data_iterator = pd.read_csv(path_searches_no_dup, 
                            chunksize=10**5, 
                            sep='^', 
                            low_memory=False)

searches = pd.DataFrame()
for i, data_chunk in enumerate(data_iterator):
    data_chunk = searches_data_preprocessing(data_chunk, columns_to_clean)
    #data_chunk.reset_index(inplace=True)
    searches = searches.append(data_chunk)
    #searches.drop_duplicates(inplace=True)
    print("Chunk: %d, size of chunk %d" %(i+1, searches.shape[0]))

Chunk: 1, size of chunk 100000
Chunk: 2, size of chunk 200000
Chunk: 3, size of chunk 300000
Chunk: 4, size of chunk 359002


#### Bookings

In [6]:
def bookings_data_preprocessing(df):
    '''
    '''
    df.dropna(inplace=True, subset=['cre_date'])
    df['cre_date'] = df['cre_date'].str.split(',').str[0]
    df.drop_duplicates(inplace=True)
    
    for column in df.columns:
        df[column] = df[column].str.strip()
        
        if 'port' in column:
            df[column] = df[column].str.upper()
            
            # Para saber si hay lengths superiores a 3 sin que nos de error por los missing values
            cell_lengths = df[column].map(lambda x: len(x) if not isinstance(x, float) else x)
            cell_lengths_notna = cell_lengths[cell_lengths.notna()]
            cell_lengths_sum = (cell_lengths_notna != 3.0).sum()
            if cell_lengths_sum != 0:
                print(f'Alert! There are `{column}` values with length different than 3. Dropping them..')
                mask = cell_lengths_notna != 3.0
                wrong_data = cell_lengths_notna[mask]
                print(wrong_data)
                df.drop(index=wrong_data.index, inplace=True)
        else:
            df[column] = df[column].str.split().str[0]
    return df    

In [7]:
data_iterator = pd.read_csv(path_bookings_no_dup, 
                            chunksize = 10**5, 
                            sep = '^', 
                            usecols = ['dep_port', 
                                       'arr_port', 
                                       'brd_time', 
                                       'cre_date'])

bookings = pd.DataFrame()
for i, data_chunk in enumerate(data_iterator):
    data_chunk = bookings_data_preprocessing(data_chunk)
    #data_chunk.reset_index(inplace=True)
    data_chunk['Booking'] = 1
    bookings = bookings.append(data_chunk)
    print("Chunk: %d, size of chunk %d" %(i+1, bookings.shape[0]))

Chunk: 1, size of chunk 100000
Chunk: 2, size of chunk 200000
Chunk: 3, size of chunk 300000
Chunk: 4, size of chunk 400000
Chunk: 5, size of chunk 500000
Chunk: 6, size of chunk 600000
Chunk: 7, size of chunk 700000
Alert! There are `dep_port` values with length different than 3. Dropping them..
708880    2
Name: dep_port, dtype: int64
Chunk: 8, size of chunk 708881


In [8]:
bookings.shape

(708881, 5)

In [9]:
bookings.tail()

Unnamed: 0,cre_date,dep_port,arr_port,brd_time,Booking
708876,2013-05-28,AUS,RDU,2013-07-12,1
708877,2013-05-28,TLS,ORY,2013-06-04,1
708878,2013-05-28,TLS,ORY,2013-06-04,1
708879,2013-03-26,AKL,SVO,2013-04-24,1
708881,2013-03-25,TYO,SIN,2013-04-16,1


### Checking duplicates one more time

In [10]:
bookings.duplicated().sum()

49882

In [11]:
searches.duplicated().sum()

0

In [12]:
bookings.drop_duplicates(inplace=True)
bookings.duplicated().sum()

0

In [13]:
print(bookings.shape)
print(searches.shape)

(658999, 5)
(359002, 45)


### Merge --> left join

To merge both datasets we will build a for loop that will help us to iterate over `new_columns_bookings`. `new_columns_bookings` is a list of lists and each list contains the `searches` column names in addition to `Booking`. In this way we can replace the right list as `booking` column names  in each iteration. Each iteration will create a new column than contains `1` when the right segment match with a booking.

In [14]:
new_columns_bookings = [['Date', 'Seg1Departure', 'Seg1Arrival', 'Seg1Date', 'Booking'], 
                        ['Date', 'Seg2Departure', 'Seg2Arrival', 'Seg2Date', 'Booking'],
                        ['Date', 'Seg3Departure', 'Seg3Arrival', 'Seg3Date', 'Booking'],
                        ['Date', 'Seg4Departure', 'Seg4Arrival', 'Seg4Date', 'Booking'],
                        ['Date', 'Seg5Departure', 'Seg5Arrival', 'Seg5Date', 'Booking'],
                        ['Date', 'Seg6Departure', 'Seg6Arrival', 'Seg6Date', 'Booking'],
                       ]

In [15]:
for i, lista in enumerate(new_columns_bookings):
    bookings.columns = lista

    searches = searches.merge(bookings, 
                              how = 'left', 
                              on = lista[:4], 
                              suffixes=(i, i+1))
    print(searches.shape)

(359002, 46)
(359002, 47)
(359002, 48)
(359002, 49)
(359002, 50)
(359002, 51)


In [16]:
searches.iloc[:, -6:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359002 entries, 0 to 359001
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Booking1  451 non-null    float64
 1   Booking2  181 non-null    float64
 2   Booking3  31 non-null     float64
 3   Booking4  25 non-null     float64
 4   Booking5  3 non-null      float64
 5   Booking6  1 non-null      float64
dtypes: float64(6)
memory usage: 19.2 MB


Now we fill the `NaN` values with `0` in each new column.

In [17]:
searches.iloc[:, -6:] = searches.iloc[:, -6:].fillna(0)

In [18]:
searches.iloc[:, -6:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359002 entries, 0 to 359001
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Booking1  359002 non-null  float64
 1   Booking2  359002 non-null  float64
 2   Booking3  359002 non-null  float64
 3   Booking4  359002 non-null  float64
 4   Booking5  359002 non-null  float64
 5   Booking6  359002 non-null  float64
dtypes: float64(6)
memory usage: 19.2 MB


In [19]:
searches.iloc[:, -6:].sum()

Booking1    451.0
Booking2    181.0
Booking3     31.0
Booking4     25.0
Booking5      3.0
Booking6      1.0
dtype: float64

In [20]:
(searches['Booking2']!=0).sum()

181

In [21]:
searches[searches['Booking2']!=0]

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,Booking1,Booking2,Booking3,Booking4,Booking5,Booking6
1514,2013-01-02,09:54:39,FQP,4014b98b900d9d8620abcfce2028e66d,NO,BGO,BOO,1.0,6.0,BGO,OSL,2013-01-09,DU,L,OSL,TRD,2013-01-09,DU,L,TRD,BOO,2013-01-09,HP,L,BOO,TRD,2013-01-11,HP,L,TRD,OSL,2013-01-11,DU,L,OSL,BGO,2013-01-11,DU,L,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,OSL,0.0,1.0,0.0,0.0,0.0,0.0
1644,2013-01-02,12:51:33,CAL,cb028e2166a95168fee7d483a463ae54,DE,HAM,FRA,1.0,2.0,HAM,FRA,2013-01-14,7U,,FRA,HAM,2013-01-14,7U,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,FRA,0.0,1.0,0.0,0.0,0.0,0.0
3112,2013-01-04,15:33:10,FXA,be00295076af5aa39c27a0af17b58b26,DE,TXL,SFO,1.0,6.0,TXL,LHR,2013-01-24,LK,J,LHR,JFK,2013-01-24,LK,F,JFK,SFO,2013-01-24,LK,F,SFO,LAX,2013-01-26,LK,F,LAX,LHR,2013-01-26,LK,F,LHR,TXL,2013-01-27,LK,J,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,CGN,0.0,1.0,0.0,0.0,0.0,0.0
3651,2013-01-04,13:02:12,MPT,1ee479f5d8e435942d95363363634aee,DE,CUR,AUA,1.0,2.0,CUR,AUA,2013-02-01,,,AUA,CUR,2013-02-08,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,FRA,0.0,1.0,0.0,0.0,0.0,0.0
6052,2013-01-07,07:39:25,FXX,86e879ef1d918ab776004f7413244749,FR,TLS,PBM,1.0,4.0,TLS,AMS,2013-02-12,UV,J,AMS,PBM,2013-02-12,UV,J,PBM,AMS,2013-02-21,UV,J,AMS,TLS,2013-02-22,UV,J,,,,,,,,,,,1ASI,0,0,0,ebf7110aa4e3c97dc3938c94de548a62,PAR,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329538,2013-11-26,07:13:57,MPT,715773658985f59706880801440e0678,FI,OUL,HEL,1.0,2.0,OUL,HEL,2013-11-27,JJ,,HEL,OUL,2013-11-28,JJ,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,0,0.0,1.0,0.0,0.0,0.0,0.0
330106,2013-11-27,05:46:03,FXX,73efca0beec954cfa548c5738b982143,PH,MNL,KUL,0.0,2.0,MNL,KUL,2013-12-04,WR,J,KUL,JED,2013-12-04,WR,J,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,9cf7ac47ff8b51b12624e245bfef0c62,0,0.0,1.0,0.0,0.0,0.0,0.0
335282,2013-12-02,16:57:36,FXP,40ae285c1f81620812b0b4a511fb6ea1,GB,LHR,JFK,1.0,2.0,LHR,JFK,2013-12-11,LK,F,JFK,LHR,2013-12-15,LK,F,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,46d6e2e1d23eba0c711965486ce35a6d,0,0.0,1.0,0.0,0.0,0.0,0.0
353796,2013-12-20,08:23:12,MPT,236cbf458f79dbb9d5e9c9430438c3db,US,ORD,SFO,1.0,2.0,ORD,SFO,2014-02-14,FD,,SFO,ORD,2014-02-17,FD,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,0,0.0,1.0,0.0,0.0,0.0,0.0


In [22]:
searches[(searches['Booking2']!=0) & (searches['Booking1']!=0)]

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,Booking1,Booking2,Booking3,Booking4,Booking5,Booking6
6954,2013-01-07,09:31:03,FXB,e7daf073dd67b8513cc1e36ac28eee5f,FR,ORY,NCE,1.0,2.0,ORY,NCE,2013-01-10,KP,J,NCE,ORY,2013-01-10,KP,J,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,8f0d16f6cbd27d0a62f1e124ebd2c2e6,PAR,1.0,1.0,0.0,0.0,0.0,0.0
15607,2013-01-16,11:40:06,MPT,50c54b52322080d2c257390f714f91bd,ES,MAD,BCN,1.0,2.0,MAD,BCN,2013-01-17,,,BCN,MAD,2013-01-24,,,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,MAD,1.0,1.0,0.0,0.0,0.0,0.0
105548,2013-04-16,13:37:31,FXA,321427684618b693b937f8c271e0cf0a,SE,CPH,DUS,1.0,2.0,CPH,DUS,2013-04-24,DU,C,DUS,CPH,2013-04-25,DU,C,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,6b543f9251e6bfc907e2b5fe027e0995,MMA,1.0,1.0,0.0,0.0,0.0,0.0
287092,2013-10-15,14:50:08,FXR,dafa0f79e75d46cb574ee4b4d9138e51,IN,DEL,BOM,1.0,2.0,DEL,BOM,2013-10-15,KS,C,BOM,DEL,2013-10-18,KS,C,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,6f8e64b103c9f4583918a0ce31a115ac,0,1.0,1.0,0.0,0.0,0.0,0.0


As you are probably assuming, it can't be possible a search has a `1` in `Booking2` and a `0` in `Booking1`. The same can be applied for the other columns. If `Booking1` is `0`, the other columns have to be `0`.

As each new column represent a segment, it is not possible that a search has a `0` in `Booking1` (which represent `Seg1`) and a `1` in `Booking2` (which represent `Seg2`). If you have a round trip (`RoundTrip` column), you hope to have the outbound flight too. Having a `1` in `Booking2` and a `0` in `Booking1` would mean that you have the return flight, but not the outbound one.

We did our best with the data we have been provided, so let's remove them.

In [23]:
searches.columns[-6:].tolist()

['Booking1', 'Booking2', 'Booking3', 'Booking4', 'Booking5', 'Booking6']

In [24]:
condition_1 = searches['Booking1'] == 1
condition_2 = ((searches[['Booking1', 'Booking2']] == 1).all(axis=1)).sum()
condition_2

4

In [25]:
condition_1 = (searches['Booking1'] == 1).sum() > 0
condition_2 = ((searches[['Booking1', 'Booking2']] == 1).all(axis=1)).sum() > 0
condition_3 = ((searches[['Booking1', 'Booking2', 'Booking3']] == 1).all(axis=1)).sum() > 0
condition_4 = ((searches[['Booking1', 'Booking2', 'Booking3', 'Booking4']] == 1).all(axis=1)).sum() > 0
condition_5 = ((searches[['Booking1', 'Booking2', 'Booking3', 'Booking4', 'Booking5']] == 1).all(axis=1)).sum() > 0
condition_6 = ((searches[['Booking1', 'Booking2', 'Booking3', 'Booking4', 'Booking5', 'Booking6']] == 1).all(axis=1)).sum() > 0
print(condition_1)
print(condition_2)
print(condition_3)
print(condition_4)
print(condition_5)
print(condition_6)

True
True
False
False
False
False


In [26]:
b2 = searches[(searches[['Booking1', 'Booking2']] == 1).all(axis=1)]
b2

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,Booking1,Booking2,Booking3,Booking4,Booking5,Booking6
6954,2013-01-07,09:31:03,FXB,e7daf073dd67b8513cc1e36ac28eee5f,FR,ORY,NCE,1.0,2.0,ORY,NCE,2013-01-10,KP,J,NCE,ORY,2013-01-10,KP,J,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,8f0d16f6cbd27d0a62f1e124ebd2c2e6,PAR,1.0,1.0,0.0,0.0,0.0,0.0
15607,2013-01-16,11:40:06,MPT,50c54b52322080d2c257390f714f91bd,ES,MAD,BCN,1.0,2.0,MAD,BCN,2013-01-17,,,BCN,MAD,2013-01-24,,,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,MAD,1.0,1.0,0.0,0.0,0.0,0.0
105548,2013-04-16,13:37:31,FXA,321427684618b693b937f8c271e0cf0a,SE,CPH,DUS,1.0,2.0,CPH,DUS,2013-04-24,DU,C,DUS,CPH,2013-04-25,DU,C,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,6b543f9251e6bfc907e2b5fe027e0995,MMA,1.0,1.0,0.0,0.0,0.0,0.0
287092,2013-10-15,14:50:08,FXR,dafa0f79e75d46cb574ee4b4d9138e51,IN,DEL,BOM,1.0,2.0,DEL,BOM,2013-10-15,KS,C,BOM,DEL,2013-10-18,KS,C,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,6f8e64b103c9f4583918a0ce31a115ac,0,1.0,1.0,0.0,0.0,0.0,0.0


In [27]:
searches['Number_of_flights'] = searches['Booking1'] # 1 or 0

for index in b2.index:
    searches.loc[index, 'Number_of_flights'] += searches.loc[index, 'Booking2']

In [28]:
searches['Number_of_flights'].sum()

455.0

In [29]:
searches[searches['Number_of_flights'] > 1.0]

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,Booking1,Booking2,Booking3,Booking4,Booking5,Booking6,Number_of_flights
6954,2013-01-07,09:31:03,FXB,e7daf073dd67b8513cc1e36ac28eee5f,FR,ORY,NCE,1.0,2.0,ORY,NCE,2013-01-10,KP,J,NCE,ORY,2013-01-10,KP,J,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,8f0d16f6cbd27d0a62f1e124ebd2c2e6,PAR,1.0,1.0,0.0,0.0,0.0,0.0,2.0
15607,2013-01-16,11:40:06,MPT,50c54b52322080d2c257390f714f91bd,ES,MAD,BCN,1.0,2.0,MAD,BCN,2013-01-17,,,BCN,MAD,2013-01-24,,,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,MAD,1.0,1.0,0.0,0.0,0.0,0.0,2.0
105548,2013-04-16,13:37:31,FXA,321427684618b693b937f8c271e0cf0a,SE,CPH,DUS,1.0,2.0,CPH,DUS,2013-04-24,DU,C,DUS,CPH,2013-04-25,DU,C,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,6b543f9251e6bfc907e2b5fe027e0995,MMA,1.0,1.0,0.0,0.0,0.0,0.0,2.0
287092,2013-10-15,14:50:08,FXR,dafa0f79e75d46cb574ee4b4d9138e51,IN,DEL,BOM,1.0,2.0,DEL,BOM,2013-10-15,KS,C,BOM,DEL,2013-10-18,KS,C,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,6f8e64b103c9f4583918a0ce31a115ac,0,1.0,1.0,0.0,0.0,0.0,0.0,2.0


In [30]:
searches.columns[-7:-1]

Index(['Booking1', 'Booking2', 'Booking3', 'Booking4', 'Booking5', 'Booking6'], dtype='object')

In [31]:
print(searches.shape)
searches.drop(columns=searches.columns[-7:-1], inplace=True)
print(searches.shape)

(359002, 52)
(359002, 46)


In [32]:
(searches['Number_of_flights'] > 1).sum()

4

In [33]:
(searches['Number_of_flights'] > 0).sum()

451

In [34]:
searches.loc[searches['Number_of_flights'] > 0, 'Booking'] = 1

In [35]:
searches['Booking'].sum()

451.0

De las 359.002 searches, 451 terminaron en booking. De esas 451, solo 4 tienen 2 vuelos. Ninguna tiene mas de dos vuelos.

In [36]:
searches['Booking'] = searches['Booking'].fillna(0)

In [37]:
searches

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,Number_of_flights,Booking
0,2013-01-01,20:25:57,MPT,624d8c3ac0b3a7ca03e3c167e0f48327,DE,TXL,AUH,1.0,2.0,TXL,AUH,2013-01-26,D2,,AUH,TXL,2013-02-02,D2,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,FRA,0.0,0.0
1,2013-01-01,10:15:33,MPT,b0af35b31588dc4ab06d5cf2986e8e02,MD,ATH,MIL,0.0,1.0,ATH,MIL,2013-01-04,,,,,,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,KIV,0.0,0.0
2,2013-01-01,18:04:49,MPT,3561a60621de06ab1badc8ca55699ef3,US,ICT,SFO,1.0,2.0,ICT,SFO,2013-08-02,,,SFO,ICT,2013-08-09,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC,0.0,0.0
3,2013-01-01,17:42:40,FXP,1864e5e8013d9414150e91d26b6a558b,SE,RNB,ARN,0.0,1.0,RNB,ARN,2013-01-02,DU,W,,,,,,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,STO,0.0,0.0
4,2013-01-01,17:48:29,MPT,1ec336348f44207d2e0027dc3a68c118,NO,OSL,MAD,1.0,2.0,OSL,MAD,2013-03-22,,,MAD,OSL,2013-03-31,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,OSL,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358997,2013-12-25,03:32:53,FFP,8837afb0e639c78916107ffa3bc40984,GB,DME,BKK,0.0,1.0,DME,BKK,2013-12-28,KM,,,,,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,0,0.0,0.0
358998,2013-12-25,20:54:39,FXA,7f8c2d9dfe430c2c6b19298dd7e0ff96,US,LAX,TLV,1.0,5.0,LAX,AMS,2014-02-02,UV,J,AMS,TLV,2014-02-03,UV,J,TLV,AMS,2014-02-17,UV,J,AMS,ATL,2014-02-17,UV,J,ATL,LAX,2014-02-17,UV,J,,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,0,0.0,0.0
358999,2013-01-01,20:25:57,MPT,624d8c3ac0b3a7ca03e3c167e0f48327,DE,TXL,AUH,1.0,2.0,TXL,AUH,2013-01-26,D2,,AUH,TXL,2013-02-02,D2,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,FRA,,,,0.0,0.0
359000,2013-01-01,18:04:49,MPT,3561,US,ICT,SFO,1.0,2.0,ICT,SFO,2013-08-02,,,SFO,ICT,2013-08-09,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC,0.0,0.0


In [38]:
searches.to_csv('../challenge/searches_booking.csv', sep='^', index=False)