In [28]:
cd ".."

c:\Users\offic\Desktop


In [42]:
import json
import pandas as pd
import re
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [31]:
from Utilities.Scripts.Functions import *

In [32]:
with open("Configurations.yaml","r") as configurations:
    _Configurations = json.load(configurations)

## Loading Dataset

In [33]:
df = pd.read_csv(_Configurations["Dataset"]["Processed"]["SampleDataset"])

In [34]:
invalidTransactions = df["TRANSACTION_KEY"][(df["DEPARTURE_DATE"]=="'\\N'") & (df["MARKETING_AIRLINE_CD"]!="'V'")].unique()
df = df[~df["TRANSACTION_KEY"].isin(invalidTransactions)]

In [8]:
# Let's have a look if a transaction key has more than one-trip type

for transactionKey, frame in df.groupby(by=["TRANSACTION_KEY"]):
    if len(frame["TRIP_TYPE"].unique()) > 1:
        print(transactionKey)

In [13]:
# let's save the properties of trip-type for transaction-keys - Do some work here

transactionKeyTripTypeDf = df[["TRANSACTION_KEY","TRANSACTION_TYPE", "TICKETING_AIRLINE","TICKETING_AIRLINE_CD","AGENCY","ISSUE_DATE","COUNTRY",
                            "TRIP_TYPE", "CABIN"]]
transactionKeyTripTypeDf.drop_duplicates(keep='first', inplace=True)
transactionKeyTripTypeDf.to_csv("Utilities/Dataset/Processed/SampleDatasetTransactionKeyRelations.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactionKeyTripTypeDf.drop_duplicates(keep='first', inplace=True)


## Implementing Algorithm

We convert
- One Way Trip with Multiple Segments: A Single One-Way trip
- Return Trip into two One-Way Trips
- Complex Trip into Multiple One-Way Trips

Heurisitics:
- Maximum Difference between two possible segments: 1 Day
- A entry is considered a part of an ongoing trip, if and only if it does not visit the a previously visited node
- The Date of diparture for a one-way trip would be based on the lowest of all departure dates in that trip

In [227]:
lst = ["'T-1809239577600177000'", "'T-1808639477801778934'", "'T-1808639477801469119'", "'T-1808639477801890663'", "'T-1808639477801469119'", "'T-1833843851800015549'", "'T-1808639477801886872'", "'T-1833843851300180290'"]
tempDf = df[["TRANSACTION_KEY", "TRANSACTION_TYPE", "TRIP_TYPE","SEG_NUMBER","ORIGIN_AIRPORT","DESTINATION_AIRPORT","DEPARTURE_DATE"]][df["TRANSACTION_KEY"].isin(lst)]
tempDf

Unnamed: 0,TRANSACTION_KEY,TRANSACTION_TYPE,TRIP_TYPE,SEG_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_DATE
231,'T-1808639477801469119','I','XX',1,'YEG','YVR','2018-04-04'
232,'T-1808639477801469119','I','XX',2,'YVR','YXS','2018-04-04'
233,'T-1808639477801469119','I','XX',3,'YXS','YVR','2018-04-05'
234,'T-1808639477801469119','I','XX',4,'YVR','YLW','2018-04-05'
235,'T-1808639477801469119','I','XX',5,'YLW','YEG','2018-04-08'
1072,'T-1808639477801778934','I','XX',1,'HRB','NKG','2018-03-29'
1073,'T-1808639477801778934','I','XX',2,'NKG','LJG','2018-03-30'
1485,'T-1808639477801886872','I','XX',1,'POA','GRU','2018-05-12'
1486,'T-1808639477801886872','I','XX',2,'GRU','MAD','2018-05-12'
1487,'T-1808639477801886872','I','XX',3,'MAD','BCN','2018-05-13'


In [35]:
def convertStringToDateTime(string):
    match = re.search("\d\d\d\d-\d\d-\d\d",string)
    return datetime.strptime(match.group(),'%Y-%m-%d') 
    #(x-y).days
    
def DateDifference(stringDate1, stringDate2):
    date1 = convertStringToDateTime(stringDate1)
    date2 = convertStringToDateTime(stringDate2)
    return abs((date1-date2).days)

In [36]:
def OneWayTripMergeSegmentHelper(transactionKey, frame):
    _dfDict = {"TRANSACTION_KEY":[], "TRIP_TYPE":[], "NUMBER_OF_SEGMENTS":[], "ORIGIN_AIRPORT":[], "DESTINATION_AIRPORT":[], "DEPARTURE_DATE":[]}
   
    _dfDict["TRANSACTION_KEY"].append(transactionKey)
    _dfDict["TRIP_TYPE"] = "'OW'"
    frame = frame.sort_values(by="SEG_NUMBER")

        
    firstRow = frame.iloc[0]
    lastRow = frame.iloc[-1]
    _dfDict["NUMBER_OF_SEGMENTS"] = lastRow["SEG_NUMBER"]
    _dfDict["ORIGIN_AIRPORT"] = firstRow["ORIGIN_AIRPORT"]
    _dfDict["DESTINATION_AIRPORT"] = lastRow["DESTINATION_AIRPORT"]
    _dfDict["DEPARTURE_DATE"] = firstRow["DEPARTURE_DATE"]

    return pd.DataFrame(_dfDict)

In [37]:
def ReturnTripMergeSegmentHelper(transactionKey, frame, originDepartureDate = None):
    _dfDict = {"TRANSACTION_KEY":[], "TRIP_TYPE":[], "NUMBER_OF_SEGMENTS":[], "ORIGIN_AIRPORT":[], "DESTINATION_AIRPORT":[], "DEPARTURE_DATE":[]}

    if frame.shape[0] == 0:
        return pd.DataFrame(_dfDict) # If the frame is empty

    frame.reset_index(drop=True)
    frame = frame.sort_values(by="SEG_NUMBER")
    firstRow = frame.iloc[0]

    originSegmentNumber = firstRow["SEG_NUMBER"]
    lastSegmentNumber = originSegmentNumber

    originAirport = firstRow["ORIGIN_AIRPORT"]

    destinationAirport = firstRow["DESTINATION_AIRPORT"]

    if originDepartureDate is None:
        originDepartureDate = firstRow["DEPARTURE_DATE"]
    previousDate = originDepartureDate
    
    visitedAirport = set([])
    visitedAirport.add(originAirport)
    visitedAirport.add(destinationAirport)

    locationalIndex = 1
    
    for index, row in frame[1:].iterrows():

        if row["ORIGIN_AIRPORT"] != destinationAirport:
            _dfDict["TRANSACTION_KEY"].append(transactionKey)
            _dfDict["TRIP_TYPE"].append("'OW'")
            _dfDict["NUMBER_OF_SEGMENTS"].append(lastSegmentNumber - originSegmentNumber + np.int64(1))
            _dfDict["ORIGIN_AIRPORT"].append(originAirport)
            _dfDict["DESTINATION_AIRPORT"].append(destinationAirport)
            _dfDict["DEPARTURE_DATE"].append(originDepartureDate)
            return pd.concat([pd.DataFrame(_dfDict),ReturnTripMergeSegmentHelper(transactionKey,frame[locationalIndex:])],axis=0) # Leveraging Recursion

        else:
            try: # Valid-Departure Date
                currentDate = row["DEPARTURE_DATE"]

                if DateDifference(currentDate,previousDate)<2 and row["DESTINATION_AIRPORT"] not in visitedAirport:
                    lastSegmentNumber = row["SEG_NUMBER"]
                    destinationAirport = row["DESTINATION_AIRPORT"]
                    previousDate = row["DEPARTURE_DATE"]
                    visitedAirport.add(destinationAirport)
                    locationalIndex+=1

                else:
                    _dfDict["TRANSACTION_KEY"].append(transactionKey)
                    _dfDict["TRIP_TYPE"].append("'OW'")
                    _dfDict["NUMBER_OF_SEGMENTS"].append(lastSegmentNumber - originSegmentNumber + np.int64(1))
                    _dfDict["ORIGIN_AIRPORT"].append(originAirport)
                    _dfDict["DESTINATION_AIRPORT"].append(destinationAirport)
                    _dfDict["DEPARTURE_DATE"].append(originDepartureDate)
                    return pd.concat([pd.DataFrame(_dfDict),ReturnTripMergeSegmentHelper(transactionKey,frame[locationalIndex:])],axis=0) # Leveraging Recursion

            except: # Non Flight Segment
                if row["DESTINATION_AIRPORT"] not in visitedAirport:
                    lastSegmentNumber = row["SEG_NUMBER"]
                    destinationAirport = row["DESTINATION_AIRPORT"]
                    visitedAirport.add(destinationAirport)
                    locationalIndex+=1

                else:
                    _dfDict["TRANSACTION_KEY"].append(transactionKey)
                    _dfDict["TRIP_TYPE"].append("'OW'")
                    _dfDict["NUMBER_OF_SEGMENTS"].append(lastSegmentNumber - originSegmentNumber + np.int64(1))
                    _dfDict["ORIGIN_AIRPORT"].append(originAirport)
                    _dfDict["DESTINATION_AIRPORT"].append(destinationAirport)
                    _dfDict["DEPARTURE_DATE"].append(originDepartureDate)
                    return pd.concat([pd.DataFrame(_dfDict),ReturnTripMergeSegmentHelper(transactionKey,frame[locationalIndex:], previousDate)],axis=0) # Leveraging Recursion

    _dfDict["TRANSACTION_KEY"].append(transactionKey)
    _dfDict["TRIP_TYPE"].append("'OW'")
    _dfDict["NUMBER_OF_SEGMENTS"].append(lastSegmentNumber - originSegmentNumber + np.int64(1))
    _dfDict["ORIGIN_AIRPORT"].append(originAirport)
    _dfDict["DESTINATION_AIRPORT"].append(destinationAirport)
    _dfDict["DEPARTURE_DATE"].append(originDepartureDate)
    return pd.DataFrame(_dfDict)
                

In [38]:
def MergeSegment(df):
    dataFrame = pd.DataFrame({"TRANSACTION_KEY":[], "TRIP_TYPE":[], "NUMBER_OF_SEGMENTS":[], "ORIGIN_AIRPORT":[], "DESTINATION_AIRPORT":[], "DEPARTURE_DATE":[]}) # The New Dataframe
    group = df.groupby(by=["TRANSACTION_KEY","TRIP_TYPE"])
    noOfGroups = len(group)
    print("Number of Groups : " , noOfGroups)
    currentNumber = 1
    for node, frame in tqdm(group):
        tripType = node[1]
        if tripType == "'OW'": # Processing One-Way Trip
            dataFrame = pd.concat([dataFrame,OneWayTripMergeSegmentHelper(node[0], frame)],axis=0)
        elif tripType == "'RT'": # Processing One Way Trip
            dataFrame = pd.concat([dataFrame,ReturnTripMergeSegmentHelper(node[0],frame)],axis=0)
        else: # Processing Complex Trip
            dataFrame = pd.concat([dataFrame,ReturnTripMergeSegmentHelper(node[0],frame)],axis=0)
        currentNumber+=1

    return dataFrame.reset_index(drop=True)


In [39]:
c = 5000
tempdf = df[["TRANSACTION_KEY", "TRIP_TYPE","SEG_NUMBER","ORIGIN_AIRPORT","DESTINATION_AIRPORT","DEPARTURE_DATE"]][0:3000]

for i in range(100):
    tempdf = pd.concat([tempdf,df[["TRANSACTION_KEY", "TRIP_TYPE","SEG_NUMBER","ORIGIN_AIRPORT","DESTINATION_AIRPORT","DEPARTURE_DATE"]][c*i:(c*i+3000)]],axis=0)

tempdf

Unnamed: 0,TRANSACTION_KEY,TRIP_TYPE,SEG_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_DATE
0,'T-1808639477801385302','OW',1,'CGO','SYX','2018-04-14'
1,'T-1808639477801385684','OW',1,'CTU','CGO','2018-03-31'
2,'T-1808639477801386920','OW',1,'BHY','CKG','2018-05-04'
3,'T-1808639477801386961','OW',1,'KWE','FOC','2018-03-30'
4,'T-1808639477801387207','OW',1,'NNG','CTU','2018-04-03'
...,...,...,...,...,...,...
498839,'T-1820941607600421260','OW',1,'CGO','SHA','2018-07-30'
498840,'T-1820941607600421683','OW',1,'PVG','SHE','2018-07-29'
498841,'T-1820941607600421908','OW',1,'PVG','SZX','2018-08-01'
498842,'T-1820941607600422797','OW',1,'WUX','SZX','2018-08-07'


In [44]:
tempdf = MergeSegment(tempdf)

Number of Groups :  162709


  0%|          | 0/162709 [00:00<?, ?it/s]

### SAVING BOOKING DATASET

In [46]:
tempdf.to_csv(r"C:\Users\offic\Desktop\Big Data Analysis and Transaction Predictive Modelling - ARC Transaction Dataset\Dataset\Processed\SampleDataset\ProjectWorkSampleBookingDataset.csv",index=False)

### CREATING A TRAVEL DESTINATIONS DATASET

Here, we don't consider the back trip to the origin, only the destinations travelled. We make this assumptions:
- A person may not consider home origin as travelling destination

In [49]:
# Replace all One-Way trip types by actual Trip Type, and remove the 
tempdf = tempdf.drop(['TRIP_TYPE'],axis=1)

In [55]:
tempdf

Unnamed: 0,TRANSACTION_KEY,NUMBER_OF_SEGMENTS,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_DATE
0,'T-1800537914200635964',1.0,'ORD','LAS','2018-01-13'
1,'T-1800537914200635964',2.0,'LAS','DCA','2018-01-15'
2,'T-1800537914200636207',1.0,'DFW','SAT','2018-01-20'
3,'T-1800537914200636207',1.0,'SAT','DFW','2018-01-20'
4,'T-1800537914200636760',1.0,'MIA','BDA','2018-01-08'
...,...,...,...,...,...
235821,'T-2021159117601533030',1.0,'GIG','CGH','2018-12-10'
235822,'T-2021159117601533146',2.0,'GIG','NVT','2019-04-18'
235823,'T-2021159117601533146',1.0,'NVT','GIG','2019-04-25'
235824,'T-2021159117601533278',1.0,'SDU','VIX','2018-11-09'


In [53]:
transactionKeyRelation_df = pd.read_csv(r"Utilities\Dataset\Processed\SampleDatasetTransactionKeyRelations.csv")
transactionKeyRelation_df

Unnamed: 0,TRANSACTION_KEY,TRANSACTION_TYPE,TICKETING_AIRLINE,TICKETING_AIRLINE_CD,AGENCY,ISSUE_DATE,COUNTRY,TRIP_TYPE,CABIN
0,'T-1808639477801385302','I','Chengdu Airlines ','811','','2018-03-27','CN','OW','Prem'
1,'T-1808639477801385684','I','Chengdu Airlines ','811','','2018-03-27','CN','OW','Econ'
2,'T-1808639477801386920','I','Chengdu Airlines ','811','','2018-03-26','CN','OW','Econ'
3,'T-1808639477801386961','I','Chengdu Airlines ','811','','2018-03-27','CN','OW','Econ'
4,'T-1808639477801387207','I','Chengdu Airlines ','811','','2018-03-27','CN','OW','Econ'
...,...,...,...,...,...,...,...,...,...
1048570,'T-1821041627900238256','I','American Airlines Inc. ','001','','2018-07-30','US','OW','Econ'
1048571,'T-1821041627900238592','I','American Airlines Inc. ','001','','2018-07-30','MX','OW','Econ'
1048572,'T-1821041627900238693','E','American Airlines Inc. ','001','','2018-07-30','US','OW','Econ'
1048573,'T-1821041627900239159','I','American Airlines Inc. ','001','','2018-07-30','US','OW','Econ'


In [56]:
tempdf = tempdf.merge(transactionKeyRelation_df,on="TRANSACTION_KEY",how="left")
tempdf.to_csv(r"C:\Users\offic\Desktop\Big Data Analysis and Transaction Predictive Modelling - ARC Transaction Dataset\Dataset\Processed\SampleDataset\ProjectWorkSampleBookingDataset.csv",index=False)

In [65]:
def OneWayTripGetHolidayDatasetHelper(transactionKey, tripType, frame):
    _dfDict = {"TRANSACTION_KEY":[], "TRIP_TYPE":[], "NUMBER_OF_SEGMENTS":[], "ORIGIN_AIRPORT":[], "DESTINATION_AIRPORT":[], "DEPARTURE_DATE":[]}
    row = frame.iloc[0]

    if row["ORIGIN_AIRPORT"]!= row["DESTINATION_AIRPORT"]:
        _dfDict["TRANSACTION_KEY"].append(transactionKey)
        _dfDict["TRIP_TYPE"].append(tripType)
        _dfDict["NUMBER_OF_SEGMENTS"].append(row["NUMBER_OF_SEGMENTS"])
        _dfDict["ORIGIN_AIRPORT"].append(row["ORIGIN_AIRPORT"])
        _dfDict["DESTINATION_AIRPORT"].append(row["DESTINATION_AIRPORT"])
        _dfDict["DEPARTURE_DATE"].append(row["DEPARTURE_DATE"])
        
    return pd.DataFrame(_dfDict)

In [66]:
def ReturnAndComplexTripGetHolidayDatasetHelper(transactionKey, tripType, frame):
    _dfDict = {"TRANSACTION_KEY":[], "TRIP_TYPE":[], "NUMBER_OF_SEGMENTS":[], "ORIGIN_AIRPORT":[], "DESTINATION_AIRPORT":[], "DEPARTURE_DATE":[]}
    firstRow = frame.iloc[0]
    originAirport = firstRow["ORIGIN_AIRPORT"]

    for index,row in frame.iterrows():
        if row["DESTINATION_AIRPORT"]!= originAirport:
            _dfDict["TRANSACTION_KEY"].append(transactionKey)
            _dfDict["TRIP_TYPE"].append(tripType)
            _dfDict["NUMBER_OF_SEGMENTS"].append(row["NUMBER_OF_SEGMENTS"])
            _dfDict["ORIGIN_AIRPORT"].append(row["ORIGIN_AIRPORT"])
            _dfDict["DESTINATION_AIRPORT"].append(row["DESTINATION_AIRPORT"])
            _dfDict["DEPARTURE_DATE"].append(row["DEPARTURE_DATE"])
            
    return pd.DataFrame(_dfDict)

In [67]:
def GetHolidayDataset(df):
    dataFrame = pd.DataFrame({"TRANSACTION_KEY":[], "TRIP_TYPE":[], "NUMBER_OF_SEGMENTS":[], "ORIGIN_AIRPORT":[], "DESTINATION_AIRPORT":[], "DEPARTURE_DATE":[]})
    for node,frame in tqdm(df.groupby(by=["TRANSACTION_KEY","TRIP_TYPE"])):
        if node[1] == "'OW'":
            dataFrame = pd.concat([dataFrame,OneWayTripGetHolidayDatasetHelper(node[0],node[1],frame)],axis=0)
        else:
            dataFrame = pd.concat([dataFrame,ReturnAndComplexTripGetHolidayDatasetHelper(node[0],node[1],frame)],axis=0)

    return dataFrame.reset_index(drop=True)

In [68]:
holidayDataset = GetHolidayDataset(tempdf[["TRANSACTION_KEY","TRIP_TYPE","NUMBER_OF_SEGMENTS","ORIGIN_AIRPORT","DESTINATION_AIRPORT", "DEPARTURE_DATE"]])
holidayDataset

  0%|          | 0/162709 [00:00<?, ?it/s]

Unnamed: 0,TRANSACTION_KEY,TRIP_TYPE,NUMBER_OF_SEGMENTS,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DEPARTURE_DATE
0,'T-1800537914200635964','RT',1.0,'ORD','LAS','2018-01-13'
1,'T-1800537914200635964','RT',2.0,'LAS','DCA','2018-01-15'
2,'T-1800537914200636207','RT',1.0,'DFW','SAT','2018-01-20'
3,'T-1800537914200636760','XX',1.0,'MIA','BDA','2018-01-08'
4,'T-1800537914200637208','OW',1.0,'AMS','PHL','2018-01-06'
...,...,...,...,...,...,...
168364,'T-2021159117601532917','RT',1.0,'REC','FOR','2018-11-07'
168365,'T-2021159117601533030','OW',1.0,'GIG','CGH','2018-12-10'
168366,'T-2021159117601533146','RT',2.0,'GIG','NVT','2019-04-18'
168367,'T-2021159117601533278','OW',1.0,'SDU','VIX','2018-11-09'


In [70]:
holidayDataset.to_csv("Dataset\Processed\SampleDataset\ProjectWorkSampleHolidayDataset",index=False)