## Travel Dataset Generator

This code generates a dataset of travel operations, _e.g._, plane tickets and lodging.   
This notebook only illustrates the steps used to generate the dataset; for extensive generation, use the Python code inside.   

In [1]:
# Import packages
import names
import tqdm

import random
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta as td

---
### Definitions

Predefine variables.

In [2]:
#- Companies and Users
# Gender labels a generated user can receive.
defGenders = ['male', 'female', 'none']
# Inclusive bounds for the random age of a user.
defAgesInterval = {'min': 23, 'max': 50}
# Inclusive bounds for the random number of flights (travels) of a user.
defFlightsInterval = {'min': 0, 'max': 3}
# Companies and how many users must be generated for each one.
defCompanies = {
    'HHD': {
        'usersCount': 2,
    },
    '4You': {
        'usersCount': 3,
    },
}

#- Flight Agencies
# Flight types; 'price' is the weight applied to the flight price range.
defFlightTypes = {
    'economic': {
        'price': 1.0,
    },
    'premium': {
        'price': 1.5,
    },
}
defAgenciesName = ['FlyingDrops', 'Rainbow', 'CloudFy']
# Filled by the agencies generator: agency name -> agency data.
defAgencies = {}

#- Places
defPlacesName = ['Sao Paulo (SP)', 'Rio de Janeiro (RJ)', 'Santa Catarina (SC)']
# One independent (initially empty) mapping of destinations per place.
defPlaces = {placeName: {} for placeName in defPlacesName}
# Bounds for the random distance generated between two places.
defDistancesInterval = {'min': 200.0, 'max': 850.0}
# Speed used to turn a distance into a travel time (distance / speed).
defPlaceTravelKmPerHour = 400.0

#- Lodge (Accommodation)
# Inclusive bounds for the number of lodges generated per place.
defLodgesInterval = {'min': 1, 'max': 3}
# Bounds for the random price of a lodge.
defLodgesPrices = {'min': 60.0, 'max': 200.0}
# Prefix placed before the lodge code to build the lodge name.
defLodgesPrex = 'Hotel'
# One independent (initially empty) list of lodges per place.
defLodges = {placeName: [] for placeName in defPlacesName}

#- Travel
# Container for travels (not populated by the cells visible in this notebook).
defTravels = []
# Inclusive bounds, in days, between the outbound and the return flight.
defTravelsDays = {'min': 1, 'max': 3}
# First flight price range starts at 'init' and is 'interval' wide.
defTravelsFlightPrices = {'init': 300.0, 'interval': 100.0}
# Probability (0-1) that a travel also books a lodge.
defTravelWithLodge = 0.3
# 'init': date of each user's first travel; 'interval': day range used when
# scheduling the following travels.
defTravelDate = {
    'init': dt.now(),
    'interval': {'min': 10, 'max': 60},
}

---
### Companies and Users - Generator

In [3]:
#- Functions
def funcUserGenerator(genders, agesInterval, flightsInterval, code):
    '''
    Generate random user, based on predefinitions.
    - genders: list of gender labels to draw from
    - agesInterval {min, max}: inclusive bounds for the user age
    - flightsInterval {min, max}: inclusive bounds for the number of flights
    - code: user ID
    Returns a dict with the keys 'code', 'gender', 'name', 'age' and 'flights'.
    '''
    user = dict()
    user['code'] = code
    user['gender'] = random.choice(genders)
    # The name generator only understands 'male' and 'female'. For any other
    # label (e.g. 'none') pass None, its default, so it picks a gender itself;
    # a placeholder such as False is not a supported value.
    nameGender = user['gender'] if user['gender'] in ('male', 'female') else None
    user['name'] = names.get_full_name(gender=nameGender)
    user['age'] = random.randint(agesInterval['min'], agesInterval['max'])
    user['flights'] = random.randint(flightsInterval['min'], flightsInterval['max'])
    return user

In [4]:
#- Fill Companies data
# User codes are sequential across all companies, starting at 0.
userId = 0
for company in defCompanies:
    data = defCompanies[company]
    users = [
        funcUserGenerator(defGenders, defAgesInterval, defFlightsInterval, userId + offset)
        for offset in range(data['usersCount'])
    ]
    userId += len(users)
    data['users'] = users

Example - Users from a Company

In [5]:
# Display the users generated for the 'HHD' company.
defCompanies['HHD']['users']

[{'code': 0, 'gender': 'male', 'name': 'Steve Lee', 'age': 37, 'flights': 0},
 {'code': 1,
  'gender': 'none',
  'name': 'Richard Unger',
  'age': 45,
  'flights': 1}]

### Flight Agencies - Generator

In [6]:
#- Functions
def funcAgencyGenerator(flightTypes):
    '''
    Build one agency that offers a random, non-empty selection of flight types.
    - flightTypes: mapping whose keys are the available flight type names
    Returns a dict {'types': [...]} holding the selected names in shuffled order.
    '''
    # Work on a private list so the caller's mapping is never touched.
    shuffled = [typeName for typeName in flightTypes]
    random.shuffle(shuffled)
    howMany = random.randint(1, len(shuffled))
    return {'types': shuffled[:howMany]}

In [7]:
# Generate the offered flight types of every agency, in listing order.
defAgencies.update(
    (agencyName, funcAgencyGenerator(defFlightTypes)) for agencyName in defAgenciesName
)

Example - Flight Types of Agencies

In [8]:
# Display the flight types offered by each generated agency.
defAgencies

{'FlyingDrops': {'types': ['economic']},
 'Rainbow': {'types': ['economic']},
 'CloudFy': {'types': ['economic', 'premium']}}

### Places - Generator

In [9]:
#- Functions
def funcPlaceGenerator(i, j, distInterval, kmPerHour):
    '''
    Generate random place distances, based on predefinitions.
    - i: number of place
    - j: number of place
    - distInterval {min, max} values: distance range
    - kmPerHour: km per hour of the plane
    Returns (distance, time, timeMsg): the distance rounded to 2 decimals, the
    travel time in hours rounded to 2 decimals and that time formatted as
    'H:MMh'. Returns (False, False, False) when both places are the same.
    '''
    if i == j:
        return False, False, False
    distance = round(random.uniform(distInterval['min'], distInterval['max']), 2)
    time = round(distance/kmPerHour, 2)
    # `time` has exactly two decimals, so convert it through whole hundredths
    # of an hour: integer arithmetic avoids float truncation (2.05 h * 60 is
    # 122.99999999999999, which would otherwise be shown as 2 minutes).
    hundredths = int(round(time * 100))
    hours, minutes = divmod(hundredths * 60 // 100, 60)
    # Zero-pad the minutes so 1 h 3 min reads '1:03h' and not '1:3h'.
    timeMsg = '%d:%02dh' % (hours, minutes)
    return (distance, time, timeMsg)

In [10]:
# Visit every pair of places once and register the generated link in both
# directions; the two entries share the same dict object.
n = len(defPlacesName)
for i, fromPlace in enumerate(defPlacesName):
    for j in range(i, n):
        toPlace = defPlacesName[j]
        distance, time, msg = funcPlaceGenerator(i, j, defDistancesInterval, defPlaceTravelKmPerHour)
        if not (distance and time):
            # Same place on both ends (generator returned False values).
            continue
        place = {'distance': distance, 'time': time, 'timeMsg': msg}
        defPlaces[fromPlace][toPlace] = defPlaces[toPlace][fromPlace] = place

Example - Distances from a Place

In [11]:
# Display the generated distance/time data from 'Sao Paulo (SP)' to the other places.
defPlaces['Sao Paulo (SP)']

{'Rio de Janeiro (RJ)': {'distance': 687.86, 'time': 1.72, 'timeMsg': '1:43h'},
 'Santa Catarina (SC)': {'distance': 269.56, 'time': 0.67, 'timeMsg': '0:40h'}}

### Lodges - Generator

In [12]:
#- Definitions
# Code that the next generated lodge will receive; funcLodgesGenerator reads it
# and advances it with getNextChar after every lodge it creates.
defName = 'A'


#- Functions
def getNextChar(text):
    '''
    Return the code that follows `text` in alphabetic order
    ('A' -> 'B', 'Z' -> 'AA', 'AZ' -> 'BA'); an empty text yields 'A'.
    - text: input text
    '''
    chars = list(text)
    pos = len(chars) - 1
    # Walk from the last character to the first, carrying while needed.
    while pos >= 0:
        bumped = chr(ord(chars[pos]) + 1)
        if bumped <= 'Z':
            chars[pos] = bumped
            return ''.join(chars)
        # Went past 'Z': wrap this position to 'A' and carry to the left.
        chars[pos] = 'A'
        pos -= 1
    # The carry ran off the left edge (or the text was empty): grow by one 'A'.
    return 'A' + ''.join(chars)


def funcLodgesGenerator(lodgesInterval, lodgesPrices):
    '''
    Create a random number of lodges with random prices.
    - lodgesInterval {min, max} values: how many lodges to create (inclusive)
    - lodgesPrices {min, max} values: price range of a lodge
    Returns a list of dicts with the keys 'code', 'name' and 'price'.
    Reads and advances the module-level `defName` so lodge codes keep growing
    across calls.
    '''
    global defName
    howMany = random.randint(lodgesInterval['min'], lodgesInterval['max'])
    generated = []
    while len(generated) < howMany:
        lodgePrice = round(random.uniform(lodgesPrices['min'], lodgesPrices['max']), 2)
        generated.append({
            'code': defName,
            'name': '%s %s' % (defLodgesPrex, defName),
            'price': lodgePrice,
        })
        # Move on to the next code for the following lodge (or call).
        defName = getNextChar(defName)
    return generated

In [13]:
# Replace each place's lodge list with a freshly generated one.
for placeName in defPlacesName:
    defLodges[placeName] = funcLodgesGenerator(defLodgesInterval, defLodgesPrices)

Example - Hotels from a Place

In [14]:
# Display the lodges generated for 'Sao Paulo (SP)'.
defLodges['Sao Paulo (SP)']

[{'code': 'A', 'name': 'Hotel A', 'price': 76.73}]

### Travel Possibilities - Generator

In [15]:
#- Functions
def funcCalculatePrice(priceMin, priceMax, weight):
    '''
    Draw a random price from a weighted price range.
    - priceMin: min price
    - priceMax: max price
    - weight: factor applied to both ends of the range
    Returns the drawn price rounded to 2 decimals.
    '''
    lowest, highest = priceMin * weight, priceMax * weight
    return round(random.uniform(lowest, highest), 2)


def funcElaborateflight(fromPlace, toPlace, distance, agency, flightType, price, \
                        time, timeMsg):
    '''
    Pack the data of one possible flight into a dict.
    - fromPlace: from
    - toPlace: to
    - distance: distance
    - agency: agency name
    - flightType: flight type
    - price: flight price
    - time: time in hours
    - timeMsg: time calculated
    Returns a dict with the keys 'from', 'to', 'distance', 'agency',
    'flightType', 'price', 'time' and 'timeMsg', in that order.
    '''
    fieldNames = ('from', 'to', 'distance', 'agency',
                  'flightType', 'price', 'time', 'timeMsg')
    fieldValues = (fromPlace, toPlace, distance, agency,
                   flightType, price, time, timeMsg)
    return dict(zip(fieldNames, fieldValues))


def funcFlightsPossibilities(places, flightPrices, flightTypes, agencies):
    '''
    Elaborate a list of possible flights.
    - places: places data ({from: {to: {distance, time, timeMsg}}})
    - flightPrices {init, interval}: first price range start and range width
    - flightTypes: flight types ({type: {price: weight}})
    - agencies: agencies data ({agency: {types: [...]}})
    Returns a list with one flight per origin, destination, agency and flight
    type offered by that agency. An agency without types adds no flights.
    '''
    flightsPossibilities = list()
    step = flightPrices['interval']
    for fromPlace, toPlaces in places.items():
        # Nearest destinations first: they get the cheapest price range.
        toPlacesSorted = sorted(toPlaces.items(), key=lambda x: x[1]['distance'])
        priceA = flightPrices['init']
        priceB = priceA + step
        for (toPlace, placeData) in toPlacesSorted:
            for (agencyName, agencyData) in agencies.items():
                # A single loop covers agencies with one or many types.
                for typeA in agencyData['types']:
                    weight = flightTypes[typeA]['price']
                    price = funcCalculatePrice(priceA, priceB, weight)
                    flight = funcElaborateflight(fromPlace, toPlace, placeData['distance'],
                                                 agencyName, typeA, price,
                                                 placeData['time'], placeData['timeMsg'])
                    flightsPossibilities.append(flight)
            # Update prices for bigger distances
            priceA, priceB = priceB, priceB + step
    return flightsPossibilities


def funcLodgesPossibilities(placesName, lodges):
    '''
    Flatten the lodges of every place into a single list.
    - placesName: places names
    - lodges: lodges data ({place: [lodge, ...]})
    Returns copies of the lodges, each one tagged with its 'place'; the
    dicts received in `lodges` are left untouched.
    '''
    return [
        dict(lodge, place=place)
        for place in placesName
        for lodge in lodges[place]
    ]

Example - Flights Possibilities (for each Place)

In [16]:
# Build every possible flight from the generated places, prices, flight types and agencies.
flightsPossibilities = funcFlightsPossibilities(defPlaces, defTravelsFlightPrices, defFlightTypes, defAgencies)

In [17]:
# Preview the first 5 possible flights as a table.
pd.DataFrame(flightsPossibilities).head(5)

Unnamed: 0,agency,distance,flightType,from,price,time,timeMsg,to
0,FlyingDrops,269.56,economic,Sao Paulo (SP),385.48,0.67,0:40h,Santa Catarina (SC)
1,Rainbow,269.56,economic,Sao Paulo (SP),360.66,0.67,0:40h,Santa Catarina (SC)
2,CloudFy,269.56,economic,Sao Paulo (SP),330.57,0.67,0:40h,Santa Catarina (SC)
3,CloudFy,269.56,premium,Sao Paulo (SP),474.55,0.67,0:40h,Santa Catarina (SC)
4,FlyingDrops,687.86,economic,Sao Paulo (SP),400.64,1.72,1:43h,Rio de Janeiro (RJ)


Example - Hotel Possibilities (for each Place)

In [18]:
# Flatten the lodges of all places into one list, each lodge tagged with its place.
lodgesPossibilities = funcLodgesPossibilities(defPlacesName, defLodges)

In [19]:
# Preview the first 5 possible lodges as a table.
pd.DataFrame(lodgesPossibilities).head(5)

Unnamed: 0,code,name,place,price
0,A,Hotel A,Sao Paulo (SP),76.73
1,B,Hotel B,Rio de Janeiro (RJ),178.82
2,C,Hotel C,Rio de Janeiro (RJ),137.23
3,D,Hotel D,Rio de Janeiro (RJ),124.55
4,E,Hotel E,Santa Catarina (SC),85.15


---
### Travel Dataset - Generator

In [20]:
#- Definitions
# Code given to the next simulated travel; funcTravelsSimulated increments it
# once per travel, so codes keep growing across calls.
travelCode = 0


#- Functions
def df2Dict(df):
    '''
    Convert the first row of a dataframe into a {column: value} dict.
    Any other row is ignored.
    '''
    split = df.to_dict('split')
    firstRow = split['data'][0]
    return dict(zip(split['columns'], firstRow))


def funcTravelsSimulated(companies, flightsPossibilities, lodgesPossibilities, travelDate, travelsDays, \
                         travelWithLodge, placesName):
    '''
    Elaborate random travels with flights and lodges, based on possibilities.
    - companies: companies data; each company has a 'users' list and each
      user a 'code' and a 'flights' (number of travels) entry
    - flightsPossibilities: possible flights
    - lodgesPossibilities: possible hotels
    - travelDate {init, interval {min, max}}: date of every user's first
      travel and range, in days, between one departure and the next
    - travelsDays {min, max}: range of days between outbound and return flight
    - travelWithLodge: probability (0-1) that a travel books a hotel
    - placesName: places names
    Returns (flightsSimulated, lodgesSimulated). Every travel adds two flights
    (outbound, then return with the same agency and flight type) and, when a
    hotel is booked, one lodge at the destination. Increments the module-level
    `travelCode` once per travel.
    '''
    global travelCode
    dfFlightsPos = pd.DataFrame(flightsPossibilities)
    dfLodgesPos = pd.DataFrame(lodgesPossibilities)
    flightsSimulated, lodgesSimulated = list(), list()
    for companyData in companies.values():
        for user in companyData['users']:
            date = travelDate['init']
            for _ in range(user['flights']):
                # random - days, places, hotel?
                daysFlight = random.randint(travelsDays['min'], travelsDays['max'])
                # Draw from the whole [min, max] interval (not min..min).
                daysNextTravel = random.randint(travelDate['interval']['min'], travelDate['interval']['max'])
                fromPlace, toPlace = random.sample(placesName, 2)
                chanceTravelWithLodge = (random.randrange(100) < travelWithLodge*100)
                # travels
                fromConditions = (dfFlightsPos['from']==fromPlace) & (dfFlightsPos['to']==toPlace)
                tmpFlightFrom  = df2Dict(dfFlightsPos[fromConditions].sample(n=1))
                # The return flight uses the same agency and flight type.
                toConditions = (dfFlightsPos['from']==toPlace) & (dfFlightsPos['to']==fromPlace) & \
                               (dfFlightsPos['agency']==tmpFlightFrom['agency']) & \
                               (dfFlightsPos['flightType']==tmpFlightFrom['flightType'])
                tmpFlightTo  = df2Dict(dfFlightsPos[toConditions])
                tmpFlightFrom['userCode'] = tmpFlightTo['userCode'] = user['code']
                tmpFlightFrom['travelCode'] = tmpFlightTo['travelCode'] = travelCode
                tmpFlightFrom['date'] = date
                tmpFlightTo['date']   = date + td(days=daysFlight)
                # lodge
                if chanceTravelWithLodge:
                    lodgeConditions = (dfLodgesPos['place']==toPlace)
                    # Pick one of the destination's hotels at random instead of
                    # always booking the first one.
                    tmpLodge = df2Dict(dfLodgesPos[lodgeConditions].sample(n=1))
                    tmpLodge['userCode'] = user['code']
                    tmpLodge['date'] = date
                    tmpLodge['days'] = daysFlight
                    tmpLodge['total'] = round(tmpLodge['price'] * daysFlight, 2)
                    tmpLodge['travelCode'] = travelCode
                    lodgesSimulated.append(tmpLodge)
                # save and update data
                flightsSimulated.append(tmpFlightFrom)
                flightsSimulated.append(tmpFlightTo)
                travelCode += 1
                # Advance from this travel's departure so consecutive travels
                # follow travelDate['init'] and do not collapse onto "now".
                date = date + td(days=daysNextTravel)
    return flightsSimulated, lodgesSimulated

In [21]:
# Simulate the travels of every user: returns the list of simulated flights and
# the list of simulated lodge bookings.
flightsSimulated, lodgesSimulated = \
        funcTravelsSimulated(defCompanies, flightsPossibilities, lodgesPossibilities, 
        defTravelDate, defTravelsDays, defTravelWithLodge, defPlacesName)

Example - Travel (From->To + To->From)

In [22]:
# Display the first simulated travel: the outbound flight followed by its return flight.
flightsSimulated[0:2]

[{'agency': 'CloudFy',
  'distance': 619.11,
  'flightType': 'economic',
  'from': 'Rio de Janeiro (RJ)',
  'price': 388.15,
  'time': 1.55,
  'timeMsg': '1:33h',
  'to': 'Santa Catarina (SC)',
  'userCode': 1,
  'travelCode': 0,
  'date': datetime.datetime(2019, 9, 24, 16, 39, 49, 122962)},
 {'agency': 'CloudFy',
  'distance': 619.11,
  'flightType': 'economic',
  'from': 'Santa Catarina (SC)',
  'price': 409.21,
  'time': 1.55,
  'timeMsg': '1:33h',
  'to': 'Rio de Janeiro (RJ)',
  'userCode': 1,
  'travelCode': 0,
  'date': datetime.datetime(2019, 9, 26, 16, 39, 49, 122962)}]

In [23]:
# Display the first simulated lodge booking.
lodgesSimulated[0]

{'code': 'A',
 'name': 'Hotel A',
 'place': 'Sao Paulo (SP)',
 'price': 76.73,
 'userCode': 2,
 'date': datetime.datetime(2019, 9, 24, 16, 39, 49, 122962),
 'days': 2,
 'total': 153.46,
 'travelCode': 1}