# Generate data for recommendation system
---
## Part 1: Create 100 user data records

## Loading and preprocessing
### use AWS-CLI command

`aws cognito-idp list-users --user-pool-id ap-south-1_AbhMNmzI2 --attributes-to-get '["sub"]`

---

To get the list of all users in cognito user pool as json. and then copy and save it as a file [cog-user-data.json](~/Documents/python-scripts/htt-recommendation-system/cog-user-data.json)

In [None]:
# imports for part 1
import json
import uuid
import base64
import datetime
import random
import pandas as pd
from pandas.io.json import json_normalize

### Filter out cognito id's from aws cli response json
- Load the file as json. 
- Read all the data 
- use list comprehension to filter out the cognito IDs
- save it in another file

In [None]:
# open and read the JSON file
with open('./cog-user-data.json') as f:
  cog_user_data = json.load(f)

In [None]:
# fetch all cognito IDs using list comprehension
cogIDs = [i["Username"] for i in cog_user_data]

In [None]:
# write the IDs to another file for later use.
with open('./cognito-user-IDs_copy.txt', 'w') as f:
    for i in cogIDs:
        f.write(i+'\n')

### Create a copy of userIDs, there are only 46 in userpool, we will generate 54 more and use 100 users for our case.

In [None]:
# append 54 more random uuid's to the list of userID's
with open('./cognito-user-IDs_copy.txt', 'a') as f:
    for i in range(54):
        f.write(str(uuid.uuid4())+'\n')

### Generate name, surname, birthdays, email address, nationality, phone and gender for each user and save it as json.

- Some fields like name doesn't really make sense.
- Hit the isdCodes endpoint so we can generate some phone numbers and get Nationality.
- Save this data in [isdCodes.json](./isdCodes.json)

In [None]:
with open('./isdCodes.json', 'r') as f:
    codes = json.load(f)


In [None]:
# Date data for generating birthdays.
start_date = datetime.date(1960, 1, 1)
end_date = datetime.date(2000, 1, 1)
time_between_dates = end_date - start_date
days_between_dates = time_between_dates.days


all_data = []
# load the userIDs from text file.
with open('./cognito-user-IDs_copy.txt', 'r') as f:
    i = 1 # used for index
    for user_id in f.readlines(): # for each userID
        i+= 1
        user_data_json = {}

        # save userID
        user_data_json["USER_ID"] = user_id

        # generate names from userIDs (simply convert it into base64)
        name = str(base64.b64encode(bytes(user_id, 'utf-8')), 'utf-8')
        
        # Name is too long so we are randomly chopping it.
        name = name[:-30 -random.randrange(15)] 
        
        # split and save it as fname and fname.
        user_data_json["FIRST_NAME"], user_data_json["LAST_NAME"] = name[:len(name)//2], name[len(name)//2:] 

        # gender distribution is 60% male and 40% female (ideally choosen)
        ageList = ['M', 'M', 'M', 'F', 'F']

        # Randomly choose a gender with the provided distribution.
        user_data_json["GENDER"] = random.choice(ageList)

        # generate email with fname and lname
        user_data_json["EMAIL"] = name[:len(name)//2]+"."+name[len(name)//2:]+'@gmail.com'
        
        # randomly choose a Nationality and associated ISD code.
        phone_and_nationality_code = random.choice(codes)

        # Create phone number using ISD code and generate a random 10 digit phone number.
        user_data_json["PHONE_NUMBER"] = phone_and_nationality_code['dial_code']+ str(random.randrange(9999999998) + 1000000000)

        # Save nationality
        user_data_json["NATIONALITY"] = phone_and_nationality_code['code']
        
        # Select a random birthday
        random_number_of_days = random.randrange(days_between_dates)
        random_date = start_date + datetime.timedelta(days=random_number_of_days)
        
        # save the birthday
        user_data_json["BIRTH_DATE"] = random_date.strftime("%m/%d/%Y")
        
        # Do this for all the userIDs
        all_data.append(user_data_json)


In [None]:
# convert JSOn into a dataframe
df = json_normalize(all_data)

In [None]:
# Save the dataframe as a CSV file.
df.to_csv('user_data.csv')

---
## Part 2. Airports table data

In [None]:
# imports used for part 2
import boto3
from pandas.io.json import json_normalize

In [None]:
# boto3 dynamodb resource object
dynamodb = boto3.resource('dynamodb')

In [None]:
# run a scan query and fetch all the 3000+ records from database
table = dynamodb.Table('Airports')start_date = datetime.date(1960, 1, 1)
end_date = datetime.date(2000, 1, 1)
time_between_dates = end_date - start_date
days_between_dates = time_between_dates.days
response = table.scan()
data = response['Items']
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    data.extend(response['Items'])

In [None]:
# convert json data to pandas dataframe
df = json_normalize(data)

In [None]:
# save dataframe as CSV
df.to_csv('airports.csv')

## 3. Events Data
### Basics Assumptions
- we are working on a time frame of 1 month. (possibly December 2019)
- We are going to book/view/search for 1000 destinations
- We will make a list of popular destinations

**Columns required:**

- [x] UserID
- [x] Destination (IATA_CODE)
- [x] Source (IATA_CODE)
- [x] SessionID
- [ ] Timestamp - Time of Booking
- [x] ServiceType - 'Flights' 
- [x] Service Price - '0 for all'
- [ ] ServiceTime - DateOfTravel (isoUnixDateTime)
- [ ] Event Type (Search/ View/ Booking)
- [ ] EventValue 

In [1]:
# imports for part 3
import datetime, time
import random
import json
import uuid
from collections import Counter
from pandas.io.json import json_normalize
import pandas as pd

In [2]:
# Get the time frame set-up for event dates
start_date = datetime.date(2019, 12, 1)
end_date = datetime.date(2020, 1, 1)

time_between_dates = end_date - start_date
days_between_dates = time_between_dates.days

# Get the current date
start_date_current = datetime.date(2019, 4, 1)
end_date_current = datetime.date(2019, 11, 30)

time_between_current_dates = end_date_current - start_date_current
days_between_current_dates = time_between_current_dates.days


In [3]:
# load userIDs and load destinations
with open('./cognito-user-IDs_copy.txt', 'r') as f:
    userIDs = f.readlines()
with open('./destinations.json', 'r') as f:
    destinations = json.load(f)

In [4]:
days_between_current_dates

243

In [5]:
# Optionally more considering Christmas and New Year
booking_days = [i for i in range(days_between_dates)]
# Checking days can be random distributed
checking_days = [i for i in range(days_between_current_dates)]
# assuming travel plans increases near christmas and new year 
booking_distribution = [1,1,1,1,2,1,1,1,1,2,1,1,1,3,3,3,3,5,5,10,10,15,10,3,1,3,15,10,5,3,1]

# list comprehension to create non-linear distribution.
new_booking_days = [[booking_days[i] for j in range(dist)] for i, dist in enumerate(booking_distribution)]

# one line function to flat list
flatten = lambda l: [item for sublist in l for item in sublist]


new_booking_days = flatten(new_booking_days)
# new_booking_days

In [6]:
# here is what new distribution looks like.
# _ = [print(start_date + datetime.timedelta(days=i)) for i in new_booking_days]

In [7]:
# create non-linear (popular) destinations
dest_list = [i['code'] for i in destinations]
source_list = dest_list.copy()
# destinations
popular_list = [("HYD", 15), ("BCN", 13), ("LVS", 11), ("BKK", 9), ("MV", 8), ("KUL", 7), ("SIN", 6), ("MAD", 5), ("DXB", 4), ("KAI", 3)]
_ = [[dest_list.append(i[0]) for _ in range(i[1])] for i in popular_list]
# [print(i) for i in range(popular_list)]

In [8]:
print(len(dest_list))
Counter(dest_list).most_common(11)

989


[('HYD', 16),
 ('BCN', 14),
 ('LVS', 12),
 ('BKK', 10),
 ('MV', 9),
 ('KUL', 8),
 ('SIN', 7),
 ('MAD', 6),
 ('DXB', 5),
 ('KAI', 4),
 ('BTR', 1)]

In [9]:
with open('destinations.json') as f:
    D = json.load(f)
len([i["code"] for i in D])

908

In [10]:
# Event choices

event_types = [('Search', 0.33), ('View', 0.66), ('Booking', 0.99)]
event_option_weights = [1, 1, 1]
event_options = flatten([[event_types[i] for j in range(c)] for i, c in enumerate(event_option_weights)])

# Price choices

price_list = [i for i in range(50,150)]

In [25]:
# this is where we create the records:
event_records = []
for i in range(4000):
    event_record = {}
    event_record["USER_ID"] = random.choice(userIDs)[:-1]
    event_record["SESSION_ID"] = str(uuid.uuid4())
    current_date = start_date_current + datetime.timedelta(days=random.choice(checking_days))
    c_timestamp = time.mktime(current_date.timetuple())
    event_record["TIMESTAMP"] = int(c_timestamp)
    event_record["ITEM_ID"] = random.choice(dest_list)
    event_selected = random.choice(event_options)
    event_record["EVENT_TYPE"] = event_selected[0]
    event_record["EVENT_VALUE"] = event_selected[1]
    event_record["SERVICE_TYPE"] = 'FLIGHTS'
    event_record["USER_LOCATION"] = random.choice(source_list)
    event_record["SERVICE_LOCATION"] = event_record["ITEM_ID"]
    event_date = start_date + datetime.timedelta(days=random.choice(new_booking_days))
    timestamp = time.mktime(event_date.timetuple())
    event_record["SERVICE_TIME"] = int(timestamp)
    event_record["SERVICE_PRICE"] = 0 if event_record["EVENT_TYPE"] == "Search" else random.choice(price_list)
    event_records.append(event_record)
df = json_normalize(event_records)

In [26]:
df.head()
df.to_csv('event_data.csv')

In [27]:
# Counter([i["ITEM_ID"] for i in event_records]).most_common(100)

In [14]:
import uuid

In [28]:
df = pd.read_csv('event_data.csv')

In [29]:
# df.head()

In [33]:
df1 = df.loc[df['EVENT_TYPE'] == 'View']

In [31]:
df1

Unnamed: 0.1,Unnamed: 0,USER_ID,SESSION_ID,TIMESTAMP,ITEM_ID,EVENT_TYPE,EVENT_VALUE,SERVICE_TYPE,USER_LOCATION,SERVICE_LOCATION,SERVICE_TIME,SERVICE_PRICE
0,0,505b2165-5089-4ba1-85d9-ab0eedddb6ff,281f25c8-1544-4ad1-a982-5ece2268b6de,1569177000,JAI,Booking,0.99,FLIGHTS,DEN,JAI,1577039400,104
2,2,4da565db-a67e-4a17-a9a6-147a5970deba,0856c5b5-cef7-4418-a2fb-b3483d7d9ad3,1562005800,NAN,Booking,0.99,FLIGHTS,CAE,NAN,1577385000,69
6,6,6861966e-930e-4a42-ab6e-eed13e3d3be4,35bca8d2-d380-4ce8-b15e-c5722d7339fd,1568399400,VIA,Booking,0.99,FLIGHTS,BIN,VIA,1576348200,110
8,8,f8082749-d4a8-413a-b01e-2a4b81b0fde2,ffaf2000-600a-4d81-bb59-f32880642b9e,1563906600,LEO,Booking,0.99,FLIGHTS,HAN,LEO,1576780200,52
11,11,12d8c4c0-a472-4da5-950b-ce02b426d32c,e30c86d6-16c8-40b1-83fd-87589d27665b,1555957800,CEN,Booking,0.99,FLIGHTS,ALC,CEN,1576780200,142
...,...,...,...,...,...,...,...,...,...,...,...,...
3990,3990,a7981edb-234d-42ce-938a-8dd7aed43c98,7de58e57-770c-4001-9655-f846fded3728,1567017000,SDR,Booking,0.99,FLIGHTS,CPH,SDR,1576866600,70
3991,3991,a8bc3210-449b-44a8-ac54-b182b13be0fc,78387b14-b09b-44a4-bca0-b26065507279,1554489000,SAT,Booking,0.99,FLIGHTS,PVG,SAT,1577471400,116
3995,3995,110749c7-6b27-4bfb-bb92-ab1010b73ba5,733cce34-627b-4d76-a588-54e14a0d3073,1565116200,CDO,Booking,0.99,FLIGHTS,QQX,CDO,1576693800,89
3996,3996,47227935-085f-4877-8573-f56a3d7717a6,a693e8f4-4278-449a-9588-b64b62932e6e,1570645800,TH7,Booking,0.99,FLIGHTS,NWH,TH7,1576953000,102


In [19]:
len(df1.ITEM_ID.unique() )

0

In [21]:
# use for generating random UUIDs
[uuid.uuid4() for i in range(1)]

[UUID('bfa8865f-bd1a-4b1f-948a-0876f389c33a')]