In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import os
import numpy as np
import re
import ast
from geopy.distance import geodesic
import json

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
directory = "../../data/processed"

In [4]:
data = pd.read_csv(os.path.join(directory,"categoricals_numericals_raw.csv"))

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/categoricals_numericals_raw.csv'

In [None]:
# Display the loaded table.
data

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Accumulates the names of columns that later cells remove from `data`
# in a single drop call.
drop_list = []

# Meals

In [None]:
meal_columns= ['day_' + str(i) + "_meals" for i in range(1, 29)]

In [None]:
unique_meals = set()

# Iterate through each column that contains meal lists
for col in meal_columns:
    # Split the comma-separated values in each cell and add them to the set
    unique_meals.update(data[col].str.split(',').explode().str.strip())

unique_meals.discard(np.nan)
# Convert the set of unique meal values to a list
unique_meals_list = list(unique_meals)

# Print the list of unique meal values
print(unique_meals_list)

In [None]:
new_meal_columns = []

for meal in unique_meals:
    data[meal] = data.apply(lambda row: any(meal in str(cell) for cell in row[meal_columns]), axis=1).astype(int)
    new_meal_columns.append(meal)
data

In [None]:
data[new_meal_columns]

In [None]:
data[new_meal_columns].sum().sort_values(ascending=False)

**High Frequency, Low Uniqueness:** Columns like "Breakfast," "Dinner," and "Lunch" occur very frequently but are not very unique, meaning many tours will have these features, and thus, they may not significantly contribute to differentiating between tours. However, they can still be essential for filtering; for example, if a user specifically wants a tour that includes breakfast.

**Low Frequency, High Uniqueness:** Columns like "Cooking Class," "Morning tea," and "Light refreshments on the river cruise" are unique but occur very infrequently. These can be valuable for differentiation but may apply to very few cases, limiting their overall impact on the recommender.

**Moderate Frequency, Moderate Uniqueness:** Columns like "Dinner with Wine," "Welcome Reception," and "Be My Guest" can be the most beneficial as they offer a balance between uniqueness and frequency, allowing them to significantly contribute to the recommendation logic.


In [None]:
drop_list.extend(meal_columns)

In [None]:
drop_list.extend(['Morning tea', 'Dinner and local drinks (including wine and spirits)', 'Light refreshments on the river cruise', 'Dinner (one evening only)', 'Dinner (one evening only)','Cooking Class' ])
drop_list

# Accommodation

In [None]:
# Read from the shared `directory` so that every input comes from the same
# folder as the raw feature table, instead of a second hard-coded relative path.
itineraries = pd.read_csv(os.path.join(directory, 'itineraries.csv'))

In [None]:
itineraries['accommodation']

In [None]:
itineraries['accommodation'].value_counts()

In [None]:
itineraries[itineraries['accommodation'] == "Hilton Vienna Park."]['tour_option_id'].value_counts()

In [None]:
value_counts = itineraries['accommodation'].dropna().value_counts(dropna=False)
(value_counts > 1).sum(), (value_counts > 2).sum(), (value_counts > 3).sum(), (value_counts > 5).sum(), (value_counts > 10).sum(), (value_counts > 15).sum(), (value_counts > 20).sum()

In [None]:
# Find values in the 'accommodation' column that occur more than 15 times
new_accommodation_columns = value_counts[value_counts > 15].index.tolist()
new_accommodation_columns

In [None]:
def convert_to_snake_case(input_string):
    """Turn a free-text name into a short, lower-case snake_case identifier.

    Whitespace runs become single underscores, the text is lower-cased, the
    punctuation characters ' , . - ( ) / are removed, and the result is cut to
    at most 40 characters.

    Note: a later cell in this notebook defines another function with the same
    name and different rules; after that cell runs, this version is shadowed.

    Args:
        input_string: The text to normalise (must be a ``str``).

    Returns:
        The normalised identifier, at most 40 characters long.
    """
    # Join the whitespace-separated words with underscores, lower-cased.
    snake_case_string = '_'.join(word.lower() for word in input_string.split())

    # Remove (not replace) punctuation. The apostrophe is listed inside a
    # double-quoted raw string: written as r'['',...]' the pattern is two
    # adjacent string literals and the apostrophe silently drops out of the
    # character class.
    cleaned_string = re.sub(r"[',.\-()/]", '', snake_case_string)

    # Truncate to a maximum of 40 characters (slicing is a no-op when shorter).
    return cleaned_string[:40]

In [None]:
new_accommodation_columns_snake = list(map(convert_to_snake_case, new_accommodation_columns))
new_accommodation_columns_snake

In [None]:
accommodation_columns= ['day_' + str(i) + "_accommodation" for i in range(1, 29)]

In [None]:
# Normalise every per-day accommodation cell exactly once. Doing this inside
# the per-accommodation loop would repeat the same conversion for every
# indicator column. Missing cells become the text 'nan', as with str(cell).
normalised_cells = data[accommodation_columns].apply(
    lambda column: column.map(lambda cell: convert_to_snake_case(str(cell)))
)

# One 0/1 column per frequent accommodation: 1 when its normalised name occurs
# as a (literal, non-regex) substring of any day's normalised accommodation.
new_columns_df = pd.DataFrame(
    {
        accommodation: normalised_cells.apply(
            lambda column: column.str.contains(accommodation, regex=False)
        ).any(axis=1).astype(int)
        for accommodation in new_accommodation_columns_snake
    },
    index=data.index,
)

data = pd.concat([data, new_columns_df], axis=1)


In [None]:
data[new_accommodation_columns_snake]

In [None]:
len(new_accommodation_columns_snake)

In [None]:
data[new_accommodation_columns_snake].sum()

In [None]:
data

In [None]:
drop_list.extend(accommodation_columns)

In [None]:
data = data.drop(drop_list, axis = 1)

# Locations

In [None]:
# Look at the `locations` value of the row labelled 0.
data['locations'].loc[0]

## Data Transformation Plan

1. **Number of Locations**
   - **Objective**: Create a new feature representing the number of locations visited during the tour.

2. **Unique Locations**
   - **Objective**: Extract unique locations visited during the tour and represent them as binary features using one-hot encoding.
   - **Example**: Create binary columns like `visited_Anchorage`, `visited_Valdez`, etc.

3. **Country Code**
   - **Objective**: Extract unique country codes visited during the tour and represent them as binary features using one-hot encoding.
   - **Example**: Create binary columns like `visited_country_US`, `visited_country_CA`, etc.

4. **Latitude and Longitude Statistics**
   - **Objective**: Calculate statistics for latitude and longitude across all locations to provide insights into the central tendencies of the tour locations.
   - **Statistics to Calculate**:
     - Mean latitude and longitude (the only statistic the code below currently computes)
     - Median latitude and longitude
     - Minimum latitude and longitude
     - Maximum latitude and longitude
     - Range of latitude and longitude

5. **Distance Traveled**
   - **Objective**: Calculate the total distance traveled during the tour by summing the distances between consecutive locations.
   - **Method**: Use geopy's `geodesic` distance (in miles) between each pair of consecutive latitude-longitude points.

By following this plan, we aim to enhance the understanding of tour data by creating meaningful features and statistics related to the locations visited during the tour.


In [None]:
file_path = '../data/processed/locations_visited.csv'
locations_visited = pd.read_csv(file_path)

In [None]:
locations_visited

In [None]:
locations_with_tour_option_id = pd.merge(
    locations_visited, 
    itineraries[['id', 'tour_option_id']].rename(columns={'id': 'itinerary_id'}), 
    on='itinerary_id', how='left')
locations_with_tour_option_id

In [None]:
from math import isnan

def calculate_total_distance(locations_df):
    """Add up the geodesic distance, in miles, between consecutive rows.

    Rows are taken in the order they appear in ``locations_df``. A leg is left
    out of the total when either of its two end points has a NaN latitude or
    longitude.

    Args:
        locations_df: DataFrame with ``latitude`` and ``longitude`` columns.

    Returns:
        The summed distance in miles; ``0`` when no leg could be measured.
    """
    stops = locations_df.to_dict('records')  # one dict per row
    miles = 0
    # Pair every stop with the one that follows it.
    for previous, current in zip(stops, stops[1:]):
        leg = (
            previous['latitude'], previous['longitude'],
            current['latitude'], current['longitude'],
        )
        if any(isnan(coordinate) for coordinate in leg):
            continue  # incomplete coordinates: this leg cannot be measured

        miles += geodesic(leg[:2], leg[2:]).miles
    return miles

In [None]:
locations_with_tour_option_id.isna().sum()

In [None]:
locations_with_tour_option_id['countryCode'].fillna('', inplace=True)
locations_with_tour_option_id['longitude'].fillna(0, inplace=True)
locations_with_tour_option_id['latitude'].fillna(0, inplace=True)

In [None]:
# Apply the union_sets function to 'unique_locations' and 'unique_country_codes'
aggregated_data = locations_with_tour_option_id.groupby('tour_option_id').agg({
    'name': [('count', 'count'), ('nunique', 'nunique'), ('location', lambda x: list(set(x)))],
    'countryCode': [('count', 'count'), ('nunique', 'nunique'), ('codes', lambda x: list(set(x)))],
    'latitude': 'mean',
    'longitude': 'mean'
    
}).reset_index()
aggregated_data

In [None]:
# Flatten the MultiIndex column names
aggregated_data.columns = ['_'.join(col).strip('_') for col in aggregated_data.columns.values]

In [None]:
aggregated_data

In [None]:
# Calculate total distance traveled for each tour_option_id
aggregated_data['total_distance_traveled'] = locations_with_tour_option_id.groupby('tour_option_id').apply(calculate_total_distance).reset_index(name='total_distance_traveled')['total_distance_traveled']
aggregated_data

In [None]:
unique_country_codes = locations_with_tour_option_id['countryCode'].unique()
unique_country_codes = np.delete(unique_country_codes, np.where(unique_country_codes == ''))
unique_country_codes

In [None]:
# Add new columns for each unique country code and set values
for country_code in unique_country_codes:
    # Initialize the column with zeros
    aggregated_data[country_code] = 0
    
    # Set 1 if the country code is in the countryCode_codes column for that row
    for index, row in aggregated_data.iterrows():
        if country_code in row['countryCode_codes']:
            aggregated_data.at[index, country_code] = 1

aggregated_data

In [None]:
# Mean coordinates per tour option, one row per row of aggregated_data.
# The tour_option_id column itself is not part of this selection; this frame is
# written to mean_lat_long.csv at the end of the notebook.
# NOTE(review): without an id column, consumers can only match rows by position - confirm that is what they expect.
mean_lat_long = aggregated_data[['latitude_mean', 'longitude_mean']]


In [None]:
# Left disabled: countryCode_codes is still read by the NaN-filling step
# further down and is only removed later, as part of drop_list_2.
# aggregated_data = aggregated_data.drop('countryCode_codes', axis = 1)

In [None]:
# Location names that occur in more than 15 rows overall get their own
# indicator column.
name_counts = locations_with_tour_option_id['name'].value_counts()
unique_locations = name_counts[name_counts > 15].index.tolist()
unique_locations

In [None]:
len(unique_locations)

In [None]:
locations_df = pd.DataFrame(0, index=aggregated_data.index, columns=unique_locations)
locations_df

In [None]:
locations_df['name_location'] = aggregated_data['name_location']

In [None]:
for location in unique_locations:
    for index, row in locations_df.iterrows():
        if location in row['name_location']:
            locations_df.at[index, location] = 1

locations_df = locations_df.drop('name_location', axis = 1)

In [None]:
aggregated_data = pd.concat([aggregated_data, locations_df], axis=1)


In [None]:
aggregated_data = aggregated_data.drop('name_location', axis = 1)

In [None]:
aggregated_data

In [None]:
data = pd.merge(data, aggregated_data, on='tour_option_id', how='left')

In [None]:
data

# NaNs

In [None]:
# Get the count of NaN values in each column
nan_counts = data.isna().sum()

# Filter and print columns that have NaN values with the count
for column, nan_count in nan_counts.items():
    if nan_count > 0:
        print(f"{column}: {nan_count} NaN values")

In [None]:
data = data.drop(['sourceTourOptionName', 'maxPax'], axis =1)

In [None]:
data['countryCode_codes']

In [None]:
def fillna_with_country_codes(row):
    """Fill a missing start/end country code from the row's list of codes.

    A missing ``startLocationCountryCode`` is replaced by the first element of
    ``countryCode_codes`` and a missing ``endLocationCountryCode`` by the last
    element. Rows whose ``countryCode_codes`` is not a non-empty list (or
    tuple) are returned unchanged.

    Args:
        row: One DataFrame row (a Series) containing the three fields above.

    Returns:
        The same row object, possibly with the two code fields filled in.
    """
    codes = row['countryCode_codes']

    # Rows without a match in the left merge carry NaN here. NaN is truthy, so
    # a bare truthiness test would go on to index a float and raise TypeError.
    if not isinstance(codes, (list, tuple)) or len(codes) == 0:
        return row

    # If startLocationCountryCode is NaN, fill with the first country code.
    if pd.isna(row['startLocationCountryCode']):
        row['startLocationCountryCode'] = codes[0]

    # If endLocationCountryCode is NaN, fill with the last country code.
    if pd.isna(row['endLocationCountryCode']):
        row['endLocationCountryCode'] = codes[-1]

    return row

In [None]:
data = data.apply(fillna_with_country_codes, axis=1)
data

In [None]:
data['startLocationCountryCode'].isna().sum(), data['endLocationCountryCode'].isna().sum()

In [None]:
def fillna_with_mean(row):
    """Fill missing start/end coordinates from the row's mean coordinates.

    Each of the four start/end latitude/longitude fields that is missing is
    replaced by ``latitude_mean`` or ``longitude_mean`` respectively, provided
    that mean is itself present.

    Args:
        row: One DataFrame row (a Series) containing the six fields involved.

    Returns:
        The same row object, possibly with coordinate fields filled in.
    """
    # (field to fill, field that supplies the fallback value)
    fallbacks = (
        ('startLocationLongitude', 'longitude_mean'),
        ('startLocationLatitude', 'latitude_mean'),
        ('endLocationLongitude', 'longitude_mean'),
        ('endLocationLatitude', 'latitude_mean'),
    )
    for target, source in fallbacks:
        if pd.isna(row[target]) and pd.notna(row[source]):
            row[target] = row[source]

    return row

In [None]:
data = data.apply(fillna_with_mean, axis=1)
data['startLocationLongitude'].isna().sum(), data['startLocationLatitude'].isna().sum(), data['endLocationLongitude'].isna().sum(), data['endLocationLatitude'].isna().sum()

In [None]:
data['activityLevel'].fillna('not_specified', inplace=True)

In [None]:
data.isna().sum().sum()

# Encoding

In [None]:
data.select_dtypes(include='object').columns

In [None]:
tour_ids = data[['tour_option_id', 'tour_id']]

In [None]:
drop_list_2 = ['tour_id', 'tour_option_id', 'fkSeasonId', 'tour_id.1', 'countries_visited', 'locations', 'countryCode_codes', 'isPrivateRequest']

In [None]:
to_one_hot = ['productType', 'brand', 'activityLevel',
       'lowestOptionRoomType', 'startLocationName', 
       'startLocationCountryCode', 'endLocationName',
       'endLocationCountryCode']

In [None]:
data = data.drop(drop_list_2, axis = 1)

In [None]:
data_encoded = pd.get_dummies(data, columns=to_one_hot)

In [None]:
data_encoded

# Scaling

In [None]:
# Get descriptive statistics for each column
descriptive_stats = data_encoded.describe()

# Identify columns that are not in [0, 1] range
columns_not_in_range = []
for column in descriptive_stats.columns:
    min_value = descriptive_stats.at['min', column]
    max_value = descriptive_stats.at['max', column]
    if min_value < 0 or max_value > 1:
        columns_not_in_range.append(column)


In [None]:
columns_not_in_range

In [None]:
scaler = StandardScaler()

# Scale the features not in [0, 1] range
data_encoded[columns_not_in_range] = scaler.fit_transform(data_encoded[columns_not_in_range])

In [None]:
data_encoded

In [None]:
def convert_to_snake_case(name):
    """Lower-case ``name`` and collapse non-alphanumeric runs to one underscore.

    Every run of characters matching ``[\\W_]`` (anything that is not a letter
    or digit, plus the underscore itself) becomes a single ``_``. Leading and
    trailing runs are kept as a single underscore too.

    Note: this redefines the function of the same name declared earlier in the
    notebook (used for accommodation names), which applies different rules.

    Args:
        name: The text to convert (must be a ``str``).

    Returns:
        The converted, lower-case string.
    """
    # Raw string: '\W' in a plain string literal is an invalid escape sequence
    # that newer Python versions warn about.
    return re.sub(r'[\W_]+', '_', name).lower()

In [None]:
new_columns = [convert_to_snake_case(col) for col in data_encoded.columns]
data_encoded.columns = new_columns
data_encoded

In [None]:
file_path = os.path.join(directory, f"categoricals_numericals_encoded_scaled.csv")
data_encoded.to_csv(file_path, index=False)

In [None]:
file_path = os.path.join(directory, f"reference_tour_ids.csv")
tour_ids.to_csv(file_path, index=False)

In [None]:
file_path = os.path.join(directory, f"mean_lat_long.csv")
mean_lat_long.to_csv(file_path, index=False)