# In [None]:
import pandas as pd

# Read the two raw input tables; both files are tab-delimited
searches_data = pd.read_csv(filepath_or_buffer='./data/searches.tsv', delimiter='\t')
contacts_data = pd.read_csv(filepath_or_buffer='./data/contacts.tsv', delimiter='\t')

# Define the aggregation functions for searches data
def mean_agg(series):
    """Return the mean of ``series``, as computed by its own ``mean()`` method."""
    average = series.mean()
    return average

def set_agg(series):
    """Join the distinct non-null values of ``series`` into one string.

    Args:
        series: A pandas Series (for example one group's values of a column).

    Returns:
        The distinct non-null values, rendered with ``str`` and joined by
        ``', '`` in sorted (lexicographic) order, or ``''`` when the series
        has no non-null values.
    """
    # Render as text first so non-string values (numbers, booleans) can be
    # joined instead of raising TypeError inside str.join.
    labels = {str(value) for value in series.dropna().unique()}
    # Sort because set iteration order for strings differs between
    # interpreter runs (hash randomization), which made the joined text --
    # and therefore the exported file -- irreproducible.
    return ', '.join(sorted(labels))

# Per-column aggregators for the searches table: numeric columns are averaged,
# categorical columns are collapsed to their distinct values.
# The pair order is kept because it determines the output column order.
aggregation_rules = dict((
    ('n_searches', mean_agg),
    ('n_nights', mean_agg),
    ('n_guests_min', mean_agg),
    ('n_guests_max', mean_agg),
    ('origin_country', set_agg),
    ('filter_price_min', mean_agg),
    ('filter_price_max', mean_agg),
    ('filter_room_types', set_agg),
    ('filter_neighborhoods', set_agg),
))

# Collapse the searches table to one row per user, then turn the 'id_user'
# group key back into an ordinary column
grouped_searches = (
    searches_data
    .groupby('id_user')
    .agg(aggregation_rules)
    .reset_index()
)

# Assumption: the contacts table identifies the user in its 'id_guest' column.
# Only two contact columns are carried over: the total of 'n_messages' and the
# maximum 'ts_booking_at' (intended as the most recent booking date).
contacts_aggregation = dict((
    ('n_messages', 'sum'),
    ('ts_booking_at', 'max'),
))

# One row per guest, with 'id_guest' restored as an ordinary column
grouped_contacts = (
    contacts_data
    .groupby('id_guest')
    .agg(contacts_aggregation)
    .reset_index()
)

# Left-join the per-guest contact summary onto the per-user search summary
# ('id_user' on the search side pairs with 'id_guest' on the contact side),
# then discard 'id_guest', the right-hand join key
final_data = grouped_searches.merge(
    grouped_contacts,
    how='left',
    left_on='id_user',
    right_on='id_guest',
).drop(columns='id_guest')

# Write the combined table to a CSV file without the row index
final_data.to_csv('merged_aggregated_data.csv', index=False)

# Show the first rows as a quick sanity check
print(final_data.head())
