In [1]:
from pathlib import Path

import pandas as pd

# Input and output locations, relative to the notebook's working directory.
DATA_DIR = Path('../data')
RESULT_PATH = DATA_DIR / 'result' / 'combined_data.csv'

# Load the CSV files
flights_df = pd.read_csv(DATA_DIR / 'flights.csv')
hotels_df = pd.read_csv(DATA_DIR / 'hotels.csv')
users_df = pd.read_csv(DATA_DIR / 'users.csv')

# Check the columns of each DataFrame
print("Columns in flights_df:", flights_df.columns)
print("Columns in hotels_df:", hotels_df.columns)
print("Columns in users_df:", users_df.columns)

# Check the first records of each DataFrame
print("First few rows of flights_df:\n", flights_df.head())
print("First few rows of hotels_df:\n", hotels_df.head())
print("First few rows of users_df:\n", users_df.head())

# Rename userCode on each side so both survive the merge under explicit names
# (new frames are created; the raw frames are left untouched).
flights_df_renamed = flights_df.rename(columns={'userCode': 'userCode_flight'})
hotels_df_renamed = hotels_df.rename(columns={'userCode': 'userCode_hotel'})

# Step 1: join flights and hotels on travelCode (inner join, pandas default).
# The remaining overlapping columns (price, date) get the _flight/_hotel suffixes.
# A travelCode can appear on several flight rows, so the matching hotel row is
# repeated once per flight row in the result.
flights_hotels_df = pd.merge(
    flights_df_renamed,
    hotels_df_renamed,
    on='travelCode',
    suffixes=('_flight', '_hotel'),
)

# Check the resulting DataFrame
print("First few rows of flights_hotels_df:\n", flights_hotels_df.head())

# Step 2: attach user attributes by matching the flight's user code to users.code.
# NOTE(review): hotels and users both have a 'name' column, so pandas applies its
# default suffixes here (name_x = hotel name, name_y = user name). Left as-is
# because renaming would change the schema of the saved CSV.
combined_df = pd.merge(
    flights_hotels_df,
    users_df,
    left_on='userCode_flight',
    right_on='code',
)

# Check the final combined DataFrame
print("First few rows of combined_df:\n", combined_df.head())

# Save the combined result as CSV. to_csv does not create missing directories,
# so make sure the target folder exists first (no-op when it already does).
RESULT_PATH.parent.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(RESULT_PATH, index=False)


Columns in flights_df: Index(['travelCode', 'userCode', 'from', 'to', 'flightType', 'price', 'time',
       'distance', 'agency', 'date'],
      dtype='object')
Columns in hotels_df: Index(['travelCode', 'userCode', 'name', 'place', 'days', 'price', 'total',
       'date'],
      dtype='object')
Columns in users_df: Index(['code', 'company', 'name', 'gender', 'age'], dtype='object')
First few rows of flights_df:
    travelCode  userCode                from                  to  flightType   
0           0         0         Recife (PE)  Florianopolis (SC)  firstClass  \
1           0         0  Florianopolis (SC)         Recife (PE)  firstClass   
2           1         0       Brasilia (DF)  Florianopolis (SC)  firstClass   
3           1         0  Florianopolis (SC)       Brasilia (DF)  firstClass   
4           2         0        Aracaju (SE)       Salvador (BH)  firstClass   

     price  time  distance       agency        date  
0  1434.38  1.76    676.53  FlyingDrops  09/26/2019  


In [4]:

# Summarize the combined DataFrame: column names, non-null counts, and dtypes
combined_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81104 entries, 0 to 81103
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   travelCode       81104 non-null  int64  
 1   userCode_flight  81104 non-null  int64  
 2   from             81104 non-null  object 
 3   to               81104 non-null  object 
 4   flightType       81104 non-null  object 
 5   price_flight     81104 non-null  float64
 6   time             81104 non-null  float64
 7   distance         81104 non-null  float64
 8   agency           81104 non-null  object 
 9   date_flight      81104 non-null  object 
 10  userCode_hotel   81104 non-null  int64  
 11  name_x           81104 non-null  object 
 12  place            81104 non-null  object 
 13  days             81104 non-null  int64  
 14  price_hotel      81104 non-null  float64
 15  total            81104 non-null  float64
 16  date_hotel       81104 non-null  object 
 17  code        