In [4]:
import pandas as pd

# Load the original dataset
original_df = pd.read_csv("training_set_VU_DM.csv")
original_test_set = pd.read_csv("test_set_VU_DM.csv")
df = original_df.copy()
df_test = original_test_set.copy()

In [7]:
df['has_hist_starrating'] = df['visitor_hist_starrating'].notnull().astype(int)
df['has_hist_adr'] = df['visitor_hist_adr_usd'].notnull().astype(int)
df['has_affinity'] = df['srch_query_affinity_score'].notnull().astype(int)

median_star = df['visitor_hist_starrating'].median()
median_adr  = df['visitor_hist_adr_usd'].median()
median_aff = df['srch_query_affinity_score'].median()

df['visitor_hist_starrating'].fillna(median_star, inplace=True)
df['visitor_hist_adr_usd'].fillna(median_adr,  inplace=True)
df['srch_query_affinity_score'].fillna(median_aff, inplace=True)

df_test['has_hist_starrating'] = df_test['visitor_hist_starrating'].notnull().astype(int)
df_test['has_hist_adr'] = df_test['visitor_hist_adr_usd'].notnull().astype(int)
df_test['has_affinity'] = df_test['srch_query_affinity_score'].notnull().astype(int)

median_star_test = df_test['visitor_hist_starrating'].median()
median_adr_test  = df_test['visitor_hist_adr_usd'].median()
median_aff_test = df_test['srch_query_affinity_score'].median()

df_test['visitor_hist_starrating'].fillna(median_star_test, inplace=True)
df_test['visitor_hist_adr_usd'].fillna(median_adr_test,  inplace=True)
df_test['srch_query_affinity_score'].fillna(median_aff_test, inplace=True)

# Drop columns with more than 70% missing values
missing_threshold = 0.7 
missing_fractions = df.isnull().mean()
columns_to_drop = missing_fractions[missing_fractions > missing_threshold].index.tolist()
df.drop(columns=columns_to_drop, axis=1, inplace=True, errors='ignore')
df_test.drop(columns=columns_to_drop, axis=1, inplace=True, errors='ignore')
print(f"Dropped columns due to missing values > {missing_threshold*100}%: {columns_to_drop}")

print("Missing values before cleaning:")
print(df.isnull().sum()[df.isnull().sum() > 0])
print("Missing values before cleaning: test")
print(df_test.isnull().sum()[df_test.isnull().sum() > 0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['visitor_hist_starrating'].fillna(median_star, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['visitor_hist_adr_usd'].fillna(median_adr,  inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object

Dropped columns due to missing values > 70.0%: ['gross_bookings_usd']
Missing values before cleaning:
Series([], dtype: int64)
Missing values before cleaning: test
Series([], dtype: int64)


In [8]:
# Handling missing values of remaining variables
if "prop_review_score" in df.columns:
    df["prop_review_score"] = df["prop_review_score"].fillna(df["prop_review_score"].median())
    df_test["prop_review_score"] = df_test["prop_review_score"].fillna(df_test["prop_review_score"].median())

if "prop_location_score2" in df.columns:
    df["prop_location_score2"] = df["prop_location_score2"].fillna(0)
    df_test["prop_location_score2"] = df_test["prop_location_score2"].fillna(0)

if "orig_destination_distance" in df.columns:
    df["orig_destination_distance"] = df["orig_destination_distance"].fillna(df["orig_destination_distance"].median())
    df_test["orig_destination_distance"] = df_test["orig_destination_distance"].fillna(df_test["orig_destination_distance"].median())

# Handling competitor information
for i in range(1, 9):
    rate_col = f"comp{i}_rate"
    inv_col = f"comp{i}_inv"
    diff_col = f"comp{i}_rate_percent_diff"

    for col in [rate_col, inv_col, diff_col]:
        if col in df.columns:
            fill_val = 0 if "percent_diff" not in col else 0.0
            df[col] = df[col].fillna(fill_val)
            df_test[col] = df_test[col].fillna(fill_val)

print("Missing values after cleaning:")
print(df.isnull().sum()[df.isnull().sum() > 0])

print("Missing values after cleaning: test")
print(df_test.isnull().sum()[df_test.isnull().sum() > 0])

Missing values after cleaning:
Series([], dtype: int64)
Missing values after cleaning: test
Series([], dtype: int64)


In [9]:
# Save to a new file
df.to_csv("training_set_cleaned.csv", index=False)
df_test.to_csv("test_set_cleaned.csv", index=False)