In [6]:
# Show the path of the Python interpreter that is running this kernel.
import sys
print(sys.executable)

C:\Users\hilak\anaconda3\python.exe


In [7]:
# List the python executables found on PATH (Windows `where` command).
!where python

C:\Users\hilak\anaconda3\envs\booking_kaggel_compatitiom\python.exe
C:\Users\hilak\anaconda3\python.exe
C:\Users\hilak\AppData\Local\Microsoft\WindowsApps\python.exe


In [8]:
!pip install fastparquet
!pip install pyarrow



In [None]:
# Notebook: booking_competition.ipynb

In [9]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack, save_npz, load_npz
from tqdm import tqdm
from joblib import dump, load
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow
import fastparquet

ModuleNotFoundError: No module named 'pyarrow'

In [2]:
# Read the three raw training tables (reviews, users, matches) from ../data.
print("Loading data...")
reviews, users, matches = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_reviews.csv',
        '../data/train_users.csv',
        '../data/train_matches.csv',
    )
)

Loading data...


In [3]:
# Preprocessing data
print("Preprocessing data...")

# Replace missing review text with explicit placeholders so every row has a
# string in each text column.
reviews['review_title'] = reviews['review_title'].fillna("No Title")
reviews['review_positive'] = reviews['review_positive'].fillna("No Positive Review")
reviews['review_negative'] = reviews['review_negative'].fillna("No Negative Review")

# Missing guest countries get a single sentinel category.
users['guest_country'] = users['guest_country'].fillna("Unknown")

# Cap stays at MAX_ROOM_NIGHTS. `clip` is the vectorized equivalent of applying
# `min(x, 30)` to every row and, like it, leaves missing values untouched.
MAX_ROOM_NIGHTS = 30
users['room_nights'] = users['room_nights'].clip(upper=MAX_ROOM_NIGHTS)

Preprocessing data...


In [4]:
# Inner-join matches -> reviews (on review_id) -> users (on user_id), then
# build a single free-text field from the three review text columns.
# NOTE(review): if these tables share columns other than the join key, pandas
# suffixes them with _x/_y - confirm the column names used later still exist.
data = (
    matches
    .merge(reviews, on='review_id', how='inner')
    .merge(users, on='user_id', how='inner')
)
data['text'] = (
    data['review_title']
    + " " + data['review_positive']
    + " " + data['review_negative']
)

In [5]:
# Persist the merged frame next to the notebook so later cells can reload it.
processed_path = "processed_data.parquet"
data.to_parquet(processed_path)
print(f"Data saved as '{processed_path}'")

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
# Reload the merged frame from disk
data = pd.read_parquet("processed_data.parquet")

# Summary statistics of the numeric columns
basic_stats = data.describe()
print(basic_stats)

In [None]:
# Hold out 20% of the distinct users, so each user's rows land entirely in
# either the train split or the test split.
unique_users = data['user_id'].unique()
train_users, test_users = train_test_split(
    unique_users,
    test_size=0.2,
    random_state=42,
)

in_train = data['user_id'].isin(train_users)
in_test = data['user_id'].isin(test_users)
train_data = data[in_train]
test_data = data[in_test]

# Write both splits to ../data
for split_frame, split_path in (
    (train_data, "../data/train_data.parquet"),
    (test_data, "../data/test_data.parquet"),
):
    split_frame.to_parquet(split_path)
print("Train and test data saved.")

In [None]:
# Build the training feature matrix: TF-IDF of the review text stacked with a
# handful of numeric columns, labelled by user_id. Everything is written to
# ../data so the later cells can reload it.

# Load train data
train_data = pd.read_parquet("../data/train_data.parquet")

# Sample 20% of train data (fixed seed, so the same rows are drawn every run)
train_data_sample = train_data.sample(frac=0.2, random_state=42)

# TF-IDF for text features, limited to 2000 vocabulary terms.
# tqdm only wraps the iterable to show progress while the documents are read.
vectorizer = TfidfVectorizer(max_features=2000)
X_train_text = vectorizer.fit_transform(tqdm(train_data_sample['text'], desc="Processing Train Text Features"))

# Numeric features; `numeric_features` is reused when the test features are built.
# Missing values are replaced with 0.
numeric_features = ['review_score', 'room_nights', 'accommodation_score', 'accommodation_star_rating',
                    'location_is_ski', 'location_is_beach', 'location_is_city_center']
X_train_numeric = train_data_sample[numeric_features].fillna(0).values

# Combine features: text columns first, numeric columns appended on the right.
X_train = hstack([X_train_text, X_train_numeric])
# The classification target is the user_id of each sampled row.
y_train = train_data_sample['user_id'].values

# Save features, labels and the fitted vectorizer
save_npz('../data/X_train.npz', X_train)
np.save('../data/y_train.npy', y_train)
dump(vectorizer, '../data/vectorizer.joblib')
print("Features and vectorizer saved.")

In [None]:
# Fit a small random forest (10 trees, depth <= 10, all cores, fixed seed)
# on the stacked training features.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Persist the fitted model for the evaluation cells
dump(model, '../data/random_forest_model.joblib')
print("Model saved.")

In [None]:
# Calculate MRR@10
def calculate_mrr_at_k(y_true, y_pred, k=10):
    """Compute the Mean Reciprocal Rank of the true label within the top-k predictions.

    Args:
        y_true: Iterable of ground-truth labels, one per sample.
        y_pred: Iterable of ranked predictions, one sequence per sample
            (best first). Each row may be a NumPy array, list or tuple.
        k: Only the first ``k`` predictions of each row are considered.

    Returns:
        float: Mean over samples of ``1 / rank``, where ``rank`` is the
        1-based position of the true label in the row's top-k predictions,
        counting 0 for rows where it is absent. Returns 0.0 when there are
        no samples.
    """
    reciprocal_ranks = []
    for true_label, preds in zip(y_true, y_pred):
        # np.asarray is a no-op for ndarrays and lets plain lists/tuples work too.
        top_k = np.asarray(preds)[:k].tolist()
        try:
            reciprocal_ranks.append(1.0 / (top_k.index(true_label) + 1))
        except ValueError:
            # True label is not among the top-k predictions for this sample.
            reciprocal_ranks.append(0.0)
    if not reciprocal_ranks:
        # Avoid NaN (and a RuntimeWarning) from averaging an empty list.
        return 0.0
    return float(np.mean(reciprocal_ranks))

In [None]:
# Build the test feature matrix with the same layout as the training one:
# TF-IDF text columns first, numeric columns appended on the right.

# Load test data
test_data = pd.read_parquet("../data/test_data.parquet")

# Prepare test features: reuse the vectorizer fitted on the training sample
# (transform only, no refit) so the text columns line up with X_train.
vectorizer = load('../data/vectorizer.joblib')
X_test_text = vectorizer.transform(tqdm(test_data['text'], desc="Processing Test Text Features"))

# `numeric_features` is the list defined in the training-feature cell, which
# must have been run first. Missing values are replaced with 0.
X_test_numeric = test_data[numeric_features].fillna(0).values
X_test = hstack([X_test_text, X_test_numeric])
# Ground-truth user_id for every test row.
y_test = test_data['user_id'].values

# Save test features
save_npz('../data/X_test.npz', X_test)
np.save('../data/y_test.npy', y_test)

In [None]:
# Predict probabilities
model = load('../data/random_forest_model.joblib')
y_test_prob = model.predict_proba(X_test)

# Column positions of the 10 highest-probability classes per row, best first.
top_k_predictions = np.argsort(y_test_prob, axis=1)[:, -10:][:, ::-1]

# predict_proba columns are ordered like model.classes_, so the argsort output
# is a class *position*, not a user_id. Translate positions into the actual
# labels before comparing them with y_test.
top_k_labels = model.classes_[top_k_predictions]
mrr_at_10 = calculate_mrr_at_k(y_test, top_k_labels)
# NOTE(review): the train/test split in this notebook is by user, so test
# user_ids may never occur in model.classes_; if so this metric is 0 by
# construction - confirm the intended prediction target.

# Save evaluation results
metrics = {'mrr_at_10': mrr_at_10}
with open('metrics.json', 'w') as f:
    json.dump(metrics, f)

print(f"MRR@10: {mrr_at_10:.4f}")

In [None]:
# Assemble the submission: one row per test record with its accommodation_id,
# user_id and ten review columns (review_1 .. review_10).
submission = pd.DataFrame({
    'accommodation_id': test_data['accommodation_id'].values,
    'user_id': test_data['user_id'].values
})

# NOTE(review): top_k_predictions is used here as positional indices into the
# test review_id column - verify that this is what those values represent.
review_ids = np.array(test_data['review_id'])
for rank in range(1, 11):
    submission[f'review_{rank}'] = review_ids[top_k_predictions[:, rank - 1]]

submission.to_csv("submission.csv", index=False)
print("Submission file saved.")