In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import joblib
import warnings

# Silence every warning category for the rest of the session. Note that this
# also hides deprecation and data-quality warnings raised by the libraries
# used below.
warnings.filterwarnings('ignore')
# Disable pandas' chained-assignment (SettingWithCopy) check.
pd.set_option('mode.chained_assignment', None)

# --- Step 1: Data Loading and Initial Feature Engineering ---

print("Step 1: Loading data and feature engineering...")

# All five input tables are read from the current working directory. A missing
# file is reported with a hint and the original error is re-raised so the cell
# stops instead of continuing with partial data.
try:
    # low_memory=False makes pandas parse orders.csv in a single pass.
    orders = pd.read_csv('orders.csv', low_memory=False)
    vendors, train_locations, test_locations, sample_submission = (
        pd.read_csv(csv_name)
        for csv_name in (
            'vendors.csv',
            'train_locations.csv',
            'test_locations.csv',
            'SampleSubmission.csv',
        )
    )
except FileNotFoundError as missing_file:
    print(f"\nError: {missing_file}. Please ensure all CSV files are in the same directory as this notebook.")
    raise

# --- Create a Haversine distance function ---
def _as_radians(value):
    """Convert a scalar or array-like of degrees to float radians (missing -> NaN)."""
    if np.ndim(value) == 0:
        # Scalars: None and pd.NA cannot be cast to float directly, so map any
        # missing marker to NaN explicitly.
        degrees = np.float64(np.nan) if pd.isnull(value) else np.float64(value)
    else:
        degrees = np.asarray(value, dtype=float)
    return np.radians(degrees)


def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points using the haversine formula.

    Works element-wise: each coordinate may be a scalar or an array-like
    (list, NumPy array, pandas Series) and the four inputs are broadcast
    against each other, so whole columns can be passed in one call.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude / longitude of the first point(s), in decimal degrees.
    lat2, lon2 : scalar or array-like
        Latitude / longitude of the second point(s), in decimal degrees.
    radius_km : float, default 6371.0
        Sphere radius; the result is expressed in the same unit.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) along the sphere. Any position where one of the four
        coordinates is missing (None / NaN) yields NaN.
    """
    lat1, lon1, lat2, lon2 = (_as_radians(coord) for coord in (lat1, lon1, lat2, lon2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make arcsin return NaN; NaN inputs stay NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# --- Step 2: Create a balanced training dataset with positive and negative samples ---

print("Step 2: Creating positive and negative samples for training...")

# Attach customer locations to the orders and keep only orders whose customer
# has at least one known location.
# NOTE(review): the join key is customer_id alone, so an order is repeated once
# per location that customer has in train_locations. Confirm this fan-out is
# intended; if orders.csv records the ordering location, join on it as well.
orders = orders.merge(train_locations, on=['customer_id'], how='left', suffixes=('_orders', '_loc'))
orders = orders.dropna(subset=['location_number'])
orders['location_number'] = orders['location_number'].astype(int)

# 1. Positive Samples (Actual Orders)

orders['target'] = 1
positive_samples = orders[['customer_id', 'vendor_id', 'location_number', 'target']].copy()

# The vendor table's key column is called 'id'; rename it so it can be joined
# against 'vendor_id' (a no-op if the column was already renamed).
vendors.rename(columns={'id': 'vendor_id'}, inplace=True)
unique_vendors = vendors['vendor_id'].unique()

# 2. Negative Samples (Non-Orders)
#
# Random (customer, location, vendor) combinations whose (customer, vendor)
# pair never occurs among the positives. Candidates are drawn in batches and
# filtered with an anti-join instead of one pair per Python loop iteration.
# Each negative carries a real location of the sampled customer, so the later
# join with train_locations finds coordinates for it just like for positives.

np.random.seed(42)
MAX_SAMPLING_ROUNDS = 100  # upper bound so sampling cannot spin forever

customer_locations = (
    positive_samples[['customer_id', 'location_number']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# De-duplicated so the left join below never multiplies candidate rows.
ordered_pairs = positive_samples[['customer_id', 'vendor_id']].drop_duplicates()
negative_samples_count = len(positive_samples)

negative_batches = []
negatives_collected = 0
for _ in range(MAX_SAMPLING_ROUNDS):
    still_needed = negative_samples_count - negatives_collected
    if still_needed <= 0:
        break
    picked_rows = np.random.randint(0, len(customer_locations), size=still_needed)
    candidates = customer_locations.iloc[picked_rows].reset_index(drop=True)
    candidates['vendor_id'] = np.random.choice(unique_vendors, size=still_needed)
    # Anti-join: keep only candidates whose (customer, vendor) pair was never ordered.
    flagged = candidates.merge(ordered_pairs, on=['customer_id', 'vendor_id'], how='left', indicator=True)
    accepted = flagged.loc[flagged['_merge'] == 'left_only', ['customer_id', 'vendor_id', 'location_number']]
    negative_batches.append(accepted)
    negatives_collected += len(accepted)

if negatives_collected < negative_samples_count:
    print(f"Warning: only {negatives_collected} of {negative_samples_count} negative samples could be drawn.")

if negative_batches:
    negative_samples = pd.concat(negative_batches, ignore_index=True)
else:
    # No positives -> no negatives; keep the positive frame's columns and dtypes.
    negative_samples = positive_samples.iloc[0:0].drop(columns='target')
negative_samples['target'] = 0
negative_samples = negative_samples[['customer_id', 'vendor_id', 'location_number', 'target']]

training_data = pd.concat([positive_samples, negative_samples], ignore_index=True)


# --- Step 3: Feature Engineering for Training and Test Data ---
print("Step 3: Merging features for training and test data...")

# Training rows: add vendor attributes first, then the customer's location.
# Columns present in both frames get the '_vendor' / '_cust' suffixes.
training_data = (
    training_data
    .merge(vendors, on='vendor_id', how='left')
    .merge(train_locations, on=['customer_id', 'location_number'], how='left', suffixes=('_vendor', '_cust'))
)

# Test rows come from the submission template: its composite key is split on
# ' X ' into customer, location number and vendor.
test_df = sample_submission.copy()
key_parts = test_df['CID X LOC_NUM X VENDOR'].str.split(' X ', expand=True)
test_df[['customer_id', 'location_number', 'vendor_id']] = key_parts
# The split yields strings; the two join keys below must be numeric.
test_df['location_number'] = pd.to_numeric(test_df['location_number'])
test_df['vendor_id'] = pd.to_numeric(test_df['vendor_id'])

# Same two joins as for the training rows, but against test_locations.
test_df = (
    test_df
    .merge(vendors, on='vendor_id', how='left')
    .merge(test_locations, on=['customer_id', 'location_number'], how='left', suffixes=('_vendor', '_cust'))
)


def _customer_vendor_km(row):
    """Haversine distance between the customer and vendor coordinates of one row."""
    return haversine_distance(
        row['latitude_cust'],
        row['longitude_cust'],
        row['latitude_vendor'],
        row['longitude_vendor'],
    )


# Distance feature, computed row by row for both sets.
training_data['distance_km'] = training_data.apply(_customer_vendor_km, axis=1)
test_df['distance_km'] = test_df.apply(_customer_vendor_km, axis=1)


# --- Step 4: Train the Final Model ---

print("Step 4: Training the model...")

# Model input columns, in the order the imputer and the classifier see them.
features = [
    'distance_km',
    'delivery_charge',
    'serving_distance',
    'is_open',
    'commission',
    'discount_percentage',
    'vendor_rating',
    'prepration_time',
    'rank',
    'one_click_vendor',
    'country_id',
    'city_id',
    'vendor_category_id',
]

# Design matrices and labels.
X_train = training_data.loc[:, features]
y_train = training_data.loc[:, 'target']
X_test = test_df.loc[:, features]

# Fill gaps with per-column means learned on the training rows only, then
# apply those same means to the test rows.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Random forest with a fixed seed; n_jobs=-1 uses all available cores.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_imputed, y_train)

# --- Step 5: Make Predictions and Create Submission File ---

print("Step 5: Generating submission file...")

# model.predict returns class labels (not probabilities) for each test row.
predicted_targets = model.predict(X_test_imputed)
test_df['target'] = predicted_targets

# The submission file keeps only the composite key and the predicted label.
submission_columns = ['CID X LOC_NUM X VENDOR', 'target']
submission = test_df.loc[:, submission_columns]
submission.to_csv('submission.csv', index=False)


print("\nAssignment complete! The submission.csv file has been created.")

Step 1: Loading data and feature engineering...
Step 2: Creating positive and negative samples for training...
Step 3: Merging features for training and test data...
Step 4: Training the model...
Step 5: Generating submission file...

Assignment complete! The submission.csv file has been created.


In [10]:
import joblib

# Persist the fitted classifier and imputer to the working directory.
# Both `model` and `imputer` must already exist in this kernel session; after
# a kernel restart the training cell has to be run again first.

try:
    joblib.dump(model, 'recommender_model.joblib')
    joblib.dump(imputer, 'imputer.joblib')
except NameError:
    # Raised when `model` or `imputer` has not been defined in this session.
    print("Error: 'model' or 'imputer' not found. Please run the full project code from start to finish first.")
except Exception as dump_error:
    # Any other failure is reported but does not stop the notebook.
    print(f"An unexpected error occurred: {dump_error}")
else:
    print("Model and imputer saved successfully!")

Model and imputer saved successfully!


In [None]:
import os
# Show the kernel's current working directory.
current_working_dir = os.getcwd()
print(current_working_dir)