In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load training and test datasets
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# Define helper function to extract date features
def extract_date_features(dataframe):
    dataframe['release_year'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.year
    dataframe['release_month'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.month
    dataframe['release_weekday'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.weekday

# Step 1: Create 'log_revenue' in training data
train_data['log_revenue'] = np.log(train_data['revenue'].replace(0, np.nan))

# Handle missing values in training data
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = train_data.select_dtypes(include=['object']).columns

train_data[numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].median())
train_data[categorical_columns] = train_data[categorical_columns].fillna("Unknown")

# Step 2: Feature engineering for dates in training data
extract_date_features(train_data)

# Step 3: One-hot encode categorical features in training data
categorical_to_encode = ['main_genre', 'language']
existing_categoricals = [col for col in categorical_to_encode if col in train_data.columns]
train_data = pd.get_dummies(train_data, columns=existing_categoricals, drop_first=True)

# Step 4: Scale numeric features in training data
scaler = StandardScaler()
numeric_features = ['budget', 'popularity', 'runtime', 'release_year', 'release_month', 'release_weekday']
numeric_features = [col for col in numeric_features if col in train_data.columns]
train_data[numeric_features] = scaler.fit_transform(train_data[numeric_features])

# Step 5: Define features and target for training
target = "log_revenue"
features = train_data.columns.difference([target, 'revenue', 'belongs_to_collection', 'genres', 'homepage',
                                          'imdb_id', 'original_language', 'original_title', 'overview',
                                          'poster_path', 'production_companies', 'production_countries',
                                          'release_date', 'spoken_languages', 'status', 'tagline', 'title',
                                          'Keywords', 'cast', 'crew'])

X_train = train_data[features]
y_train = train_data[target]

# Step 6: Calculate Baseline RMSE
baseline_prediction = np.mean(y_train)  # Baseline prediction is the mean of log_revenue
baseline_rmse = np.sqrt(mean_squared_error(y_train, [baseline_prediction] * len(y_train)))
print(f"Baseline RMSE (Mean Prediction): {baseline_rmse}")


# Step 6: Train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 7: Evaluate the model on training data
y_train_pred = rf_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Root Mean Squared Error (RMSE) on Training Data: {train_rmse}")

# Step 8: Use test data to make predictions
# Apply the same feature engineering steps to the test data
extract_date_features(test_data)

# Handle missing values in the test dataset
test_numeric_columns = [col for col in numeric_features if col in test_data.columns]
test_data[test_numeric_columns] = test_data[test_numeric_columns].fillna(0)

# Check for existence of categorical columns before handling them
existing_categoricals = [col for col in categorical_to_encode if col in test_data.columns]
if existing_categoricals:
    test_data[existing_categoricals] = test_data[existing_categoricals].fillna("Unknown")
    test_data = pd.get_dummies(test_data, columns=existing_categoricals, drop_first=True)

# Align test data columns to match training data's features
X_test = test_data.reindex(columns=features, fill_value=0)

# Scale numeric features in test data
if test_numeric_columns:
    X_test[test_numeric_columns] = scaler.transform(X_test[test_numeric_columns])

# Predict on the test dataset
test_data['predicted_log_revenue'] = rf_model.predict(X_test)

# Convert predicted log revenue to revenue
test_data['predicted_revenue'] = np.exp(test_data['predicted_log_revenue'])

# Display results
print(test_data[['id', 'predicted_revenue']])


  dataframe['release_year'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.year
  dataframe['release_month'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.month
  dataframe['release_weekday'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.weekday


Baseline RMSE (Mean Prediction): 3.120879515582972
Root Mean Squared Error (RMSE) on Training Data: 0.8746765521215467


  dataframe['release_year'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.year
  dataframe['release_month'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.month
  dataframe['release_weekday'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.weekday


       id  predicted_revenue
0    2175       1.347346e+04
1    2564       9.773048e+06
2    1493       1.719902e+07
3     585       8.367687e+05
4    2038       1.343614e+04
..    ...                ...
595  1588       6.026098e+07
596   439       1.887605e+07
597  2908       1.063776e+07
598  1179       8.921070e+04
599  1443       1.121211e+07

[600 rows x 2 columns]


In [6]:
from xgboost import XGBRegressor


xgb_model = XGBRegressor(
    n_estimators=200,       # More trees for better learning
    learning_rate=0.05,     # Smaller step size for better accuracy
    max_depth=8,            # Allow deeper trees if data is complex
    subsample=0.8,          # Use a portion of training data for regularization
    colsample_bytree=0.8,   # Use a portion of features for each tree
    random_state=42,
)

# Fit the model on training data
xgb_model.fit(X_train, y_train)

# Step 7: Evaluate the model on training data
y_train_pred = xgb_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Root Mean Squared Error (RMSE) on Training Data with XGBoost: {train_rmse}")

# Step 8: Use test data to make predictions
# Predictions on the test dataset
test_data['predicted_log_revenue'] = xgb_model.predict(X_test)

# Convert predicted log revenue to revenue
test_data['predicted_revenue'] = np.exp(test_data['predicted_log_revenue'])

# Display results
print(test_data[['id', 'predicted_revenue']])

# Prepare submission DataFrame
submission = test_data[['id', 'predicted_revenue']]

# Rename columns to match the required format
submission.columns = ['id', 'revenue']

# Save to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully in the required format!")



Root Mean Squared Error (RMSE) on Training Data with XGBoost: 0.5695151501432907
       id  predicted_revenue
0    2175       3.789093e+04
1    2564       1.085605e+07
2    1493       1.342061e+07
3     585       1.425776e+06
4    2038       1.153648e+05
..    ...                ...
595  1588       3.259680e+07
596   439       1.906126e+07
597  2908       9.314447e+06
598  1179       9.386971e+04
599  1443       7.702862e+06

[600 rows x 2 columns]
Submission file 'submission.csv' created successfully in the required format!


In [7]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Load training and test datasets
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# Define helper function to extract date features
def extract_date_features(dataframe):
    dataframe['release_year'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.year
    dataframe['release_month'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.month
    dataframe['release_weekday'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.weekday

# Step 1: Create 'log_revenue' in training data
train_data['log_revenue'] = np.log(train_data['revenue'].replace(0, np.nan))

# Step 2: Feature engineering for dates in training data
extract_date_features(train_data)

# Step 3: Handle missing values
# For XGBoost, we retain NaN values instead of filling them
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = train_data.select_dtypes(include=['object']).columns

train_data[categorical_columns] = train_data[categorical_columns].fillna("Unknown")

# Step 4: One-hot encode categorical features in training data
categorical_to_encode = ['main_genre', 'language']
existing_categoricals = [col for col in categorical_to_encode if col in train_data.columns]
train_data = pd.get_dummies(train_data, columns=existing_categoricals, drop_first=True)

# Step 5: Define features and target for training
target = "log_revenue"
features = train_data.columns.difference([target, 'revenue', 'belongs_to_collection', 'genres', 'homepage',
                                          'imdb_id', 'original_language', 'original_title', 'overview',
                                          'poster_path', 'production_companies', 'production_countries',
                                          'release_date', 'spoken_languages', 'status', 'tagline', 'title',
                                          'Keywords', 'cast', 'crew'])

X_train = train_data[features]
y_train = train_data[target]

# Step 6: XGBoost Grid Search for Hyperparameter Tuning
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters from GridSearch
print("Best Hyperparameters:", grid_search.best_params_)

# Train the XGBoost model with best parameters
best_xgb_model = grid_search.best_estimator_

# Step 7: Evaluate the model on training data
y_train_pred = best_xgb_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Root Mean Squared Error (RMSE) on Training Data: {train_rmse}")

# Step 8: Use test data to make predictions
# Apply the same feature engineering steps to the test data
extract_date_features(test_data)

# Check for existence of categorical columns before handling them
existing_categoricals = [col for col in categorical_to_encode if col in test_data.columns]
if existing_categoricals:
    test_data[existing_categoricals] = test_data[existing_categoricals].fillna("Unknown")
    test_data = pd.get_dummies(test_data, columns=existing_categoricals, drop_first=True)

# Align test data columns to match training data's features
X_test = test_data.reindex(columns=features, fill_value=0)

# Predict on the test dataset
test_data['predicted_log_revenue'] = best_xgb_model.predict(X_test)

# Convert predicted log revenue to revenue
test_data['predicted_revenue'] = np.exp(test_data['predicted_log_revenue'])

# Display results
print(test_data[['id', 'predicted_revenue']])


  dataframe['release_year'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.year
  dataframe['release_month'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.month
  dataframe['release_weekday'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.weekday


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=0.8; total time=   0.1s
[CV] END 

  dataframe['release_year'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.year
  dataframe['release_month'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.month
  dataframe['release_weekday'] = pd.to_datetime(dataframe['release_date'], errors='coerce').dt.weekday
