# MUFG DataScience Challenge 2025 - Baseline Random Forest Model

This notebook builds a baseline Random Forest classifier that predicts whether a crowdfunding project will succeed, and evaluates it with the F1 score.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import warnings
# NOTE(review): with no category or module argument this filter silences every
# warning raised afterwards, including deprecation notices from pandas and
# scikit-learn. Consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Data Loading and Preprocessing

In [2]:
# Read both competition CSVs from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Report the frame sizes, then the share of each `final_status` label in train.
print(
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    "\nTarget distribution in training data:",
    sep="\n",
)
print(train_df['final_status'].value_counts(normalize=True))

Training data shape: (75690, 13)
Test data shape: (32439, 12)

Target distribution in training data:
final_status
0    0.680473
1    0.319527
Name: proportion, dtype: float64


## 2. Feature Engineering

In [3]:
def engineer_features(df):
    """Return a copy of ``df`` extended with derived modelling columns.

    The input frame is not modified. It must provide the columns ``deadline``,
    ``state_changed_at``, ``created_at``, ``launched_at`` (Unix timestamps in
    seconds), ``name``, ``desc``, ``keywords`` (text), ``goal`` (numeric) and
    ``disable_communication`` (castable to int).

    Columns appended, in this order:
      * ``<timestamp>_dt`` for each of the four timestamp columns
      * ``campaign_duration``, ``prep_time`` (whole days)
      * ``launch_year``, ``launch_month``, ``launch_day_of_week``, ``launch_quarter``
      * ``name_length``, ``desc_length``, ``keywords_length`` (characters)
      * ``name_word_count``, ``desc_word_count`` (whitespace-separated tokens)
      * ``log_goal``, ``goal_per_day``
      * ``disable_communication_int``
    """
    out = df.copy()

    # Parse every epoch-seconds column into a parallel datetime column.
    # `state_changed_at_dt` is created here but no feature below reads it.
    for ts_name in ('deadline', 'state_changed_at', 'created_at', 'launched_at'):
        out[ts_name + '_dt'] = pd.to_datetime(out[ts_name], unit='s')

    launched = out['launched_at_dt']

    # Whole-day spans: launch -> deadline, and creation -> launch.
    out['campaign_duration'] = (out['deadline_dt'] - launched).dt.days
    out['prep_time'] = (launched - out['created_at_dt']).dt.days

    # Calendar components of the launch moment.
    launch_parts = (
        ('launch_year', 'year'),
        ('launch_month', 'month'),
        ('launch_day_of_week', 'dayofweek'),
        ('launch_quarter', 'quarter'),
    )
    for feature_name, dt_attr in launch_parts:
        out[feature_name] = getattr(launched.dt, dt_attr)

    # Character counts first, then token counts, for the free-text columns.
    for text_col in ('name', 'desc', 'keywords'):
        out[f'{text_col}_length'] = out[text_col].str.len()
    for text_col in ('name', 'desc'):
        out[f'{text_col}_word_count'] = out[text_col].str.split().str.len()

    # Goal on a log scale, and goal spread over the campaign length. The
    # duration is floored at one day so same-day campaigns do not divide by 0.
    goal = out['goal']
    out['log_goal'] = np.log1p(goal)
    out['goal_per_day'] = goal / out['campaign_duration'].clip(lower=1)

    # Integer copy of the communication flag.
    out['disable_communication_int'] = out['disable_communication'].astype(int)

    return out

# Run the same transformation over both frames so train and test receive the
# same derived columns.
train_engineered, test_engineered = map(engineer_features, (train_df, test_df))

print(
    "Feature engineering completed!",
    f"New training data shape: {train_engineered.shape}",
    sep="\n",
)

Feature engineering completed!
New training data shape: (75690, 31)


## 3. Feature Selection and Encoding

In [4]:
# Columns handed to the model, grouped by theme. The concatenation order is the
# column order of the design matrices built below.
feature_cols = (
    # Goal amount: raw, log-scaled, and per campaign day
    ['goal', 'log_goal', 'goal_per_day']
    # Durations in days
    + ['campaign_duration', 'prep_time']
    # Launch calendar components
    + ['launch_year', 'launch_month', 'launch_day_of_week', 'launch_quarter']
    # Text size
    + ['name_length', 'desc_length', 'keywords_length']
    + ['name_word_count', 'desc_word_count']
    # Communication flag
    + ['disable_communication_int']
    # Categorical features, still raw here and encoded in the next cell
    + ['country', 'currency']
)

# Independent copies, so later encoding never writes back into the engineered frames.
X_train_raw = train_engineered.loc[:, feature_cols].copy()
y_train = train_engineered.loc[:, 'final_status'].copy()
X_test_raw = test_engineered.loc[:, feature_cols].copy()

print(f"Selected features: {len(feature_cols)}", f"Features: {feature_cols}", sep="\n")

Selected features: 17
Features: ['goal', 'log_goal', 'goal_per_day', 'campaign_duration', 'prep_time', 'launch_year', 'launch_month', 'launch_day_of_week', 'launch_quarter', 'name_length', 'desc_length', 'keywords_length', 'name_word_count', 'desc_word_count', 'disable_communication_int', 'country', 'currency']


In [5]:
# Replace each categorical column with integer codes.
categorical_features = ['country', 'currency']
label_encoders = {}

X_train = X_train_raw.copy()
X_test = X_test_raw.copy()

for cat_col in categorical_features:
    train_labels = X_train[cat_col].astype(str)
    test_labels = X_test[cat_col].astype(str)

    # The vocabulary is learned from train and test together, so transform()
    # never meets a label it has not seen.
    encoder = LabelEncoder().fit(pd.concat([train_labels, test_labels]))

    X_train[cat_col] = encoder.transform(train_labels)
    X_test[cat_col] = encoder.transform(test_labels)

    # Keep the fitted encoder so the integer codes can be mapped back later.
    label_encoders[cat_col] = encoder
    print(f"Encoded {cat_col}: {len(encoder.classes_)} unique values")

# Any value still missing at this point is imputed with 0.
X_train, X_test = X_train.fillna(0), X_test.fillna(0)

print(
    f"\nFinal training features shape: {X_train.shape}",
    f"Final test features shape: {X_test.shape}",
    sep="\n",
)

Encoded country: 11 unique values
Encoded currency: 9 unique values

Final training features shape: (75690, 17)
Final test features shape: (32439, 17)


## 4. Train/Validation Split

In [6]:
# Hold out 20% of the labelled rows for validation. Stratifying on the target
# keeps the class proportions the same in both parts, and the fixed seed makes
# the split repeatable.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(
    f"Training set shape: {X_train_split.shape}",
    f"Validation set shape: {X_val_split.shape}",
    "\nTarget distribution in training split:",
    sep="\n",
)
print(y_train_split.value_counts(normalize=True))
print("\nTarget distribution in validation split:")
print(y_val_split.value_counts(normalize=True))

Training set shape: (60552, 17)
Validation set shape: (15138, 17)

Target distribution in training split:
final_status
0    0.680473
1    0.319527
Name: proportion, dtype: float64

Target distribution in validation split:
final_status
0    0.680473
1    0.319527
Name: proportion, dtype: float64


## 5. Random Forest Model Training

In [7]:
# Baseline hyperparameters, collected in one mapping and unpacked into the
# estimator.
rf_params = {
    # Ensemble size and per-tree complexity limits
    'n_estimators': 100,
    'max_depth': 15,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    # Candidate features examined at each split
    'max_features': 'sqrt',
    # Reweight classes to handle the class imbalance
    'class_weight': 'balanced',
    # Reproducibility and parallelism (all available cores)
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)

print("Random Forest parameters:")
print(rf_model.get_params())

Random Forest parameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [8]:
# Train the model
# Only the training part of the split is used for fitting; the validation rows
# stay unseen so the F1 score computed on them later is an out-of-sample estimate.
print("Training Random Forest model...")
rf_model.fit(X_train_split, y_train_split)
print("Model training completed!")

Training Random Forest model...
Model training completed!


## 6. Model Evaluation - F1 Score

In [9]:
# Predict on the rows the forest was fitted on and on the held-out rows.
y_train_pred, y_val_pred = (
    rf_model.predict(features) for features in (X_train_split, X_val_split)
)

# F1 on each part; the difference between them is reported as the overfitting gap.
train_f1 = f1_score(y_train_split, y_train_pred)
val_f1 = f1_score(y_val_split, y_val_pred)

report_lines = [
    "=== BASELINE RANDOM FOREST RESULTS ===",
    "\nF1 Score Results:",
    f"Training F1 Score: {train_f1:.4f}",
    f"Validation F1 Score: {val_f1:.4f}",
    f"Overfitting Check: {train_f1 - val_f1:.4f} (lower is better)",
]
print("\n".join(report_lines))

=== BASELINE RANDOM FOREST RESULTS ===

F1 Score Results:
Training F1 Score: 0.7251
Validation F1 Score: 0.5234
Overfitting Check: 0.2017 (lower is better)


In [10]:
# Per-class precision / recall / F1 on the validation rows.
print("\n=== VALIDATION SET DETAILED RESULTS ===")
print("\nClassification Report:")
print(classification_report(y_val_split, y_val_pred))

# Confusion matrix: rows are true labels, columns are predicted labels.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_val_split, y_val_pred)
print(cm)

# Flattening the 2x2 matrix row by row yields TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix Breakdown:")
print(f"True Negatives (Correctly predicted failures): {tn}")
print(f"False Positives (Incorrectly predicted successes): {fp}")
print(f"False Negatives (Incorrectly predicted failures): {fn}")
print(f"True Positives (Correctly predicted successes): {tp}")


=== VALIDATION SET DETAILED RESULTS ===

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.66      0.72     10301
           1       0.46      0.61      0.52      4837

    accuracy                           0.64     15138
   macro avg       0.62      0.64      0.62     15138
weighted avg       0.68      0.64      0.65     15138


Confusion Matrix:
[[6792 3509]
 [1879 2958]]

Confusion Matrix Breakdown:
True Negatives (Correctly predicted failures): 6792
False Positives (Incorrectly predicted successes): 3509
False Negatives (Incorrectly predicted failures): 1879
True Positives (Correctly predicted successes): 2958


## 7. Feature Importance Analysis

In [11]:
# Pair every input column with the importance reported by the fitted
# validation-split forest, most important first.
feature_importance = (
    pd.DataFrame({'feature': X_train.columns, 'importance': rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

print("\n=== TOP 10 MOST IMPORTANT FEATURES ===")
print(feature_importance.head(10))

# The same ten rows, with the importance expressed as a percentage rounded to
# two decimals.
print("\nTop 10 Features (as percentages):")
top10 = feature_importance.head(10).assign(
    importance_pct=lambda frame: (frame['importance'] * 100).round(2)
)
for feature_name, pct in zip(top10['feature'], top10['importance_pct']):
    print(f"{feature_name}: {pct}%")


=== TOP 10 MOST IMPORTANT FEATURES ===
              feature  importance
4           prep_time    0.132005
2        goal_per_day    0.097957
1            log_goal    0.092254
0                goal    0.089533
3   campaign_duration    0.081529
10        desc_length    0.076999
5         launch_year    0.074134
9         name_length    0.068796
11    keywords_length    0.062604
13    desc_word_count    0.058237

Top 10 Features (as percentages):
prep_time: 13.2%
goal_per_day: 9.8%
log_goal: 9.23%
goal: 8.95%
campaign_duration: 8.15%
desc_length: 7.7%
launch_year: 7.41%
name_length: 6.88%
keywords_length: 6.26%
desc_word_count: 5.82%


## 8. Test Set Predictions

In [12]:
# Refit on the full labelled training set to obtain the final model.
# The hyperparameters are read back from the validated `rf_model` instead of
# being typed out a second time, so the model used for the submission cannot
# silently drift from the configuration that was evaluated above.
print("Retraining model on full training dataset...")
rf_final = RandomForestClassifier(**rf_model.get_params(deep=False))

rf_final.fit(X_train, y_train)

# F1 of the final model on the very rows it was fitted on. This is an in-sample
# figure; the validation F1 remains the estimate of out-of-sample performance.
y_train_final_pred = rf_final.predict(X_train)
train_final_f1 = f1_score(y_train, y_train_final_pred)

print(f"Final model F1 score on full training set: {train_final_f1:.4f}")

Retraining model on full training dataset...
Final model F1 score on full training set: 0.7113


In [13]:
# Hard class labels for the submission, plus the class-1 (success) probability.
test_predictions = rf_final.predict(X_test)
test_probabilities = rf_final.predict_proba(X_test)[:, 1]

# Tally each predicted class once and reuse the counts in the report.
n_test = len(test_predictions)
n_success = test_predictions.sum()
n_failure = n_test - n_success

print("Test predictions summary:")
print(f"Total test samples: {n_test}")
print(f"Predicted successes: {n_success} ({n_success/n_test*100:.1f}%)")
print(f"Predicted failures: {n_failure} ({n_failure/n_test*100:.1f}%)")

Test predictions summary:
Total test samples: 32439
Predicted successes: 14015 (43.2%)
Predicted failures: 18424 (56.8%)


## 9. Create Submission File

In [14]:
# Two-column submission: the test ids next to the predicted labels.
# `test_predictions` is positional, so it lines up with the rows of `test_df`.
submission = test_df[['id']].assign(final_status=test_predictions)

# Write without the index so the CSV holds exactly the two columns.
submission.to_csv('baseline_rf_submission.csv', index=False)

print("Submission file created: baseline_rf_submission.csv")
print("\nSubmission preview:")
print(submission.head(10))
print(f"\nSubmission shape: {submission.shape}")

Submission file created: baseline_rf_submission.csv

Submission preview:
           id  final_status
0  test_00000             1
1  test_00001             0
2  test_00002             1
3  test_00003             0
4  test_00004             1
5  test_00005             1
6  test_00006             1
7  test_00007             1
8  test_00008             0
9  test_00009             1

Submission shape: (32439, 2)


## 10. Model Summary

In [15]:
# Final recap of the run. Configuration values are read from the fitted final
# model rather than hard-coded, so the summary cannot go stale when the
# hyperparameters are changed in an earlier cell.
final_params = rf_final.get_params(deep=False)
n_test = len(test_predictions)
n_success = test_predictions.sum()  # labels are 0/1, so the sum counts predicted successes

print("\n" + "="*50)
print("BASELINE RANDOM FOREST MODEL SUMMARY")
print("="*50)
print(f"\n📊 PERFORMANCE METRICS:")
print(f"   Training F1 Score: {train_f1:.4f}")
print(f"   Validation F1 Score: {val_f1:.4f}")
print(f"   Final Training F1 Score: {train_final_f1:.4f}")

print(f"\n🔧 MODEL CONFIGURATION:")
print(f"   Algorithm: Random Forest")
print(f"   Number of trees: {final_params['n_estimators']}")
print(f"   Max depth: {final_params['max_depth']}")
print(f"   Class weight: {final_params['class_weight']}")
print(f"   Features used: {len(feature_cols)}")

# `feature_importance` was computed from the validation-split model (`rf_model`).
print(f"\n📈 TOP 3 IMPORTANT FEATURES:")
for rank, (_, row) in enumerate(feature_importance.head(3).iterrows(), start=1):
    print(f"   {rank}. {row['feature']}: {row['importance']*100:.2f}%")

print(f"\n📋 PREDICTIONS:")
print(f"   Test set size: {n_test}")
print(f"   Predicted successes: {n_success} ({n_success/n_test*100:.1f}%)")
print(f"   Submission file: baseline_rf_submission.csv")

print(f"\n💡 BASELINE ESTABLISHED!")
print(f"   Validation F1 Score: {val_f1:.4f}")
print(f"   This serves as the benchmark for model improvements.")


BASELINE RANDOM FOREST MODEL SUMMARY

📊 PERFORMANCE METRICS:
   Training F1 Score: 0.7251
   Validation F1 Score: 0.5234
   Final Training F1 Score: 0.7113

🔧 MODEL CONFIGURATION:
   Algorithm: Random Forest
   Number of trees: 100
   Max depth: 15
   Class weight: balanced
   Features used: 17

📈 TOP 3 IMPORTANT FEATURES:
   1. prep_time: 13.20%
   2. goal_per_day: 9.80%
   3. log_goal: 9.23%

📋 PREDICTIONS:
   Test set size: 32439
   Predicted successes: 14015 (43.2%)
   Submission file: baseline_rf_submission.csv

💡 BASELINE ESTABLISHED!
   Validation F1 Score: 0.5234
   This serves as the benchmark for model improvements.
