In [11]:
## Importing Libraries
import pandas as pd

## XGBoost model
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from imblearn.over_sampling import SMOTE

## SKLearn libraries
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score, mean_squared_error, r2_score
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import OneHotEncoder, StandardScaler


## Ideation and Approach: 

1. I will be considering only those who donated and based on the donation, I will be estimating the expected revenue.
2. I will be using the XGBoost model to predict the donation amount for 6, 12, 18 and 24 months.
3. Will be building different XGBoost models to predict revenue for different months

#### 1. Loading and validating the data

In [2]:
donor_df = pd.read_csv('../Data/cleaned_data/data_for_modeling.csv')
donor_df = donor_df[donor_df['donated'] == 1]

In [3]:
donor_df.shape

(987, 30)

In [4]:
donor_df.columns

Index(['TARGET_B', 'TARGET_D6', 'TARGET_D12', 'TARGET_D18', 'TARGET_D24',
       'CONTROL_NUMBER', 'MONTHS_SINCE_ORIGIN', 'NUMBER_OF_RESP', 'DONOR_AGE',
       'IN_HOUSE', 'URBANICITY', 'CLUSTER_CODE', 'HOME_OWNER', 'DONOR_GENDER',
       'INCOME_GROUP', 'PUBLISHED_PHONE', 'WEALTH_RATING', 'MEDIAN_HOME_VALUE',
       'MEDIAN_HOUSEHOLD_INCOME', 'PCT_OWNER_OCCUPIED', 'PEP_STAR',
       'RECENT_STAR_STATUS', 'RECENCY_FREQ_STATUS',
       'RECENT_CARD_RESPONSE_PROP', 'MONTHS_SINCE_LAST_PROM_RESP',
       'LAST_GIFT_AMT', 'NUMBER_PROM_12', 'MONTHS_SINCE_LAST_GIFT',
       'MONTHS_SINCE_FIRST_GIFT', 'donated'],
      dtype='object')

#### 2. Making the dataset for the model

In [6]:
X = donor_df.drop(columns=['TARGET_B', 'TARGET_D6', 'TARGET_D12', 'TARGET_D18', 'TARGET_D24', 'donated'])
y_6 = donor_df['TARGET_D6']
y_12 = donor_df['TARGET_D12']
y_18 = donor_df['TARGET_D18']
y_24 = donor_df['TARGET_D24']

#### Processing the data

In [8]:
# Split the data into training+validation and test sets (80-20 split)
X_train_val, X_test, y_train_val_6, y_test_6 = train_test_split(X, y_6, test_size=0.2, random_state=7)
_, _, y_train_val_12, y_test_12 = train_test_split(X, y_12, test_size=0.2, random_state=7)
_, _, y_train_val_18, y_test_18 = train_test_split(X, y_18, test_size=0.2, random_state=7)
_, _, y_train_val_24, y_test_24 = train_test_split(X, y_24, test_size=0.2, random_state=7)

# Split the training+validation set into training and validation sets (75-25 split)
X_train, X_val, y_train_6, y_val_6 = train_test_split(X_train_val, y_train_val_6, test_size=0.15, random_state=7)
_, _, y_train_12, y_val_12 = train_test_split(X_train_val, y_train_val_12, test_size=0.15, random_state=7)
_, _, y_train_18, y_val_18 = train_test_split(X_train_val, y_train_val_18, test_size=0.15, random_state=7)
_, _, y_train_24, y_val_24 = train_test_split(X_train_val, y_train_val_24, test_size=0.15, random_state=7)

In [10]:
## Selecting categorical and numerical features to use in the pipeline
cat_to_transform = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns 

## Defining the pipeline
# Scaling numerical features and encoding categorical features

preprocessor = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), numerical_features),
        ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_to_transform)
    ]
)

# Apply the transformations to the training, validation, and test sets
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)



#### Training the model

In [16]:
# Train a regression model for each time period
xgb_model_6 = XGBRegressor(n_estimators=500, 
                                max_depth=15,
                                learning_rate=0.05,
                                 n_jobs=-1,
                                 random_state=7)
xgb_model_6.fit(X_train_transformed, y_train_6)

xgb_model_12 = XGBRegressor(n_estimators=500, 
                                max_depth=15,
                                learning_rate=0.05,
                                 n_jobs=-1,
                                 random_state=7)
xgb_model_12.fit(X_train_transformed, y_train_12)

xgb_model_18 = XGBRegressor(n_estimators=500, 
                                max_depth=15,
                                learning_rate=0.05,
                                 n_jobs=-1,
                                 random_state=7)
xgb_model_18.fit(X_train_transformed, y_train_18)

xgb_model_24 = XGBRegressor(n_estimators=500, 
                                max_depth=15,
                                learning_rate=0.05,
                                 n_jobs=-1,
                                 random_state=7)
xgb_model_24.fit(X_train_transformed, y_train_24)

##### Model Evaluation

In [17]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

In [18]:
# Evaluate each model
mse_6, r2_6 = evaluate_model(xgb_model_6, X_val_transformed, y_val_6)
mse_12, r2_12 = evaluate_model(xgb_model_12, X_val_transformed, y_val_12)
mse_18, r2_18 = evaluate_model(xgb_model_18, X_val_transformed, y_val_18)
mse_24, r2_24 = evaluate_model(xgb_model_24, X_val_transformed, y_val_24)

print(f"6-months: MSE = {mse_6}, R2 = {r2_6}")
print(f"12-months: MSE = {mse_12}, R2 = {r2_12}")
print(f"18-months: MSE = {mse_18}, R2 = {r2_18}")
print(f"24-months: MSE = {mse_24}, R2 = {r2_24}")

6-months: MSE = 157799.96906646877, R2 = -0.30142331260039
12-months: MSE = 1274562.6688466398, R2 = -0.12804561817273719
18-months: MSE = 2958108.4606587244, R2 = -0.2996013075000741
24-months: MSE = 2821463.0745740645, R2 = -0.008328069304253782


#### Evaluating model on Unseen data

In [19]:
# Make predictions for future revenue on the test set
y_pred_6 = xgb_model_6.predict(X_test_transformed)
y_pred_12 = xgb_model_12.predict(X_test_transformed)
y_pred_18 = xgb_model_18.predict(X_test_transformed)
y_pred_24 = xgb_model_24.predict(X_test_transformed)

# Create a DataFrame to hold the predictions
predictions = pd.DataFrame({
    'Actual_6_months': y_test_6,
    'Predicted_6_months': y_pred_6,
    'Actual_12_months': y_test_12,
    'Predicted_12_months': y_pred_12,
    'Actual_18_months': y_test_18,
    'Predicted_18_months': y_pred_18,
    'Actual_24_months': y_test_24,
    'Predicted_24_months': y_pred_24
})

# Display the predictions
predictions.head()

Unnamed: 0,Actual_6_months,Predicted_6_months,Actual_12_months,Predicted_12_months,Actual_18_months,Predicted_18_months,Actual_24_months,Predicted_24_months
19314,150.0,287.095978,700.0,947.118896,1250.0,1160.505981,1550.0,2092.333252
16895,241.5,184.412216,1368.5,908.324829,2254.0,907.931702,2334.5,1732.090942
16954,30.0,348.921661,150.0,966.011536,270.0,1200.976318,290.0,2078.562988
19606,243.0,511.125458,660.9,931.696045,911.3,958.272644,1074.09,1366.987183
16724,50.0,396.565155,325.0,1274.235107,600.0,1659.787964,655.0,1925.692749
