In [635]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [636]:
#Read and preprocess data

def wrangle(filepath):
    """Load a campaign CSV and return a one-hot encoded feature frame.

    The file is read with ``pd.read_csv``, rows are indexed by ``campaign_id``,
    the ``is_timer`` column is removed, and the six categorical columns are
    replaced by their dummy (indicator) columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns in their original order, followed by the
        dummy columns (grouped in the order of ``cat_cols``).

    Raises
    ------
    KeyError
        If ``campaign_id``, ``is_timer`` or any categorical column is missing.
    """
    cat_cols = ["product", "category", "times_of_day", "day_of_week", "sender", "target_audience"]

    #Read the file and index the rows by campaign
    raw = pd.read_csv(filepath).set_index("campaign_id")

    #"is_timer" is removed; per the original author's note it carries a single value
    trimmed = raw.drop(columns=["is_timer"])

    #Cast to categorical first so that numerically coded columns are encoded as well
    typed = trimmed.astype({col: "category" for col in cat_cols})
    dummies = pd.get_dummies(typed[cat_cols])

    #Everything that is not categorical is kept as-is, dummies are appended on the right
    remaining = typed.drop(columns=cat_cols)
    return pd.concat([remaining, dummies], axis="columns")

In [637]:
# Load the training file. X and df are deliberately the same object: later cells
# add columns through `df` and expect to see them through `X`.
X = df = wrangle("train_F3fUq2S.csv")

# Split the target off; pop() removes it from the shared frame in place.
y = df.pop("click_rate")


In [638]:
# Sanity check: (rows, columns) of the training frame after the target was popped.
df.shape

(1888, 111)

In [639]:
X_test = wrangle("jobathontest.csv")

# Train and test are one-hot encoded separately, so a category level that occurs in
# only one file produces a dummy column the other frame lacks. Reconcile both frames.

#Creating missing columns in train data
# Done in place on purpose: X is an alias of df and must see the new columns too.
for col in X_test.columns:
    if col not in df.columns:
        df[col] = 0

#Creating missing columns in test data AND matching the training column order
# reindex() adds train-only columns filled with 0 and puts every column in the same
# position as in the training frame. Adding columns one by one (as before) left the
# two frames with the same column set but a different order, so the test features
# did not line up with the order the models are fitted on.
X_test = X_test.reindex(columns=df.columns, fill_value=0)

print("The number of columns in X is: ", X.shape[1])
print("The number of columns in X_test is: ", X_test.shape[1])

The number of columns in X is:  117
The number of columns in X_test is:  117


In [640]:
#Splitting into train and validation sets
# 10% of the rows are held out for validation; the seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [641]:
#Random_forest (first pass, all columns)

from sklearn.ensemble import RandomForestRegressor

# Base estimator with a fixed random_state so repeated fits give the same forest.
rf = RandomForestRegressor(random_state=42)

# Every list holds exactly one value, so the "grid" is a single configuration;
# GridSearchCV is used here only to cross-validate it and refit on X_train.
rf_params = {
    "n_estimators": [92],
    "max_depth": [7],
    "max_features": [0.4],
    "max_samples": [0.9],
    "min_impurity_decrease": [0.00001],
}

# fit() returns the search object itself, so construction and fitting can be chained.
rf1_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5).fit(X_train, y_train)

# R^2 on the rows the model was fitted on ...
y_pred = rf1_grid.predict(X_train)
r2_rf = r2_score(y_train, y_pred)
print("The training r2_score for the rf model is :", round(r2_rf, 4))

# ... and on the held-out validation rows.
print(
    "The validation r2_score for the rf model is :",
    round(r2_score(y_valid, rf1_grid.predict(X_valid)), 4),
)

The training r2_score for the rf model is : 0.6987
The validation r2_score for the rf model is : 0.6144


In [642]:
#XGBoost model (first pass, all columns)

from xgboost import XGBRegressor

# Base booster: 22 trees, fixed seed, alpha = 1.
xgbr = XGBRegressor(random_state=42, n_estimators=22, alpha=1)

# Single-value lists: the search evaluates one configuration with 5-fold CV.
# NOTE(review): "reg:logistic" presumably relies on click_rate lying in [0, 1] — confirm.
xgbr_params = {
    "eta": [0.4],
    "gamma": [0.025],
    "max_depth": [12],
    "colsample_bytree": [0.4],
    "colsample_bylevel": [0.7],
    "colsample_bynode": [0.9],
    "min_child_weight": [0.01],
    "subsample": [1.0],
    "lambda": [0.05],
    "objective": ["reg:logistic"],
}

# fit() returns the search object itself, so construction and fitting can be chained.
xgbr1_grid = GridSearchCV(estimator=xgbr, param_grid=xgbr_params, cv=5).fit(X_train, y_train)

# R^2 on the training rows ...
y_pred = xgbr1_grid.predict(X_train)
r2_xgbr = r2_score(y_train, y_pred)
print("The training r2_score for the xgbr model is :", round(r2_xgbr, 4))

# ... and on the held-out validation rows.
print(
    "The validation r2_score for the xgbr model is :",
    round(r2_score(y_valid, xgbr1_grid.predict(X_valid)), 4),
)

The training r2_score for the xgbr model is : 0.7336
The validation r2_score for the xgbr model is : 0.6835


In [643]:
# Test-set predictions from the first (all-column) XGBoost search.
xgbr1_pred = xgbr1_grid.predict(X_test)


In [644]:
# Rank the features of the first random-forest fit by importance, highest first.
# Each entry of `imp` is an (importance rounded to 4 decimals, column name) tuple.
# The fitted search from the random-forest cell is `rf1_grid`; the previously used
# name `rf_grid` is not defined anywhere in this notebook and fails on a fresh kernel.
names = X_train.columns
importances = rf1_grid.best_estimator_.feature_importances_
imp = sorted(
    ((round(score, 4), name) for score, name in zip(importances, names)),
    reverse=True,
)


# Remove insignificant columns and reiterate

In [645]:
# Start again from a freshly wrangled training frame; X stays an alias of df so
# that columns added through `df` later are visible through `X`.
df = wrangle("train_F3fUq2S.csv")
y = df.pop("click_rate")
X = df

# Columns whose (rounded) random-forest importance is below 0.015 are treated as
# insignificant. Iterating over `imp` directly avoids the former hard-coded
# range(0, 117), which breaks as soon as the number of features changes.
non_sig = [name for score, name in imp if score < 0.015]

In [646]:
X_test = wrangle("jobathontest.csv")

#Creating missing columns in train data
# Done in place on purpose: X is an alias of df and must see the new columns too.
for col in X_test.columns:
    if col not in df.columns:
        df[col] = 0

# Remove the low-importance columns from the training features (in place, so the
# df/X alias stays intact).
X.drop(columns = non_sig, inplace = True)

#Creating missing columns in test data, dropping the pruned ones, matching the order
# reindex() keeps exactly the columns that survive in X, in X's order, and fills
# columns the test file never produced with 0. The previous column-by-column
# approach left X and X_test with the same columns but in a different order.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

print("The number of columns in X is: ", X.shape[1])
print("The number of columns in X_test is: ", X_test.shape[1])

The number of columns in X is:  15
The number of columns in X_test is:  15


In [647]:
#Splitting into train and validation sets
# 10% of the rows are held out for validation; the seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [648]:
#Random_forest (second pass, reduced column set)

from sklearn.ensemble import RandomForestRegressor

# Base estimator with a fixed random_state so repeated fits give the same forest.
rf = RandomForestRegressor(random_state=42)

# Single-value lists again: one configuration, cross-validated and refit on X_train.
rf_params = {
    "n_estimators": [92],
    "max_depth": [18],
    "max_features": [0.5],
    "max_samples": [1.0],
    "min_impurity_decrease": [0.00001],
}

# fit() returns the search object itself, so construction and fitting can be chained.
rf2_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5).fit(X_train, y_train)

# R^2 on the rows the model was fitted on ...
y_pred = rf2_grid.predict(X_train)
r2_rf = r2_score(y_train, y_pred)
print("The training r2_score for the rf model is :", round(r2_rf, 4))

# ... and on the held-out validation rows.
print(
    "The validation r2_score for the rf model is :",
    round(r2_score(y_valid, rf2_grid.predict(X_valid)), 4),
)

The training r2_score for the rf model is : 0.8298
The validation r2_score for the rf model is : 0.7102


In [649]:
#XGBoost model (second pass, reduced column set)

from xgboost import XGBRegressor

# Base booster: 600 trees, fixed seed, alpha = 1.
xgbr = XGBRegressor(random_state=42, n_estimators=600, alpha=1)

# Single-value lists: the search evaluates one configuration with 5-fold CV.
# NOTE(review): "reg:logistic" presumably relies on click_rate lying in [0, 1] — confirm.
xgbr_params = {
    "eta": [0.4],
    "gamma": [0.01],
    "max_depth": [12],
    "colsample_bytree": [0.8],
    "colsample_bylevel": [0.7],
    "colsample_bynode": [0.9],
    "min_child_weight": [0.01],
    "subsample": [0.9],
    "lambda": [0.01],
    "objective": ["reg:logistic"],
}

# fit() returns the search object itself, so construction and fitting can be chained.
xgbr2_grid = GridSearchCV(estimator=xgbr, param_grid=xgbr_params, cv=5).fit(X_train, y_train)

# R^2 on the training rows ...
y_pred = xgbr2_grid.predict(X_train)
r2_xgbr = r2_score(y_train, y_pred)
print("The training r2_score for the xgbr model is :", round(r2_xgbr, 4))

# ... and on the held-out validation rows.
print(
    "The validation r2_score for the xgbr model is :",
    round(r2_score(y_valid, xgbr2_grid.predict(X_valid)), 4),
)

The training r2_score for the xgbr model is : 0.8307
The validation r2_score for the xgbr model is : 0.7546


# Testing

In [656]:
# Test-set predictions from both second-pass models.
# NOTE(review): rf2_pred is computed but not used in the blend below — confirm
# whether the random-forest predictions were meant to be included.
rf2_pred = rf2_grid.predict(X_test)
xgbr2_pred = xgbr2_grid.predict(X_test)

# Attach the XGBoost predictions to the sample submission by campaign_id instead of
# by row position, so a differently ordered submission file cannot silently pair a
# prediction with the wrong campaign. X_test is indexed by campaign_id (set in
# wrangle), which is what the predictions are aligned on.
sub = pd.read_csv("sample_submission_LJ2N3ZQ.csv").set_index("campaign_id")
if not sub.index.sort_values().equals(X_test.index.sort_values()):
    raise ValueError("campaign_id values of the sample submission and the test features differ")
sub["click_rate"] = pd.Series(xgbr2_pred, index=X_test.index)

# Blend with a previously saved submission: DataFrame arithmetic aligns on
# campaign_id and on the column name, then the two predictions are averaged.
pred_mean = pd.read_csv("submission_mean_encoder_rf_xgb.csv").set_index("campaign_id")
average_pred = (pred_mean + sub)/2

In [657]:
# Preview the first rows of the blended predictions.
average_pred.head()

Unnamed: 0_level_0,click_rate
campaign_id,Unnamed: 1_level_1
1889,0.049453
1890,0.64162
1891,0.180942
1892,0.180942
1893,0.115953


In [658]:
# Write the blended predictions to disk; the campaign_id index is written along with click_rate.
average_pred.to_csv("final.csv")

In [632]:
# Preview the un-blended XGBoost submission.
# NOTE(review): this cell's execution count (632) is lower than the cells above, so
# the output shown beneath it predates the final run — re-run to refresh it.
sub.head()

Unnamed: 0_level_0,click_rate
campaign_id,Unnamed: 1_level_1
1889,0.021883
1890,0.261223
1891,0.137917
1892,0.136437
1893,0.027368
