In [939]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV


In [940]:
def wrangle(filepath):
    """Load a campaign CSV and return it indexed by ``campaign_id``.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's contents with ``campaign_id`` as the index and the
        ``is_timer`` column removed.

    Raises
    ------
    KeyError
        If the file has no ``campaign_id`` column or no ``is_timer`` column.
    """
    # NOTE(review): `is_timer` is dropped unconditionally; presumably it
    # carries no signal in this dataset -- confirm before reusing elsewhere.
    return (
        pd.read_csv(filepath)
        .set_index("campaign_id")
        .drop(columns=["is_timer"])
    )

In [941]:
# Load the labelled training campaigns and the unlabelled test campaigns
# through the same cleaning function so both frames share one schema.
train_path, test_path = "train_F3fUq2S.csv", "jobathontest.csv"
df = wrangle(train_path)
df_test = wrangle(test_path)

In [942]:
# Global mean click rate over the training data (6 decimal places); used
# below as the encoding for categories that never occur in the training set.
mean_rate = round(df["click_rate"].mean(), 6)


In [943]:
# Mean-encode `product`: replace each product id with the average click rate
# observed for that product in the training data.
# NOTE(review): the means are computed on all of `df`, i.e. before the
# train/validation split done later in the notebook, so validation rows
# contribute to their own encoding.
prod_means = df.groupby("product")["click_rate"].mean().to_dict()

# Products that occur only in the test set have no training mean; give them
# the global mean click rate instead.
absent_products = list(set(df_test["product"].unique()).difference(df["product"].unique()))
prod_means.update(dict.fromkeys(absent_products, mean_rate))

df["product"] = df["product"].map(prod_means)
df_test["product"] = df_test["product"].map(prod_means)

In [944]:
# Mean-encode `category`: replace each category with the average click rate
# observed for it in the training data.
cat_means = df.groupby("category")["click_rate"].mean().to_dict()
df["category"] = df["category"].map(cat_means)
# A category that appears only in the test set has no entry in `cat_means`
# and would map to NaN; fall back to the global mean click rate, matching
# how unseen products and senders are handled.
df_test["category"] = df_test["category"].map(cat_means).fillna(mean_rate)

In [945]:
# One-hot encode `times_of_day` in the training frame.
times_dummies = pd.get_dummies(df.pop("times_of_day"))
df = pd.concat([df, times_dummies], axis="columns")

# Encode the test frame against the *training* dummy columns: a level missing
# from the test set becomes an all-zero column, a level unseen in training is
# dropped, and the column order always matches the training frame. Encoding
# the two frames independently only lines up when both happen to contain
# exactly the same levels.
train_dummy_cols = times_dummies.columns
times_dummies = pd.get_dummies(df_test.pop("times_of_day")).reindex(
    columns=train_dummy_cols, fill_value=0
)
df_test = pd.concat([df_test, times_dummies], axis="columns")


In [946]:
# Mean-encode `day_of_week`: replace each day with the average click rate
# observed for it in the training data.
day_means = df.groupby("day_of_week")["click_rate"].mean().to_dict()
df["day_of_week"] = df["day_of_week"].map(day_means)
# A day that appears only in the test set has no entry in `day_means` and
# would map to NaN; fall back to the global mean click rate, matching how
# unseen products and senders are handled.
df_test["day_of_week"] = df_test["day_of_week"].map(day_means).fillna(mean_rate)

In [947]:
# Mean-encode `sender`: replace each sender id with the average click rate
# observed for that sender in the training data.
sender_means = df.groupby("sender")["click_rate"].mean().to_dict()

# Senders that occur only in the test set have no training mean; give them
# the global mean click rate instead.
absent_senders = list(set(df_test["sender"].unique()).difference(df["sender"].unique()))
sender_means.update(dict.fromkeys(absent_senders, mean_rate))

df["sender"] = df["sender"].map(sender_means)
df_test["sender"] = df_test["sender"].map(sender_means)

In [948]:
# Mean-encode `target_audience`: replace each audience id with the average
# click rate observed for it in the training data.
audience_means = df.groupby("target_audience")["click_rate"].mean().to_dict()
df["target_audience"] = df["target_audience"].map(audience_means)
# An audience that appears only in the test set has no entry in
# `audience_means` and would map to NaN; fall back to the global mean click
# rate, matching how unseen products and senders are handled.
df_test["target_audience"] = (
    df_test["target_audience"].map(audience_means).fillna(mean_rate)
)

In [949]:
#df["no_of_CTA"] = 1/(df["no_of_CTA"]+1)

In [950]:
# Sanity check: count missing values per column of the encoded test frame.
# Any non-zero count means an encoding step above produced NaN.
df_test.isna().sum()

sender                0
subject_len           0
body_len              0
mean_paragraph_len    0
day_of_week           0
is_weekend            0
category              0
product               0
no_of_CTA             0
mean_CTA_len          0
is_image              0
is_personalised       0
is_quote              0
is_emoticons          0
is_discount           0
is_price              0
is_urgency            0
target_audience       0
Evening               0
Morning               0
Noon                  0
dtype: int64

In [951]:
# Flags excluded from the feature set.
# NOTE(review): the name suggests these were judged insignificant; the
# analysis behind that choice is not part of this notebook.
unsig_cols = ["is_discount", "is_price", "is_urgency"]

# Separate the target from the features. `X` is the same object as `df`, so
# the in-place drops below also change `df`.
y = df.pop("click_rate")
X = df
for frame in (X, df_test):
    frame.drop(columns=unsig_cols, inplace=True)

# Hold out 10% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [952]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline. Every grid entry holds a single value, so the
# "search" amounts to an 8-fold cross-validation of one configuration
# followed by a refit on all of X_train.
rf = RandomForestRegressor(random_state=42)
rf_params = dict(
    max_depth=[12],
    max_features=[0.4],
    max_samples=[1.0],
    n_estimators=[25],
    min_impurity_decrease=[0.00002],
)
rf_grid = GridSearchCV(rf, rf_params, cv=8).fit(X_train, y_train)

# R^2 on the data the model was fitted on, and on the held-out validation rows.
y_pred = rf_grid.predict(X_train)
r2_rf = r2_score(y_train, y_pred)
r2_rf_valid = r2_score(y_valid, rf_grid.predict(X_valid))
print("The training r2_score for the rf model is :", round(r2_rf, 4))
print("The validation r2_score for the rf model is :", round(r2_rf_valid, 4))

The training r2_score for the rf model is : 0.7861
The validation r2_score for the rf model is : 0.6393


In [953]:
# Random-forest predictions for the test campaigns.
y_pred1 = rf_grid.predict(X=df_test)

In [954]:
# XGBoost model

from xgboost import XGBRegressor

# Base estimator; the remaining booster parameters are supplied through the grid.
xgbr = XGBRegressor(n_estimators=20, alpha=0, random_state=42)

# Every entry holds a single value, so the grid search cross-validates one
# configuration (5 folds) and then refits it on all of X_train.
xgbr_params = {
    "eta": [0.3],
    "gamma": [0.03],
    "max_depth": [12],
    "colsample_bytree": [0.7],
    "colsample_bylevel": [0.7],
    "colsample_bynode": [0.9],
    "min_child_weight": [0.1],
    "subsample": [0.7],
    "lambda": [0.05],
    # NOTE(review): reg:logistic expects targets in [0, 1] -- confirm that
    # click_rate is always within that range.
    "objective": ["reg:logistic"],
}

xgbr_grid = GridSearchCV(xgbr, xgbr_params, cv=5)
xgbr_grid.fit(X_train, y_train)

# R^2 on the data the model was fitted on, and on the held-out validation rows.
y_pred = xgbr_grid.predict(X_train)
r2_xgbr = r2_score(y_train, y_pred)
r2_xgbr_valid = r2_score(y_valid, xgbr_grid.predict(X_valid))
print("The training r2_score for the xgbr model is :", round(r2_xgbr, 4))
print("The validation r2_score for the xgbr model is :", round(r2_xgbr_valid, 4))

The training r2_score for the xgbr model is : 0.9642
The validation r2_score for the xgbr model is : 0.7553


In [955]:
# XGBoost predictions for the test campaigns.
y_pred2 = xgbr_grid.predict(X=df_test)

In [956]:
# Build the submission file from the XGBoost predictions.
# NOTE(review): only `y_pred2` (XGBoost) is written, although the output file
# name mentions both rf and xgb -- confirm whether a blend was intended.
sub = pd.read_csv("sample_submission_LJ2N3ZQ.csv").set_index("campaign_id")

# Align predictions to the submission rows by campaign_id rather than by
# position, so a different row order in the sample file cannot silently
# attach predictions to the wrong campaigns.
preds_by_campaign = pd.Series(y_pred2, index=df_test.index)
sub["click_rate"] = preds_by_campaign.reindex(sub.index)
if sub["click_rate"].isna().any():
    raise ValueError(
        "sample submission contains campaign_ids with no prediction from the test set"
    )

sub.to_csv("submission_mean_encoder_rf_xgb.csv")