In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import numpy as np

from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import graphviz

def plot_decision_tree(clf, feature_names, class_names):
    # This function requires the pydotplus module and assumes it's been installed.
    # In some cases (typically under Windows) even after running conda install, there is a problem where the
    # pydotplus module is not found when running from within the notebook environment.  The following code
    # may help to guarantee the module is installed in the current notebook environment directory.
    #
    # import sys; sys.executable
    # !{sys.executable} -m pip install pydotplus

    export_graphviz(clf, out_file="adspy_temp.dot", feature_names=feature_names, class_names=class_names, filled = True, impurity = False)
    with open("adspy_temp.dot") as f:
        dot_graph = f.read()
    # Alternate method using pydotplus, if installed.
    # graph = pydotplus.graphviz.graph_from_dot_data(dot_graph)
    # return graph.create_png()
    return graphviz.Source(dot_graph)

def plot_feature_importances(clf, feature_names):
    c_features = len(feature_names)
    plt.barh(range(c_features), clf.feature_importances_)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature name")
    plt.yticks(np.arange(c_features), feature_names)
    
# feature selection
def select_features(X_train, y_train, X_test, k):
    # configure to select all features
    fs = SelectKBest(score_func=f_regression, k='all')
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # transform train input data
    X_train_fs = fs.transform(X_train)
    # transform test input data
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [2]:
df = pd.read_csv("train.csv")
df.drop("Unnamed: 0",axis=1,inplace=True)
df = df[df["days"] != -1]
df = df[df["height"] >= 26 ]
df = df[df["width"] >= 242 ]
df

Unnamed: 0,campaign_id,chain_id,start_date,end_date,format,device,height,width,iremoteid,days,start_day,end_day,shop,budget
0,8963,12,2019-07-21,2019-07-24,banner,DESKTOP,200.0,995.0,['31834'],5.0,7.0,3.0,0.0,3579.344177
1,11875,11,2020-09-01,2020-09-05,butterfly,DESKTOP,486.0,278.0,['C75204'],6.0,2.0,6.0,0.0,5251.781250
2,25899,11,2020-10-29,2020-11-24,butterfly,DESKTOP,488.0,265.0,['CB9645'],28.0,4.0,2.0,0.0,9814.411865
3,25458,10,2020-12-09,2020-12-15,banner,DESKTOP,150.0,1200.0,['C111392'],8.0,3.0,2.0,0.0,11428.571429
4,35293,12,2019-11-14,2019-12-26,banner,DESKTOP,200.0,995.0,['34341'],44.0,4.0,4.0,0.0,9328.322937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6315,33169,12,2019-07-03,2019-07-21,banner,DESKTOP,200.0,995.0,['31150'],20.0,3.0,7.0,0.0,10250.965576
6316,306,11,2020-04-23,2020-05-18,butterfly,DESKTOP,488.0,270.0,"['CB7692', 'CB7693']",27.0,4.0,1.0,0.0,8835.427002
6317,32436,11,2020-11-09,2020-11-17,butterfly,DESKTOP,486.0,273.0,['C104835'],10.0,1.0,2.0,0.0,7412.225952
6318,12732,12,2021-04-27,2021-07-04,butterfly,DESKTOP,298.0,398.0,['C144616'],70.0,2.0,7.0,0.0,19275.559998


In [3]:
# load the dataset
y = df["budget"]
X = df.iloc[:,:len(df.columns)-1]
X["surface"] = X["width"]*X["height"]
X['campaign_id']=X['campaign_id'].astype('category').cat.codes
X['chain_id']=X['chain_id'].astype('category').cat.codes
X['iremoteid']=X['iremoteid'].astype('category').cat.codes
X['shop']=X['shop'].astype('category').cat.codes
X["start_year"] = X["start_date"].apply(lambda m: m.split("-")[0]).astype('category').cat.codes
X["end_year"] = X["end_date"].apply(lambda m: m.split("-")[0]).astype('category').cat.codes
X["start_month"] = X["start_date"].apply(lambda m: m.split("-")[1]).astype('category').cat.codes
X["end_month"] = X["end_date"].apply(lambda m: m.split("-")[1]).astype('category').cat.codes
X.drop(["start_date","end_date"],axis=1,inplace=True)
X["end_day"]=X["end_day"].astype('category').cat.codes
X["start_day"]=X["start_day"].astype('category').cat.codes
X["format"]=X["format"].astype('category').cat.codes
X["device"]=X["device"].astype('category').cat.codes
X=X[["days","surface","format",'start_month']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
X

Unnamed: 0,days,surface,format,start_month
0,5.0,199000.0,0,6
1,6.0,135108.0,1,8
2,28.0,129320.0,1,9
3,8.0,180000.0,0,11
4,44.0,199000.0,0,10
...,...,...,...,...
6315,20.0,199000.0,0,6
6316,27.0,131760.0,1,3
6317,10.0,132678.0,1,10
6318,70.0,118604.0,1,3


In [4]:
from sklearn.compose import make_column_selector

cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)
cat_selector(X)

[]

In [5]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

cat_tree_processor = OrdinalEncoder()
num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

tree_preprocessor = make_column_transformer(
    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector)
)
tree_preprocessor


ColumnTransformer(transformers=[('simpleimputer',
                                 SimpleImputer(add_indicator=True),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A751F0>),
                                ('ordinalencoder', OrdinalEncoder(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A3BD30>)])

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
num_linear_processor = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)

linear_preprocessor = make_column_transformer(
    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector)
)
linear_preprocessor

ColumnTransformer(transformers=[('pipeline',
                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler()),
                                                 ('simpleimputer',
                                                  SimpleImputer(add_indicator=True))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A751F0>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A3BD30>)])

In [18]:
from xgboost import XGBRegressor

lasso_pipeline = make_pipeline(tree_preprocessor,XGBRegressor(n_estimators=100, max_depth=5, eta=1.00, subsample=0.73,learning_rate=0.01, colsample_bytree=0.8,alpha = 0.01).fit(X_train,y_train))
lasso_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A751F0>),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A3BD30>)])),
                ('xgbregressor',
                 XGBR...
                              eta=1.0, gamma=0, gpu_id=-1, importance_type=None,
                              interaction_constraints='', learning_rate=0.01,
                              max_delta_step=0, max_depth=5, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                       

In [19]:
from sklearn.neighbors import KNeighborsRegressor

rf_pipeline = make_pipeline(tree_preprocessor, KNeighborsRegressor(n_neighbors = 70, weights = 'uniform'))
rf_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A751F0>),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A3BD30>)])),
                ('kneighborsregressor', KNeighborsRegressor(n_neighbors=70))])

In [20]:
from sklearn.ensemble import RandomForestRegressor

gbdt_pipeline = make_pipeline(
    tree_preprocessor, RandomForestRegressor(n_estimators=190,bootstrap=False,max_depth=15,min_samples_leaf=3)
)
gbdt_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A751F0>),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001A824A3BD30>)])),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=False, max_depth=15,
                                       min_samples_leaf=3, n_estimators=190))])

In [21]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

estimators = [
    ("Random Forest", rf_pipeline),
    ("Lasso", lasso_pipeline),
    ("Gradient Boosting", gbdt_pipeline),
]

stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stacking_regressor.fit(X_test,y_test)

y_predicted = stacking_regressor.predict(X_train)
RMSE = mean_squared_error(y_train, y_predicted, squared=False)
print("RMSE train:" , RMSE)
y_predicted = stacking_regressor.predict(X_test)
RMSE = mean_squared_error(y_test, y_predicted, squared=False)

print("RMSE:" , RMSE)

RMSE train: 5944.612145640256
RMSE: 2709.4073579369815


In [12]:
df_to_predict = pd.read_csv("predictions.csv")
df_to_predict.drop("Unnamed: 0",axis=1,inplace=True)
df_to_predict["surface"] = df_to_predict["width"]*df_to_predict["height"]
df_to_predict['campaign_id']=df_to_predict['campaign_id'].astype('category').cat.codes
df_to_predict['chain_id']=df_to_predict['chain_id'].astype('category').cat.codes
df_to_predict['iremoteid']=df_to_predict['iremoteid'].astype('category').cat.codes
df_to_predict['shop']=df_to_predict['shop'].astype('category').cat.codes
df_to_predict["start_date"]=df_to_predict["start_date"].astype('category').cat.codes
df_to_predict["end_date"]=df_to_predict["end_date"].astype('category').cat.codes
df_to_predict["end_day"]=df_to_predict["end_day"].astype('category').cat.codes
df_to_predict["start_day"]=df_to_predict["start_day"].astype('category').cat.codes
df_to_predict["format"]=df_to_predict["format"].astype('category').cat.codes
df_to_predict["device"]=df_to_predict["device"].astype('category').cat.codes
df_to_predict = df_to_predict[["days","surface","format",'start_month']]
y_predicted = stacking_regressor.predict(df_to_predict)
df_to_submit = pd.read_csv("sample_submission.csv")
df_to_submit["budget"] = y_predicted
df_to_submit.to_csv("3ejja.csv",index=False)

KeyError: "['start_month'] not in index"

In [None]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(l1=0.1 )
model.fit(X_train, y_train)

y_predicted = model.predict(X_train)
RMSE = mean_squared_error(y_train, y_predicted, squared=False)
print("RMSE train:" , RMSE)
y_predicted = model.predict(X_test)
RMSE = mean_squared_error(y_test, y_predicted, squared=False)

print("RMSE:" , RMSE)
