In [1]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import export_graphviz

def plot_decision_tree(clf, feature_names, class_names):
    """Render a fitted decision tree as a Graphviz graph.

    The tree is exported to DOT text with scikit-learn's ``export_graphviz``
    (filled nodes, impurity hidden) and wrapped in a ``graphviz.Source``
    object, which is returned to the caller.

    This function needs the ``graphviz`` Python package (imported at the top
    of the notebook). pydotplus is only needed for the optional PNG route
    noted at the end of the body. If a module is reported missing from inside
    the notebook even though it was installed (typically under Windows),
    installing it with the kernel's own interpreter may help:

        import sys; sys.executable
        !{sys.executable} -m pip install graphviz

    Parameters
    ----------
    clf
        Fitted tree estimator, passed straight to ``export_graphviz``.
    feature_names
        Names used to label the split features.
    class_names
        Names used to label the target classes.

    Returns
    -------
    graphviz.Source
        Graph object built from the exported DOT text.
    """
    # out_file=None makes export_graphviz return the DOT source as a string.
    # That avoids leaving an "adspy_temp.dot" scratch file in the working
    # directory and avoids reading it back with the platform default encoding.
    dot_graph = export_graphviz(
        clf,
        out_file=None,
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        impurity=False,
    )
    # Alternate method using pydotplus, if installed.
    # graph = pydotplus.graphviz.graph_from_dot_data(dot_graph)
    # return graph.create_png()
    return graphviz.Source(dot_graph)

def plot_feature_importances(clf, feature_names):
    """Draw a horizontal bar chart of a model's feature importances.

    One bar is drawn per entry of ``feature_names``, with bar lengths taken
    from ``clf.feature_importances_`` and the names used as y-axis tick
    labels. The chart is drawn through the pyplot interface; nothing is
    returned.
    """
    # The same positions place the bars and their tick labels.
    bar_positions = np.arange(len(feature_names))
    plt.barh(bar_positions, clf.feature_importances_)
    plt.yticks(bar_positions, feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature name")
    
# feature selection
def select_features(X_train, y_train, X_test, k='all'):
    """Fit a ``SelectKBest`` selector on the training data and apply it.

    The selector uses ``f_regression`` as its scoring function. It is fitted
    on the training split only and then used to transform both splits, so the
    test data never influences the scores.

    Parameters
    ----------
    X_train
        Training feature matrix.
    y_train
        Training target values.
    X_test
        Test feature matrix, transformed with the selector fitted on
        ``X_train``.
    k
        Number of features to keep, forwarded to ``SelectKBest``. The default
        ``'all'`` keeps every feature (the previous hard-coded behaviour),
        which is useful for inspecting the scores on the returned selector.

    Returns
    -------
    tuple
        ``(X_train_fs, X_test_fs, fs)``: the transformed training data, the
        transformed test data and the fitted selector.
    """
    selector = SelectKBest(score_func=f_regression, k=k)
    # Learn the feature scores from the training data only.
    selector.fit(X_train, y_train)
    X_train_fs = selector.transform(X_train)
    X_test_fs = selector.transform(X_test)
    return X_train_fs, X_test_fs, selector

In [2]:
# Read the campaign table and discard the "Unnamed: 0" column
# (presumably an index saved by an earlier export - confirm).
campaigns = pd.read_csv("train.csv").drop(columns="Unnamed: 0")
# Keep rows whose "days" is not -1 (looks like a missing-value marker - TODO
# confirm) and whose "height" is greater than 10.
keep_rows = (campaigns["days"] != -1) & (campaigns["height"] > 10)
df = campaigns[keep_rows]
df

Unnamed: 0,campaign_id,chain_id,start_date,end_date,format,device,height,width,iremoteid,days,start_day,end_day,shop,budget
0,8963,12,2019-07-21,2019-07-24,banner,DESKTOP,200.0,995.0,['31834'],5.0,7.0,3.0,0.0,3579.344177
1,11875,11,2020-09-01,2020-09-05,butterfly,DESKTOP,486.0,278.0,['C75204'],6.0,2.0,6.0,0.0,5251.781250
2,25899,11,2020-10-29,2020-11-24,butterfly,DESKTOP,488.0,265.0,['CB9645'],28.0,4.0,2.0,0.0,9814.411865
3,25458,10,2020-12-09,2020-12-15,banner,DESKTOP,150.0,1200.0,['C111392'],8.0,3.0,2.0,0.0,11428.571429
4,35293,12,2019-11-14,2019-12-26,banner,DESKTOP,200.0,995.0,['34341'],44.0,4.0,4.0,0.0,9328.322937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6315,33169,12,2019-07-03,2019-07-21,banner,DESKTOP,200.0,995.0,['31150'],20.0,3.0,7.0,0.0,10250.965576
6316,306,11,2020-04-23,2020-05-18,butterfly,DESKTOP,488.0,270.0,"['CB7692', 'CB7693']",27.0,4.0,1.0,0.0,8835.427002
6317,32436,11,2020-11-09,2020-11-17,butterfly,DESKTOP,486.0,273.0,['C104835'],10.0,1.0,2.0,0.0,7412.225952
6318,12732,12,2021-04-27,2021-07-04,butterfly,DESKTOP,298.0,398.0,['C144616'],70.0,2.0,7.0,0.0,19275.559998


In [3]:
# load the dataset
y = df["budget"]
X = df.iloc[:,:len(df.columns)-1]
X["surface"] = X["width"]*X["height"]
X['campaign_id']=X['campaign_id'].astype('category').cat.codes
X['chain_id']=X['chain_id'].astype('category').cat.codes
X['iremoteid']=X['iremoteid'].astype('category').cat.codes
X['shop']=X['shop'].astype('category').cat.codes
X["start_date"]=X["start_date"].astype('category').cat.codes
X["end_date"]=X["end_date"].astype('category').cat.codes
X["end_day"]=X["end_day"].astype('category').cat.codes
X["start_day"]=X["start_day"].astype('category').cat.codes
X["format"]=X["format"].astype('category').cat.codes
X["device"]=X["device"].astype('category').cat.codes
X

Unnamed: 0,campaign_id,chain_id,start_date,end_date,format,device,height,width,iremoteid,days,start_day,end_day,shop,surface
0,1821,3,70,75,0,0,200.0,995.0,1322,5.0,6,2,0,199000.0
1,2279,2,398,430,1,0,486.0,278.0,5572,6.0,1,5,0,135108.0
2,4569,2,454,505,1,0,488.0,265.0,6091,28.0,3,1,0,129320.0
3,4502,1,493,526,0,0,150.0,1200.0,2215,8.0,2,1,0,180000.0
4,6027,3,152,204,0,0,200.0,995.0,1405,44.0,3,3,0,199000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6315,5695,3,58,72,0,0,200.0,995.0,1292,20.0,2,6,0,199000.0
6316,150,2,280,327,1,0,488.0,270.0,6068,27.0,3,0,0,131760.0
6317,5595,2,464,498,1,0,486.0,273.0,2093,10.0,0,1,0,132678.0
6318,2427,3,623,716,1,0,298.0,398.0,2994,70.0,1,6,0,118604.0


In [24]:
# Hyper-parameter search for a k-nearest-neighbours regressor.
# KNeighborsRegressor and GridSearchCV are imported in the notebook's import
# cell together with the other dependencies.

# Restrict the model to three numeric features. X is reassigned, as before, so
# any later cell sees the reduced frame.
# NOTE(review): "surface" is several orders of magnitude larger than "days",
# and KNN is distance based - consider scaling the features before fitting.
X = X[["surface", "days", "width"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Candidate neighbour counts and weighting schemes (grid order left unchanged).
parameters = [{"n_neighbors": [1, 2, 15, 3, 4, 5, 6, 7, 8, 9, 10, 11], "weights": ["uniform", "distance"]}]

# The search is fitted on the training split only, with GridSearchCV's default
# cross-validation and scoring.
clf = GridSearchCV(KNeighborsRegressor(), parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)


{'n_neighbors': 1, 'weights': 'uniform'}


In [21]:
# Fit a KNN regressor on the training split, then report the test RMSE and the
# R-squared score on both splits.
# NOTE(review): n_neighbors=2 / weights='distance' are hard-coded and differ
# from the best_params_ printed by the grid-search cell - confirm this is
# intentional.
knnreg = KNeighborsRegressor(n_neighbors=2, weights='distance').fit(X_train, y_train)
y_predicted = knnreg.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(mean_squared_error(y_test, y_predicted))

print("RMSE:" , RMSE)
print('R-squared score (training): {:.3f}'.format(knnreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(knnreg.score(X_test, y_test)))

RMSE: 2303.6743220784033
R-squared score (training): 0.999
R-squared score (test): 0.993
