In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV, LassoLarsCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
import scipy.stats as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error, root_mean_squared_error
)
from sklearn.preprocessing import StandardScaler, RobustScaler
import xgboost as xgb

In [2]:
# Participant (nick)name -- replace with your own; it is used in the output file name.
deltaker = "xgb"

In [3]:
# 1. Load the training data.
# The URL contains a full commit hash, so the file read should not change between runs.
url = "https://raw.githubusercontent.com/jensmorten/data-detektiv_kodekveld/ee09cf267b371af7e884be0a70daae492993d86d/data/data2.csv"
# Local alternative: df = pd.read_csv("../data/data2.csv")
df = pd.read_csv(url)  # comma is read_csv's default separator

In [4]:
# Check which columns the raw data contains.
df.columns

Index(['alder', 'lonn', 'utdanning', 'sko_str', 'avdeling', 'er_leder',
       'favorittfarge', 'ansiennitet', 'prestasjonsscore'],
      dtype='object')

In [5]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
df_d = pd.get_dummies(data=df)

In [6]:
# Column names after one-hot encoding (dummy columns are named <column>_<category>).
df_d.columns

Index(['alder', 'lonn', 'utdanning', 'sko_str', 'er_leder', 'ansiennitet',
       'prestasjonsscore', 'avdeling_Finans', 'avdeling_HR', 'avdeling_Salg',
       'avdeling_Utvikling', 'favorittfarge_blaa', 'favorittfarge_groen',
       'favorittfarge_roed'],
      dtype='object')

In [7]:
# Choose the explanatory variables (features) fed to the model.
mine_variabler = [
    'alder',
    'utdanning',
    'er_leder',
    'ansiennitet',
    'prestasjonsscore',
    'avdeling_Finans',
    'avdeling_HR',
    'avdeling_Salg',
]

In [8]:
# Keep only the selected feature columns (all rows).
x = df_d.loc[:, mine_variabler]

In [9]:
# Target variable to predict (do not change!).
# Taken from the raw frame `df`, not the one-hot encoded `df_d`.
y = df['lonn']

In [10]:
# Grid of values starting at 0.01 with step 0.01; the stop value 1 is excluded.
# In the visible cells it is only referenced by the commented-out RidgeCV(alphas=float_range) model.
float_range = np.arange(0.01, 1, 0.01)

In [11]:
def cv_score(params, X, y, n_splits=5, early_stopping_rounds=50, random_state=42):
    """Cross-validate an XGBoost regressor with early stopping and report RMSE.

    For every fold of a shuffled K-fold split, an ``xgb.XGBRegressor`` with up
    to 5000 trees is fitted on the training part, using the validation part as
    the early-stopping evaluation set, and the RMSE on that validation part is
    recorded.

    Note: the same validation fold drives early stopping and is used for
    scoring, so the reported RMSE is somewhat optimistic.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``. Must not contain
        ``n_estimators`` or ``early_stopping_rounds`` (both are set here).
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    y : pandas.Series or numpy.ndarray
        Target values aligned row-by-row with ``X``.
    n_splits : int, default 5
        Number of folds.
    early_stopping_rounds : int, default 50
        Rounds without improvement on the validation fold before training stops.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    tuple
        ``(mean_rmse, std_rmse)`` over the folds.
    """
    def _rows(data, idx):
        # KFold yields positional indices: pandas objects need .iloc (plain
        # DataFrame[...] would look the numbers up as column labels), while
        # numpy arrays index directly.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = _rows(X, train_idx), _rows(X, val_idx)
        y_tr, y_val = _rows(y, train_idx), _rows(y, val_idx)
        # The file imports xgboost as `xgb`; a bare `XGBRegressor` is undefined.
        # early_stopping_rounds is a constructor argument in current xgboost
        # (it was removed from fit() in xgboost 2.0).
        model = xgb.XGBRegressor(
            **params,
            n_estimators=5000,
            early_stopping_rounds=early_stopping_rounds,
        )
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        preds = model.predict(X_val)
        # mean_squared_error(squared=False) was removed from scikit-learn;
        # root_mean_squared_error is already imported at the top of the file.
        scores.append(root_mean_squared_error(y_val, preds))  # RMSE
    return np.mean(scores), np.std(scores)

In [12]:
# 2. Lag enkel modell
#model = LinearRegression()
#model= RidgeCV(cv=5, alphas=float_range)
#model = RandomForestRegressor()
#model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, random_state=42)

In [13]:
##tilpass modellen
#model.fit(x, y)

In [15]:
# Search space for the randomised hyper-parameter search.
# Lists are sampled uniformly; learning_rate is drawn from a log-uniform distribution.
param_dist = dict(
    learning_rate=st.loguniform(0.005, 0.3),
    max_depth=[3, 4, 5, 6, 8, 10],
    min_child_weight=[1, 2, 3, 5, 7, 10],
    subsample=[0.5, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.3, 0.5, 0.7, 0.9, 1.0],
    gamma=[0, 0.1, 0.5, 1, 2, 5],
    reg_alpha=[0, 0.01, 0.1, 1, 5],
    reg_lambda=[0.5, 1, 2, 5, 10],
)

# Estimator that every sampled parameter set is applied to (fixed 1000 trees).
base = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    random_state=42,
)

# 4 folds repeated 2 times -> 8 fits per candidate.
rkf = RepeatedKFold(n_splits=4, n_repeats=2, random_state=42)

model = RandomizedSearchCV(
    estimator=base,
    param_distributions=param_dist,
    n_iter=100,
    cv=rkf,
    scoring='neg_mean_squared_error',  # best_score_ is therefore a negated MSE
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

# No early stopping is used in this search; if early stopping inside each fold
# is needed, a manual cross-validation loop (such as cv_score) is the alternative.
model.fit(x, y)
print(model.best_params_, model.best_score_)

Fitting 8 folds for each of 100 candidates, totalling 800 fits
{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.01449439785379907, 'max_depth': 5, 'min_child_weight': 10, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 0.5} -3563349171.70699


In [16]:
# Show the winning parameter set and its mean CV score again.
# The score is a negated MSE because the search uses scoring='neg_mean_squared_error'.
print(model.best_params_, model.best_score_)

{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.01449439785379907, 'max_depth': 5, 'min_child_weight': 10, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 0.5} -3563349171.70699


In [17]:
# In-sample predictions: predict on the same rows the search was fitted on.
train_pred=model.predict(x)

In [18]:
# Raise every prediction below 500000 up to 500000 (in place).
# NOTE(review): presumably 500000 is a known lower bound of the target -- confirm.
train_pred[:] = np.maximum(train_pred, 500000)

In [19]:
# R^2 of the floored predictions against the training target.
# This is measured on the training rows, so it is not an out-of-sample estimate.
train_score=r2_score(y, train_pred)

In [20]:
# Display the in-sample R^2 computed from y and train_pred.
print(train_score)

0.9891248941421509


In [21]:
# Load the test set.
# Unlike the training URL, this one points at the `main` branch rather than a commit hash.
test = pd.read_csv(
    "https://raw.githubusercontent.com/jensmorten/data-detektiv_kodekveld/refs/heads/main/data/test_set.csv"
)  # comma is read_csv's default separator

In [22]:
# One-hot encode the test set with the same call used for the training data.
# Dummy columns come from the categories present in `test` itself.
test_d = pd.get_dummies(data=test)

In [23]:
# Select the same feature columns, in the same order, as used for training.
test_x = test_d.loc[:, mine_variabler]

In [24]:
# Display the test feature matrix.
test_x

Unnamed: 0,alder,utdanning,er_leder,ansiennitet,prestasjonsscore,avdeling_Finans,avdeling_HR,avdeling_Salg
0,39,13,1,16,3.9,False,False,False
1,33,9,0,14,2.3,False,False,False
2,34,10,1,4,3.6,False,False,False
3,40,13,0,23,2.6,False,False,False
4,41,14,0,17,2.0,False,False,False
...,...,...,...,...,...,...,...,...
57,24,13,0,4,3.9,True,False,False
58,33,13,0,4,5.1,True,False,False
59,24,11,0,6,3.8,True,False,False
60,39,14,0,10,2.0,True,False,False


In [25]:
# Predict the target for the test rows with the fitted search object.
y_pred=model.predict(test_x)

In [26]:
# Apply the same 500000 floor that was applied to the training predictions (in place).
y_pred[:] = np.maximum(y_pred, 500000)

In [27]:
# Build the submission frame: a 1-based id per prediction and the prediction
# rounded to zero decimals.
# Note: this rebinds `df`, which held the raw training data earlier in the notebook.
df = pd.DataFrame(
    dict(
        id=range(1, len(y_pred) + 1),
        lonn=y_pred.round(decimals=0),
    )
)

In [28]:
# Save the predictions as a .csv file named after the participant (no index column).
df.to_csv(f"predictions_{deltaker}.csv", index=False)