### Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.pipeline import Pipeline

### Select parameters

In [2]:
algo_name = '1R'
classification = False
kfold_num = 10

### Read files (Dataset)

In [25]:
X_train = pd.read_csv('../pc_X_train.csv')
y_train = pd.read_csv('../pc_y_train.csv')
y_train = y_train.iloc[:, -1] # With iloc we extract the labels

X_test = pd.read_csv('../pc_X_test.csv')
ids = X_test.iloc[:, 0]

### Normalize Data

In [14]:
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_train)

### Dimensionality reduction

In [15]:
pca = PCA()
X_pca = pca.fit_transform(X_normalized)

### Create model

In [17]:
parameters = {'pca__n_components': range(1,100,2)}
model = tree.DecisionTreeRegressor(max_depth=1)

pipe = Pipeline(steps=[('pca', pca), ('tree', model)])

### Search for best parameters

In [18]:
grid = GridSearchCV(pipe, parameters, cv=kfold_num, scoring='neg_root_mean_squared_error')
grid.fit(X_train, y_train)

best_params = grid.best_params_
best_score = grid.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

model = grid.best_estimator_
predictions = grid.predict(X_test)

Best Parameters: {'max_depth': 4}
Best Score: -0.761639941172748


### Read and normalize testset

In [20]:
X_normalized_test = scaler.fit_transform(X_test)
X_pca_test = pca.fit_transform(X_normalized)

### Predict testset

In [22]:
predictions = model.predict(X_pca_test)

### Train-Test-Split for evaluation

In [None]:
X_train_split, X_test_split, Y_train_split, Y_test_split = train_test_split(X_pca, y_train, test_size=0.33, random_state=42, stratify=y_train)