In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import KFold

In [5]:
def data_loading(train_path="train.csv", test_path="test.csv", expected_test_rows=100):
    """
    Load the training and test CSV files, drop training rows without a label, one-hot encode
    the 'season' column and fill the remaining missing feature values by imputation.

    Train and test features are stacked and imputed together: the stacked matrix is passed
    through BiScaler and then completed with SoftImpute. The BiScaler transform is not
    inverted afterwards, so the returned features live in the normalized space (consistently
    for both splits).

    Parameters
    ----------
    train_path: str or path-like, CSV holding 'season', the label 'price_CHF' and the feature columns
    test_path: str or path-like, CSV holding 'season' and the same feature columns (no label)
    expected_test_rows: int or None, number of rows the test set must have; None disables the check

    Returns
    ----------
    X_train: DataFrame of floats, imputed training features, indexed like y_train
    y_train: Series of floats, training labels ('price_CHF')
    X_test: DataFrame of floats: dim = (expected_test_rows, n_features), imputed test features,
            indexed like the test file

    Raises
    ----------
    AssertionError: if the resulting shapes are inconsistent
    KeyError: if a non-season feature column of the training file is missing from the test file
    """
    # Load training data
    train_df = pd.read_csv(train_path)

    print("Training data:")
    print("Shape:", train_df.shape)

    # Rows without a label cannot be used for fitting
    train_df = train_df.dropna(subset=['price_CHF'])
    print("Shape after dropping price_CHF missing:", train_df.shape)

    print(train_df.head(2))
    print('\n')

    train_df = pd.get_dummies(train_df, columns=['season'])

    # Load test data
    test_df = pd.read_csv(test_path)

    print("Test data:")
    print(test_df.shape)
    test_df = pd.get_dummies(test_df, columns=['season'])
    print(test_df.head(2))

    # Cast to float explicitly: depending on the pandas version the dummy columns are
    # uint8 or bool, and a float/bool mix would otherwise become an object array.
    X_train = train_df.drop(['price_CHF'], axis=1).astype(float)
    y_train = train_df['price_CHF']

    # A season that never occurs in the test file produces no dummy column there;
    # add it as all-zero instead of failing on the column lookup below.
    for col in X_train.columns:
        if col.startswith('season_') and col not in test_df.columns:
            test_df[col] = 0.0
    X_test = test_df[X_train.columns].astype(float)

    # Impute train and test jointly so both are completed with the same model
    n_train = X_train.shape[0]
    X = pd.concat([X_train, X_test], axis=0)
    X_incomplete_normalized = BiScaler().fit_transform(X.to_numpy(dtype=float))
    X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

    # Split back by position and restore the original row labels, so that X_train and
    # y_train stay aligned under label-based operations as well as positional ones.
    X_train = pd.DataFrame(X_filled_softimpute[:n_train], columns=X.columns, index=y_train.index)
    X_test = pd.DataFrame(X_filled_softimpute[n_train:], columns=X.columns, index=test_df.index)

    assert (
        (X_train.shape[1] == X_test.shape[1])
        and (X_train.shape[0] == y_train.shape[0])
        and (expected_test_rows is None or X_test.shape[0] == expected_test_rows)
    ), "Invalid data shape"
    return X_train, y_train, X_test


In [3]:
def modeling_and_prediction(X_train, y_train, X_test, alphas=None, n_splits=5,
                            random_state=0, max_iter=100000000):
    """
    Select the HuberRegressor regularization strength by K-fold cross-validation, refit the
    model on the full training set and predict the test set.

    Each candidate alpha is scored by the mean of the estimator's `score` over the validation
    folds; the first alpha reaching the highest mean is kept. Negative predictions are
    replaced by 0 before returning.

    Parameters
    ----------
    X_train: DataFrame or matrix of floats, dim = (n_train, n_features), training input
    y_train: Series or array of floats, dim = (n_train,), training output
    X_test: DataFrame or matrix of floats, dim = (n_test, n_features), test input
    alphas: iterable of floats or None, candidate alpha values; None uses np.logspace(-3, 3, 1000)
    n_splits: int, number of cross-validation folds
    random_state: int or None, seed for the shuffled fold assignment
    max_iter: int, iteration cap handed to every HuberRegressor

    Returns
    ----------
    y_test: array of floats: dim = (n_test,), predictions on test set, floored at 0

    Raises
    ----------
    ValueError: if `alphas` is given but empty
    AssertionError: if the prediction vector does not have one entry per test row
    """

    def take_rows(data, idx):
        # Positional row selection for pandas objects and plain array-likes alike
        if hasattr(data, "iloc"):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    if alphas is None:
        alpha_list = np.logspace(-3, 3, 1000)
    else:
        alpha_list = np.asarray(alphas, dtype=float).ravel()
    if alpha_list.size == 0:
        raise ValueError("alphas must contain at least one candidate")

    # The folds do not depend on alpha: compute them once so every candidate is
    # evaluated on exactly the same splits.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = list(cv.split(X_train))

    # find the alpha for huber
    scores = []
    for alpha in alpha_list:
        fold_scores = []
        for train_idx, val_idx in folds:
            model = HuberRegressor(alpha=alpha, max_iter=max_iter)
            model.fit(take_rows(X_train, train_idx), take_rows(y_train, train_idx))
            fold_scores.append(
                model.score(take_rows(X_train, val_idx), take_rows(y_train, val_idx))
            )
        scores.append(np.mean(fold_scores))
    alpha = alpha_list[np.argmax(scores)]
    print("alpha:", alpha)

    # Refit on all training rows with the selected alpha
    model = HuberRegressor(alpha=alpha, max_iter=max_iter)
    model.fit(X_train, y_train)

    # Floor at zero, as the original did (presumably price_CHF cannot be negative - confirm)
    y_pred = np.clip(model.predict(X_test), 0, None)

    assert y_pred.shape == (np.shape(X_test)[0],), "Invalid data shape"
    return y_pred


In [4]:
# Build the imputed feature matrices and the training labels
X_train, y_train, X_test = data_loading()
# Cross-validate the Huber regressor's alpha, refit it and predict the test prices
y_pred = modeling_and_prediction(X_train, y_train, X_test)
# Store the predictions as a single 'price_CHF' column, without the row index
dt = pd.DataFrame({'price_CHF': y_pred})
dt.to_csv('results-fancySoft.csv', index=False)
print("\nResults file successfully generated!")

Training data:
Shape: (900, 11)
Shape after dropping price_CHF missing: (631, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  price_ITA  \
0        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703        NaN   
1  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN  -1.420091   

   price_POL  price_SVK  season_autumn  season_spring  season_summer  \
0   3.298693   1.921886              0              1              0   
1   3.238307        NaN              0              0              1   

   season_winter  
0              0  
1              0 