In [3]:
import numpy as np
import pandas as pd

In [29]:
def data_loading():
    """
    This function loads the training and test data, preprocesses it, removes the NaN values and interpolates the missing 
    data using imputation

    Parameters
    ----------
    Returns
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output with labels
    X_test: matrix of floats: dim = (100, ?), test input with features
    """
    # Load training data
    train_df = pd.read_csv("train.csv")
    
    print("Training data:")
    print("Shape:", train_df.shape)

    train_df = train_df.dropna(subset=['price_CHF'])
    print("Shape after dropping price_CHF missing:", train_df.shape)
    
    print(train_df.head(2))
    print('\n')

    train_df = pd.get_dummies(train_df, columns=['season'])
    
    # Load test data
    test_df = pd.read_csv("test.csv")

    print("Test data:")
    print(test_df.shape)
    test_df = pd.get_dummies(test_df, columns=['season'])
    print(test_df.head(2))

    test_df = test_df[train_df.columns]

    # Dummy initialization of the X_train, X_test and y_train   
    X_train = train_df.drop(['price_CHF'],axis=1)
    y_train = train_df['price_CHF']
    X_test = test_df[X_train.columns]

    # TODO: Perform data preprocessing, imputation and extract X_train, y_train and X_test

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test


In [30]:
X_train, y_train, X_test = data_loading()

Training data:
Shape: (900, 11)
Shape after dropping price_CHF missing: (631, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  price_ITA  \
0        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703        NaN   
1  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN  -1.420091   

   price_POL  price_SVK  season_autumn  season_spring  season_summer  \
0   3.298693   1.921886              0              1              0   
1   3.238307        NaN              0              0              1   

   season_winter  
0              0  
1              0 

In [23]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 631 entries, 0 to 896
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   season     631 non-null    object 
 1   price_AUS  427 non-null    float64
 2   price_CZE  429 non-null    float64
 3   price_GER  408 non-null    float64
 4   price_ESP  412 non-null    float64
 5   price_FRA  424 non-null    float64
 6   price_UK   411 non-null    float64
 7   price_ITA  424 non-null    float64
 8   price_POL  438 non-null    float64
 9   price_SVK  413 non-null    float64
dtypes: float64(9), object(1)
memory usage: 54.2+ KB


In [24]:
X_train.describe()

Unnamed: 0,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
count,427.0,429.0,408.0,412.0,424.0,411.0,424.0,438.0,413.0
mean,-0.681994,-0.29234,-0.488096,-4.66941,-2.969189,-1.613521,-2.776184,-0.460496,-0.740783
std,1.037926,1.204183,1.151764,1.15016,1.097618,1.557788,1.124408,1.96064,2.097315
min,-2.362783,-2.144415,-2.238546,-6.871923,-4.182005,-3.94052,-4.330588,-2.9128,-3.49981
25%,-1.698766,-1.568437,-1.674337,-5.562816,-3.905342,-2.717597,-3.782831,-2.404805,-2.942978
50%,-0.603799,0.097281,-0.306794,-4.398992,-3.374985,-1.903227,-2.988029,-0.807664,-0.688546
75%,0.041722,0.736983,0.627788,-3.595973,-2.064893,-0.886007,-1.894125,1.494323,1.477739
max,1.316798,1.710173,1.659539,-2.912415,0.000619,1.594882,0.942776,3.386946,2.520366


In [25]:
y_train.describe()

count    631.000000
mean       4.538611
std        3.328953
min       -3.736940
25%        2.640888
50%        3.872916
75%        7.736682
max        9.933313
Name: price_CHF, dtype: float64

In [26]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 631 entries, 0 to 896
Series name: price_CHF
Non-Null Count  Dtype  
--------------  -----  
631 non-null    float64
dtypes: float64(1)
memory usage: 9.9 KB


In [None]:
def modeling_and_prediction(X_train, y_train, X_test):
    """
    This function defines the model, fits training data and then does the prediction with the test data 

    Parameters
    ----------
    X_train: matrix of floats, training input with 10 features
    y_train: array of floats, training output
    X_test: matrix of floats: dim = (100, ?), test input with 10 features

    Returns
    ----------
    y_test: array of floats: dim = (100,), predictions on test set
    """

    y_pred=np.zeros(X_test.shape[0])
    #TODO: Define the model and fit it using training data. Then, use test data to make predictions

    assert y_pred.shape == (100,), "Invalid data shape"
    return y_pred


In [None]:
# Data loading
X_train, y_train, X_test = data_loading()
# The function retrieving optimal LR parameters
y_pred=modeling_and_prediction(X_train, y_train, X_test)
# Save results in the required format
dt = pd.DataFrame(y_pred) 
dt.columns = ['price_CHF']
dt.to_csv('results.csv', index=False)
print("\nResults file successfully generated!")