# DataChallenge

## Libraries

In [140]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Parameters, Hyperparameters, Seed

In [141]:
seed = 42 # Fixed random_state passed to train_test_split so the splits are reproducible

## Metric

In [142]:
def metric(y_true, y_pred):
    """Weighted root-mean-square error between targets and predictions.

    Each squared error is multiplied by 1.2 where the prediction is at least
    0.5 and by 1 elsewhere. The weighted errors are averaged over each row
    (axis=1), then over all rows, and the square root of that is returned.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like objects supporting elementwise arithmetic
        (the row-wise mean is taken along axis 1).

    Returns
    -------
    The scalar weighted RMSE.
    """
    # Predictions at or above 0.5 are penalised 20 % more
    penalty = 1 + 0.2 * (y_pred >= 0.5)
    squared_error = (y_true - y_pred) ** 2
    # Mean weighted error of each sample (row)
    per_row = np.mean(penalty * squared_error, axis=1)
    return np.sqrt(np.mean(per_row))

## Data Collection

In [143]:
# Load the training features (X) and the training targets (y) from the CSV files
X = pd.read_csv("x_train_T9QMMVq.csv")
print(f"Dimension of the dataset: {X.shape}")
y = pd.read_csv("y_train_R0MqWmu.csv")
print(f"Dimension of the response vector: {y.shape}")
# The train/val/test split and the model fit pair rows of X and y by position,
# so fail early if the two files do not list the same IDs in the same order.
assert np.array_equal(X["ID"].to_numpy(), y["ID"].to_numpy()), (
    "x_train and y_train rows are not aligned on ID"
)

Dimension of the dataset: (202933, 14)
Dimension of the response vector: (202933, 24)


### Train Val Test split

In [144]:
# Hold out 40 % of the rows, then halve that hold-out into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=seed
)
# One line per split: feature shape followed by target shape
for part_X, part_y in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(part_X.shape, part_y.shape)

(121759, 14) (121759, 24)
(40587, 14) (40587, 24)
(40587, 14) (40587, 24)


## Data Processing

### X_train

In [145]:
# Shape of the training features, followed by their first rows
print(X_train.shape, X_train.head(), sep="\n")

(121759, 14)
            ID  Humidity        M12        M13       M14       M15         M4  \
128170  128170  0.000372 -12.128918 -10.038685 -6.692085 -2.725931   2.924100   
10157    10157  0.156735  -0.169991  -0.140449 -0.083268 -0.019521   0.093979   
158118  158118  0.000401   0.403388   0.285006  0.180356  0.148441  -0.041564   
162649  162649  0.867754  -0.032406  -0.026868 -0.029439 -0.005143   0.050611   
36537    36537  0.107927  -1.907945  -1.233831 -0.635659 -0.256506  18.939984   

               M5        M6        M7         R        S1        S2        S3  
128170   0.922374  0.197055  0.031729  0.546034  0.188495  0.928271  0.996086  
10157    0.074284  0.058909  0.016066  1.477164  0.935686  0.891420  1.001477  
158118   0.091291  0.099827  0.126175  1.731135  0.838978  0.938475  0.993774  
162649   0.010286  0.005586 -0.010552  1.004967  1.000340  1.001243  1.000757  
36537   15.370123  8.200569  3.632963  0.637185  0.623612  0.984508  1.011588  


In [146]:
# Report how many rows of X_train contain at least one missing value
null = X_train.isnull().values
if not null.any():
    print("There is no missing value.")
else:
    # Boolean mask flagging every row that has a NaN in any column
    incomplete_rows = X_train.isnull().any(axis=1)
    count = int(incomplete_rows.sum())
    print(f"Number of rows with at least one missing value: {count}")
    proportion = count / len(X_train) * 100
    print(f"Proportion of rows with missing values in the dataset: {np.round(proportion,2)}")

There is no missing value.


### y_train

In [147]:
# Shape of the training targets, followed by their first rows
print(y_train.shape, y_train.head(), sep="\n")

(121759, 24)
            ID       c01  c02       c03       c04  c05  c06  c07  c08  c09  \
128170  128170  0.996036  0.0  0.996036  0.996036  0.0  0.0  0.0  0.0  0.0   
10157    10157  0.538560  0.0  0.538560  0.538560  0.0  0.0  0.0  0.0  0.0   
158118  158118  0.742687  0.0  0.742687  0.742687  0.0  0.0  0.0  0.0  0.0   
162649  162649  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0   
36537    36537  0.000000  0.0  1.000000  1.000000  0.0  0.0  0.0  0.0  0.0   

        ...  c14  c15  c16  c17       c18  c19  c20  c21       c22  c23  
128170  ...  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.996036  0.0  
10157   ...  0.0  0.0  0.0  0.0  0.538560  0.0  0.0  0.0  0.538560  0.0  
158118  ...  0.0  0.0  0.0  0.0  0.742687  0.0  0.0  0.0  0.742687  0.0  
162649  ...  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  
36537   ...  0.0  0.0  0.0  0.0  0.000000  1.0  0.0  0.0  1.000000  0.0  

[5 rows x 24 columns]


In [148]:
# Report how many rows of y_train contain at least one missing value
null = y_train.isnull().values
if not null.any():
    print("There is no missing value.")
else:
    # Boolean mask flagging every row that has a NaN in any column
    incomplete_rows = y_train.isnull().any(axis=1)
    count = int(incomplete_rows.sum())
    print(f"Number of rows with at least one missing value: {count}")
    proportion = count / len(y_train) * 100
    print(f"Proportion of rows with missing values in the dataset: {np.round(proportion,2)}")

There is no missing value.


### Data processing function

In [149]:
def data_processing(X, y=None):
    """Index the feature table (and optionally the target table) by sample ID.

    The "ID" column is moved into the index so that it is not used as a
    feature or a target. The inputs are never modified; new frames are
    returned. The function is idempotent: a frame that is already indexed by
    "ID" (for example when a notebook cell is re-run on its own output) is
    returned as an unchanged copy instead of raising ``KeyError``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table with an "ID" column, or already indexed by "ID".
    y : pandas.DataFrame, optional
        Target table with an "ID" column, or already indexed by "ID".

    Returns
    -------
    pandas.DataFrame or tuple of two pandas.DataFrame
        ``X`` alone when ``y`` is None, otherwise ``(X, y)``.

    Raises
    ------
    KeyError
        If a frame has neither an "ID" column nor an index named "ID".
    """
    def _index_by_id(frame):
        # Already processed: keep the existing ID index, but still hand back a copy
        if "ID" not in frame.columns and frame.index.name == "ID":
            return frame.copy()
        # set_index returns a new frame, so the caller's frame is left untouched
        return frame.set_index("ID")

    X = _index_by_id(X)
    if y is None:
        return X
    return X, _index_by_id(y)

In [150]:
# Move the ID column into the index of both training tables, then inspect them
X_train, y_train = data_processing(X_train, y_train)
print(X_train.shape, y_train.shape)
print(X_train.head(), y_train.head(), sep="\n")

(121759, 13) (121759, 23)
        Humidity        M12        M13       M14       M15         M4  \
ID                                                                      
128170  0.000372 -12.128918 -10.038685 -6.692085 -2.725931   2.924100   
10157   0.156735  -0.169991  -0.140449 -0.083268 -0.019521   0.093979   
158118  0.000401   0.403388   0.285006  0.180356  0.148441  -0.041564   
162649  0.867754  -0.032406  -0.026868 -0.029439 -0.005143   0.050611   
36537   0.107927  -1.907945  -1.233831 -0.635659 -0.256506  18.939984   

               M5        M6        M7         R        S1        S2        S3  
ID                                                                             
128170   0.922374  0.197055  0.031729  0.546034  0.188495  0.928271  0.996086  
10157    0.074284  0.058909  0.016066  1.477164  0.935686  0.891420  1.001477  
158118   0.091291  0.099827  0.126175  1.731135  0.838978  0.938475  0.993774  
162649   0.010286  0.005586 -0.010552  1.004967  1.000340  1.0

In [151]:
# Apply the same ID indexing to the validation and test splits
X_val, y_val = data_processing(X_val, y_val)
X_test, y_test = data_processing(X_test, y_test)

## First model

### Training of the model

In [152]:
# Baseline: a linear regression fitted on the training split
first_model = LinearRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output
first_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Prediction and evaluation of the model

In [153]:
# Predict on the validation split and score the predictions with metric()
y_val_pred = first_model.predict(X_val)
metric_value = metric(y_val, y_val_pred)
print(metric_value)

0.1697740471993647


## Generating Submission File

In [154]:
# Load the test-set features used for the submission (comma is read_csv's default separator)
X_TEST = pd.read_csv("x_test_9F13O5s.csv")
print(X_TEST.shape, X_TEST.head(), sep="\n")


(134673, 14)
       ID  Humidity       M12       M13       M14       M15        M4  \
0  202933  0.869916  0.021914 -0.015737  0.009041  0.015662 -0.139344   
1  202934  0.904594 -0.100771 -0.040901  0.006137  0.013496  0.843534   
2  202935  0.688853  0.703585  1.029723  0.785505  0.293766 -0.536002   
3  202936  0.392112  0.648665  0.628007  0.416822  0.065360 -0.309533   
4  202937  0.618233  0.155158  0.285085  0.081453  0.031029 -0.801587   

         M5        M6        M7         R        S1        S2        S3  
0 -0.041396 -0.006051  0.001638  1.008217  1.000922  1.003093  0.998643  
1  0.501194  0.134963  0.024947  1.010621  0.987593  1.003104  1.009033  
2 -0.166262 -0.029509 -0.015855  1.075560  1.015945  1.038617  1.067383  
3 -0.398324 -0.269763 -0.269454  1.160165  1.084636  1.011883  1.008619  
4 -0.709802 -0.279257 -0.089210  1.051850  1.120134  1.003001  1.007571  


In [155]:
# Index the submission test features by ID, as was done for the training features
X_TEST = data_processing(X_TEST)
print(X_TEST.shape)

(134673, 13)


### Training of the model on all the data

In [156]:
# Refit the baseline on the full training set (train + validation + test splits)
X, y = data_processing(X, y)
first_model = LinearRegression().fit(X, y)
# Display the fitted estimator as the cell output
first_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Prediction of the model

In [157]:
# Predict the target columns for the submission test set
y_TEST_pred = first_model.predict(X_TEST)
# m = number of rows, n = number of columns of the prediction array
m, n = y_TEST_pred.shape
print(m,n)

134673 23


### Creating submission file

In [158]:
# Submission layout: the ID column followed by the target columns
columns = ["ID"] + list(y_test.columns)
print(columns)
print(len(columns))
# Take the IDs from the test set itself instead of rebuilding them from a
# hard-coded starting value, so they stay correct even if the test IDs are
# not a contiguous range. X_TEST must already be indexed by ID here.
assert X_TEST.index.name == "ID", "X_TEST must be indexed by ID (run data_processing first)"
C0 = X_TEST.index.to_numpy().reshape(-1, 1)
# Prepend the ID column to the predictions (row order is that of X_TEST)
y_TEST_pred_new = np.hstack((C0, y_TEST_pred))
print(y_TEST_pred_new.shape)

['ID', 'c01', 'c02', 'c03', 'c04', 'c05', 'c06', 'c07', 'c08', 'c09', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16', 'c17', 'c18', 'c19', 'c20', 'c21', 'c22', 'c23']
24
(134673, 24)


In [159]:
# Assemble the submission table; hstack promoted the IDs to float, so cast them back
submission = pd.DataFrame(y_TEST_pred_new, columns=columns)
submission["ID"] = submission["ID"].astype(int)

# Comma is to_csv's default separator; the row index is not written
submission.to_csv("submission.csv", index=False)
print(submission.shape, submission.head(), sep="\n")

(134673, 24)
       ID       c01       c02       c03       c04       c05       c06  \
0  202933  0.072469  0.125340  0.256462  0.256462  0.007071  0.007071   
1  202934  0.101072  0.082315  0.322102  0.322102  0.018947  0.018947   
2  202935  0.186692 -0.022285  0.394463  0.394463  0.092071  0.092071   
3  202936 -0.229659  0.235811  0.210325  0.210325  0.060241  0.060241   
4  202937  0.085346  0.059133  0.263732  0.263732  0.126013  0.126013   

        c07       c08       c09  ...       c14  c15       c16       c17  \
0  0.125340  0.008401 -0.010309  ...  0.078609  0.0  0.013664 -0.024597   
1  0.082315  0.002977  0.001809  ...  0.133093  0.0  0.017549 -0.012449   
2 -0.022285 -0.017240  0.009686  ...  0.193837  0.0  0.007364  0.061230   
3  0.235811  0.011738  0.017665  ... -0.021106  0.0  0.008834  0.071016   
4  0.059133 -0.005896 -0.017399  ...  0.116266  0.0  0.003245 -0.032879   

        c18       c19       c20       c21       c22       c23  
0 -0.021361 -0.010533  0.125340  