# DataChallenge

## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

## Parameters, Hyperparameters, Seed

In [2]:
# Random seed passed as random_state to train_test_split and RandomForestRegressor.
seed = 42
# When True, data_processing() drops the "Humidity" column.
drop_humidity = False
# When True, data_processing() applies feature_engineering() to the features.
feature_modif = False
# When True, data_processing() keeps only the columns listed in selected_features.
selectkbest = True

## Data Collection

In [3]:
# Training features.
X = pd.read_csv("x_train_T9QMMVq.csv")
print(f"Dimension of the dataset: {X.shape}")

# Training targets (one row per row of X).
y = pd.read_csv("y_train_R0MqWmu.csv")
print(f"Dimension of the response vector: {y.shape}")

# Features of the challenge test set, for which no targets are loaded.
X_TEST = pd.read_csv("x_test_9F13O5s.csv")
print(f"Dimension of the test dataset: {X_TEST.shape}")

Dimension of the dataset: (202933, 14)
Dimension of the response vector: (202933, 24)
Dimension of the test dataset: (134673, 14)


### Train Test split

Shuffle

In [4]:
# Random (shuffled) 70/30 hold-out split.
# NOTE(review): the "Times series" cell that follows reassigns X_train, X_test,
# y_train and y_test, so when both cells run in order this split is discarded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
)
for _X_part, _y_part in ((X_train, y_train), (X_test, y_test)):
    print(_X_part.shape, _y_part.shape)

(142053, 14) (142053, 24)
(60880, 14) (60880, 24)


Time series

In [5]:
# Position-based 70/30 split: the first 70% of the rows are used for training,
# the remaining rows for validation (presumably rows are in time order - TODO confirm).
split = int(len(X) * 0.7)
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(142053, 14) (142053, 24)
(60880, 14) (60880, 24)


## Data Processing

In [6]:
# Last expression of the cell: rendered with the notebook's rich DataFrame display.
X_train.head()

   ID  Humidity       M12       M13       M14       M15        M4        M5  \
0   0  0.098160 -0.175981 -0.086469 -0.041465 -0.021153  0.197597  0.054646   
1   1  0.000307 -0.066416  0.036071  0.032636 -0.000573  2.568494  1.883142   
2   2  0.000388  0.190943  0.187540  0.143680  0.092635 -0.147460 -0.021174   
3   3  0.761003 -0.151393 -0.083723 -0.048982 -0.018259  0.045380  0.102427   
4   4  0.107808  0.074818  0.042692  0.026169  0.019134 -0.056284 -0.011193   

         M6        M7         R        S1        S2        S3  
0 -0.009277  0.001855  1.007242  1.013007  1.000563  0.999397  
1  0.779251  0.262231  0.971428  0.996735  1.002226  1.013063  
2  0.040079  0.065790  1.302238  0.905275  0.953600  0.986347  
3  0.012915  0.004453  1.013741  1.004315  1.012301  1.009465  
4  0.010233  0.012205  0.998659  1.005154  1.000096  0.999553  


In [7]:
# Last expression of the cell: rendered with the notebook's rich DataFrame display.
y_train.head()

   ID       c01  c02       c03       c04  c05  c06  c07  c08  c09  ...  c14  \
0   0  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  ...  0.0   
1   1  0.000000  0.0  0.176471  0.176471  0.0  0.0  0.0  0.0  0.0  ...  0.0   
2   2  0.128465  0.0  0.128465  0.128465  0.0  0.0  0.0  0.0  0.0  ...  0.0   
3   3  0.000000  0.0  0.263736  0.263736  0.0  0.0  0.0  0.0  0.0  ...  0.0   
4   4  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  ...  0.0   

   c15  c16  c17       c18       c19  c20  c21       c22       c23  
0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.000000  0.000000  
1  0.0  0.0  0.0  0.000000  0.176471  0.0  0.0  0.176471  0.000000  
2  0.0  0.0  0.0  0.128465  0.000000  0.0  0.0  0.128465  0.000000  
3  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.263736  0.263736  
4  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.000000  0.000000  

[5 rows x 24 columns]


In [8]:
# Mean humidity per dataset, in the order (train split, validation split, challenge test set).
tuple(np.mean(frame["Humidity"]) for frame in (X_train, X_test, X_TEST))

(np.float64(0.22526816450151718),
 np.float64(0.22578323684204102),
 np.float64(0.5121110517453115))

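The mean humidity in X_TEST (≈ 0.51) is more than twice the mean humidity in the training data X (≈ 0.23): the distribution of this feature differs between the data we train on and the data we must predict. A model that relies on Humidity may therefore generalise poorly to X_TEST. To remove this feature from the model, set drop_humidity = True.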

### Feature engineering

In [9]:
# Last expression of the cell: the column index is displayed as the cell output.
X.columns

Index(['ID', 'Humidity', 'M12', 'M13', 'M14', 'M15', 'M4', 'M5', 'M6', 'M7',
       'R', 'S1', 'S2', 'S3'],
      dtype='object')


In [10]:
def feature_engineering(X):
    """Return a copy of ``X`` with aggregated sensor columns appended.

    Added columns:
        M_big   -- M12 + M13 + M14 + M15
        M_small -- M4 + M5 + M6 + M7
        S       -- S1 + S2 + S3
        Total   -- Humidity + M_big + M_small + R + S

    ``Humidity`` is optional: when the column is absent (for instance because
    it was dropped beforehand), ``Total`` is computed from the remaining terms
    instead of raising a ``KeyError``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the M*, R and S* columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    X = X.copy()
    X["M_big"] = X["M12"] + X["M13"] + X["M14"] + X["M15"]
    X["M_small"] = X["M4"] + X["M5"] + X["M6"] + X["M7"]
    X["S"] = X["S1"] + X["S2"] + X["S3"]

    # Sum the terms left to right, in the same order as the original expression,
    # so results are unchanged whenever Humidity is present.
    total_terms = ["M_big", "M_small", "R", "S"]
    if "Humidity" in X.columns:
        total_terms.insert(0, "Humidity")
    total = X[total_terms[0]]
    for term in total_terms[1:]:
        total = total + X[term]
    X["Total"] = total
    return X

### Feature selection

In [11]:
# ID-free copies of the training split, used only for feature scoring.
X_temp = X_train.drop(columns="ID")
print(X_temp.shape)
y_temp = y_train.drop(columns="ID")
print(y_temp.shape)

(142053, 13)
(142053, 23)


In [12]:
from sklearn.feature_selection import SelectKBest, f_regression

# The target has several columns: aggregate them into one by taking the row-wise mean
# (a specific target column could be used instead).
y_mean = y_temp.mean(axis=1)

# k='all' keeps every feature; the selector is only used to obtain the F-scores.
selector = SelectKBest(score_func=f_regression, k='all').fit(X_temp, y_mean)

features = X_temp.columns
scores = selector.scores_

# Rank features from highest to lowest score.
sorted_features = sorted(
    zip(features, scores),
    key=lambda pair: pair[1],
    reverse=True,
)

for name, score in sorted_features:
    print(f"{name:<15}  score = {score:.3f}")

S1               score = 11488.684
M4               score = 10251.519
M5               score = 6665.094
S3               score = 3643.683
M6               score = 3409.937
M12              score = 2446.283
M7               score = 2143.107
M13              score = 1982.158
M14              score = 1886.512
M15              score = 1847.968
R                score = 860.980
S2               score = 436.640
Humidity         score = 150.682


S1, M4 et M5 semblent avoir une corrélation linéaire forte avec y_mean.

In [13]:
from functools import partial

from sklearn.feature_selection import mutual_info_regression

# mutual_info_regression draws random noise for continuous features; fixing
# random_state makes the scores (and anything thresholded on them) reproducible.
selector_2 = SelectKBest(partial(mutual_info_regression, random_state=seed), k='all')
selector_2.fit(X_temp, y_mean)

0,1,2
,score_func,<function mut...001293BBD2AC0>
,k,'all'


In [14]:
# Materialise the (feature, score) pairs: a bare zip would be exhausted by the
# loop below, leaving scores_2 empty for any later use.
scores_2 = list(zip(features, selector_2.scores_))
for f, s in scores_2:
    print(f"{f:<15}  mutual_info = {s:.3f}")

Humidity         mutual_info = 0.299
M12              mutual_info = 0.554
M13              mutual_info = 0.613
M14              mutual_info = 0.651
M15              mutual_info = 0.650
M4               mutual_info = 0.478
M5               mutual_info = 0.525
M6               mutual_info = 0.442
M7               mutual_info = 0.421
R                mutual_info = 0.680
S1               mutual_info = 1.136
S2               mutual_info = 0.424
S3               mutual_info = 0.803


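Si mutual_info est faible, cela signifie que le capteur associé, pris isolément, apporte peu d’information pour prédire la cible. Attention : ce score est univarié (chaque capteur est comparé seul à y_mean) ; il ne mesure donc pas si l’information d’un capteur est déjà contenue dans les données des autres capteurs, ni son utilité en combinaison avec eux. En première approximation, on peut tout de même envisager d’enlever les capteurs dont le score est le plus faible.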

Notre objectif est uniquement prédictif donc on peut regarder seulement les capteurs qui ont le mutual_info le plus élevé.

In [15]:
# Minimum mutual-information score for a feature to be kept.
mutual_info_threshold = 0.5

# Count (and list) the features above the threshold; k_mutual is the number kept.
# The pairs are iterated directly so that the `scores` array computed earlier
# is not overwritten by a one-shot iterator.
k_mutual = 0
for f, s in zip(features, selector_2.scores_):
    if s > mutual_info_threshold:
        k_mutual += 1
        print(f"{f:<15}  mutual_info = {s:.3f}")

M12              mutual_info = 0.554
M13              mutual_info = 0.613
M14              mutual_info = 0.651
M15              mutual_info = 0.650
M5               mutual_info = 0.525
R                mutual_info = 0.680
S1               mutual_info = 1.136
S3               mutual_info = 0.803


In [16]:
from functools import partial

from sklearn.feature_selection import mutual_info_regression

# Keep the k_mutual best features. The score function is seeded because
# mutual_info_regression is randomised: without a fixed random_state the
# selected top-k can change from one run to the next.
selector_3 = SelectKBest(partial(mutual_info_regression, random_state=seed), k=k_mutual)
selector_3.fit(X_temp, y_mean)

0,1,2
,score_func,<function mut...001293BBD2AC0>
,k,8


In [17]:
# Boolean mask of the columns kept by selector_3, turned into column names.
support_mask = selector_3.get_support()
selected_features = X_temp.columns[support_mask]

# Reduced feature matrix, computed here only to check its shape.
X_selected_temp = selector_3.transform(X_temp)
print(X_selected_temp.shape)
print(selected_features)

(142053, 8)
Index(['M12', 'M13', 'M14', 'M15', 'M5', 'R', 'S1', 'S3'], dtype='object')


### Data processing function

In [18]:
def data_processing(X, y=None):
    """Prepare a feature frame (and optionally its target frame) for modelling.

    Steps, driven by the notebook-level flags:
      1. Use the "ID" column as index.
      2. ``drop_humidity``: remove the "Humidity" column.
      3. ``feature_modif``: add the columns built by ``feature_engineering``.
      4. ``selectkbest``: keep only the columns of ``selected_features`` that
         are still present (a selected column removed by step 2 is skipped
         instead of raising a ``KeyError``), then print the kept columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, including an "ID" column.
    y : pandas.DataFrame, optional
        Targets, including an "ID" column.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``X`` alone when ``y`` is None, otherwise ``(X, y)``. The inputs are
        not modified.
    """
    # set_index returns a new frame, so the caller's frame is left untouched.
    X = X.set_index("ID")
    if drop_humidity:
        X = X.drop(columns="Humidity")
    if feature_modif:
        X = feature_engineering(X)
    if selectkbest:
        # Guard against selected columns that are no longer available
        # (e.g. "Humidity" when drop_humidity is True).
        kept_columns = [col for col in selected_features if col in X.columns]
        X = X[kept_columns].copy()
        print(X.columns)

    if y is None:
        return X
    return X, y.set_index("ID")

In [19]:
# NOTE: the processed frames are bound back to the same names. Running this
# cell a second time fails, because "ID" is then the index and no longer a
# column that data_processing can set as index.
X_train, y_train = data_processing(X=X_train, y=y_train)
X_test, y_test = data_processing(X=X_test, y=y_test)
X_TEST = data_processing(X=X_TEST)

Index(['M12', 'M13', 'M14', 'M15', 'M5', 'R', 'S1', 'S3'], dtype='object')
Index(['M12', 'M13', 'M14', 'M15', 'M5', 'R', 'S1', 'S3'], dtype='object')
Index(['M12', 'M13', 'M14', 'M15', 'M5', 'R', 'S1', 'S3'], dtype='object')


In [20]:
# Shapes after processing: (features, targets) for each labelled split, then the challenge set.
for _X_part, _y_part in ((X_train, y_train), (X_test, y_test)):
    print(_X_part.shape, _y_part.shape)
print(X_TEST.shape)

(142053, 8) (142053, 23)
(60880, 8) (60880, 23)
(134673, 8)


## Model

Random Forest Benchmark

In [21]:
# Hyperparameters of the benchmark forest, gathered in one place.
rf_params = {
    "n_estimators": 5,
    "max_depth": 7,
    "min_samples_split": 0.01,
    "min_samples_leaf": 30,
    "random_state": seed,
}
rf = RandomForestRegressor(**rf_params)

# Training
rf.fit(X_train, y_train)

0,1,2
,n_estimators,5
,criterion,'squared_error'
,max_depth,7
,min_samples_split,0.01
,min_samples_leaf,30
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
# Prédictions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

y_train_pred.shape, y_test_pred.shape

((142053, 23), (60880, 23))

In [23]:
# Restrict predictions to the [0, 1] interval before scoring.
y_train_pred_clipped = np.clip(y_train_pred, a_min=0, a_max=1)
y_test_pred_clipped = np.clip(y_test_pred, a_min=0, a_max=1)

rmse_train = root_mean_squared_error(y_true=y_train, y_pred=y_train_pred_clipped)
rmse_test = root_mean_squared_error(y_true=y_test, y_pred=y_test_pred_clipped)

print(f"rmse train: {rmse_train}")
print(f"rmse test: {rmse_test}")

# Values noted from an earlier run of this cell (they differ from the output
# currently saved with the notebook):
# rmse train: 0.10208407509348237
# rmse test: 0.10237637697308155

rmse train: 0.11115577132952625
rmse test: 0.11181936885149638


## Generating Submission File

### Training of the model on all the data

In [24]:
# Process the full labelled data into NEW names: the raw X / y stay intact, so
# this cell (and the split cells above) can be re-run without a KeyError on "ID".
X_full, y_full = data_processing(X, y)
print(X_full.shape, y_full.shape)

# Refit the same estimator on all labelled rows before predicting the challenge set.
rf.fit(X_full, y_full)

Index(['M12', 'M13', 'M14', 'M15', 'M5', 'R', 'S1', 'S3'], dtype='object')
(202933, 8) (202933, 23)


0,1,2
,n_estimators,5
,criterion,'squared_error'
,max_depth,7
,min_samples_split,0.01
,min_samples_leaf,30
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Prediction of the model

In [25]:
# Predict the challenge set and restrict the predictions to [0, 1].
y_TEST_pred = rf.predict(X_TEST)
y_TEST_pred_clipped = np.clip(y_TEST_pred, a_min=0, a_max=1)

# m rows (samples) by n columns (targets).
m, n = y_TEST_pred_clipped.shape
print(m, n)

134673 23


### Creating submission file

In [26]:
# Header of the submission file: the ID followed by the target column names.
columns = ["ID", *y_test.columns]
print(columns)
print(len(columns))

# Take the IDs from X_TEST itself (data_processing made "ID" its index) instead
# of rebuilding them from a hard-coded 202933 offset, so each prediction row is
# paired with the ID of the row it was computed from.
C0 = X_TEST.index.to_numpy().reshape(-1, 1)
y_TEST_pred_new = np.hstack((C0, y_TEST_pred_clipped))
print(y_TEST_pred_new.shape)

['ID', 'c01', 'c02', 'c03', 'c04', 'c05', 'c06', 'c07', 'c08', 'c09', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16', 'c17', 'c18', 'c19', 'c20', 'c21', 'c22', 'c23']
24
(134673, 24)


In [27]:
# Build the submission table; the ID column comes out of the stacked float
# array as float, so cast it back to integer.
submission = pd.DataFrame(y_TEST_pred_new, columns=columns).astype({"ID": int})

submission.to_csv('submission.csv', sep=',', index=False)
print(submission.shape)
print(submission.head())

(134673, 24)
       ID       c01       c02       c03       c04       c05       c06  \
0  202933  0.010019  0.006204  0.028607  0.028607  0.002604  0.002604   
1  202934  0.127599  0.000685  0.321124  0.321124  0.000000  0.000000   
2  202935  0.002018  0.020427  0.458733  0.458733  0.430761  0.430761   
3  202936  0.080448  0.013942  0.265349  0.265349  0.132274  0.132274   
4  202937  0.041874  0.288550  0.343054  0.343054  0.010064  0.010064   

        c07       c08       c09  ...       c14  c15       c16       c17  \
0  0.006204  0.000026  0.000218  ...  0.003656  0.0  0.002903  0.000361   
1  0.000685  0.000009  0.013813  ...  0.122468  0.0  0.004835  0.002438   
2  0.020427  0.000000  0.000000  ...  0.000000  0.0  0.000000  0.005527   
3  0.013942  0.000000  0.000000  ...  0.010193  0.0  0.000000  0.002229   
4  0.288550  0.000000  0.000403  ...  0.040156  0.0  0.000000  0.000044   

        c18       c19       c20       c21       c22       c23  
0  0.000109  0.003686  0.006204  