# DataChallenge

## Librairies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

## Parameters, Hyperparameters, Seed

In [2]:
seed = 42
train = 1

## Data Collection

In [3]:
X = pd.read_csv("x_train_T9QMMVq.csv")
print(f"Dimension of the dataset: {X.shape}")
y = pd.read_csv("y_train_R0MqWmu.csv")
print(f"Dimension of the response vector: {y.shape}")
X_TEST = pd.read_csv("x_test_9F13O5s.csv", sep=",")
print(f"Dimension of the test dataset: {X_TEST.shape}")

Dimension of the dataset: (202933, 14)
Dimension of the response vector: (202933, 24)
Dimension of the test dataset: (134673, 14)


### Train Test split

Shuffle

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(142053, 14) (142053, 24)
(60880, 14) (60880, 24)


Times series

In [5]:
split = int(0.8 * len(X))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(162346, 14) (162346, 24)
(40587, 14) (40587, 24)


## Data Processing

In [6]:
print(X_train.head())

   ID  Humidity       M12       M13       M14       M15        M4        M5  \
0   0  0.098160 -0.175981 -0.086469 -0.041465 -0.021153  0.197597  0.054646   
1   1  0.000307 -0.066416  0.036071  0.032636 -0.000573  2.568494  1.883142   
2   2  0.000388  0.190943  0.187540  0.143680  0.092635 -0.147460 -0.021174   
3   3  0.761003 -0.151393 -0.083723 -0.048982 -0.018259  0.045380  0.102427   
4   4  0.107808  0.074818  0.042692  0.026169  0.019134 -0.056284 -0.011193   

         M6        M7         R        S1        S2        S3  
0 -0.009277  0.001855  1.007242  1.013007  1.000563  0.999397  
1  0.779251  0.262231  0.971428  0.996735  1.002226  1.013063  
2  0.040079  0.065790  1.302238  0.905275  0.953600  0.986347  
3  0.012915  0.004453  1.013741  1.004315  1.012301  1.009465  
4  0.010233  0.012205  0.998659  1.005154  1.000096  0.999553  


In [7]:
print(y_train.head())

   ID       c01  c02       c03       c04  c05  c06  c07  c08  c09  ...  c14  \
0   0  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  ...  0.0   
1   1  0.000000  0.0  0.176471  0.176471  0.0  0.0  0.0  0.0  0.0  ...  0.0   
2   2  0.128465  0.0  0.128465  0.128465  0.0  0.0  0.0  0.0  0.0  ...  0.0   
3   3  0.000000  0.0  0.263736  0.263736  0.0  0.0  0.0  0.0  0.0  ...  0.0   
4   4  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  ...  0.0   

   c15  c16  c17       c18       c19  c20  c21       c22       c23  
0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.000000  0.000000  
1  0.0  0.0  0.0  0.000000  0.176471  0.0  0.0  0.176471  0.000000  
2  0.0  0.0  0.0  0.128465  0.000000  0.0  0.0  0.128465  0.000000  
3  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.263736  0.263736  
4  0.0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.000000  0.000000  

[5 rows x 24 columns]


In [8]:
np.mean(X_train["Humidity"]), np.mean(X_test["Humidity"]), np.mean(X_TEST["Humidity"])

(np.float64(0.22516337926518237),
 np.float64(0.22645990244886746),
 np.float64(0.5121110517453115))

L'humidité est beaucoup plus élevée dans X_TEST que dans X. Cela fausse notre prédiction.

### Data processing function

In [9]:
# We remove the columns that can be determined without model
remove_target = ["c04","c06","c07","c15","c20","c21","c22"]

In [10]:
def data_processing(X, y=None, train=None):
    X = X.copy()
    X.set_index("ID", inplace=True)

    if y is not None:
        y = y.copy()
        y.set_index("ID", inplace=True)

        if train is not None:
            # mauvaise idée, dégrade le score sur le TEST
            y.drop(remove_target, axis = 1, inplace=True)
            
        return X, y
    else:
        return X

In [11]:
X_train, y_train = data_processing(X_train, y_train)
X_test, y_test = data_processing(X_test, y_test)
X_TEST = data_processing(X_TEST, None)

In [12]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(X_TEST.shape)

(162346, 13) (162346, 23)
(40587, 13) (40587, 23)
(134673, 13)


Reconstruct prediction

In [13]:
def reconstruct(y_pred):
    index = [3, 5, 6, 14, 19, 20, 21]
    col = [2, 4, 1, None, 1, 4, 2]
    for i in range(len(remove_target)):
        if remove_target[i] != "c15":
            y_pred = np.insert(y_pred, index[i], y_pred[:,col[i]], axis=1)
        else:
            y_pred = np.insert(y_pred, index[i], np.zeros(len(y_pred[:,0])), axis=1)
    return y_pred

## Importance weighting

In [14]:
type(X_train), type(X_TEST)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

In [15]:
def importance_weighting(X_train):
    # On transforme en format numpy
    X_TEST_np = X_TEST.to_numpy()
    X_train_np = X_train.to_numpy()

    # Concaténation verticale (empilement des lignes)
    X_all = np.vstack([X_train_np, X_TEST_np])
    # Création du label : 0 = train, 1 = test
    y_all = np.concatenate([
        np.zeros(len(X_train_np), dtype=int),
        np.ones(len(X_TEST_np), dtype=int)
    ])

    # On entraîne un classifieur à dire si un point vient du train (0) ou du test (1)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_all, y_all)

    # Probabilité d'appartenir à X_TEST
    train_probs = clf.predict_proba(X_train_np)[:, 1]  # P(test | x)
    
    # Poids plus important pour les données du train qui sont proches des données du test
    weights = train_probs / (1 - train_probs + 1e-6)
    
    # clipping pour éviter les poids extrêmes
    weights = np.clip(weights, 0, 10)

    # Normalisation des poids
    weights /= np.mean(weights)

    print(f"AUC séparation train/test : {clf.score(X_all, y_all):.3f}")

    return weights

In [16]:
weights = importance_weighting(X_train)
np.min(weights), np.mean(weights), np.max(weights)

AUC séparation train/test : 0.666


(np.float64(0.0002300024865947043),
 np.float64(0.9999999999999998),
 np.float64(10.708704347676408))

C'est une mauvaise idée, ça dégrade énormément le score sur le TEST.

## Model

Random Forest (Benchmark)

In [17]:
rf = RandomForestRegressor(n_estimators=5, max_depth=7, min_samples_split=0.01, min_samples_leaf=30, random_state=seed)

# rf.fit(X_train, y_train, sample_weight=weights)

rf.fit(X_train, y_train)

0,1,2
,n_estimators,5
,criterion,'squared_error'
,max_depth,7
,min_samples_split,0.01
,min_samples_leaf,30
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

y_train_pred.shape, y_test_pred.shape

((162346, 23), (40587, 23))

In [19]:
y_train_pred_clipped = np.clip(y_train_pred, 0, 1)
y_test_pred_clipped = np.clip(y_test_pred, 0, 1)

# y_test_pred_clipped = reconstruct(y_test_pred_clipped) # (n, 23)

rmse_train = root_mean_squared_error(y_train, y_train_pred_clipped)
rmse_test = root_mean_squared_error(y_test, y_test_pred_clipped)


print(f"rmse train: {rmse_train}")
print(f"rmse test: {rmse_test}")

# rmse train: 0.10763108488017599
# rmse test: 0.10776090921481252

rmse train: 0.10763108488017599
rmse test: 0.10776090921481252


## Generating Submission File

### Training of the model on all the data

In [20]:
X, y = data_processing(X, y)
# weights = importance_weighting(X)
# rf.fit(X, y, sample_weight=weights)
rf.fit(X, y)

0,1,2
,n_estimators,5
,criterion,'squared_error'
,max_depth,7
,min_samples_split,0.01
,min_samples_leaf,30
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Prediction of the model

In [21]:
y_TEST_pred = rf.predict(X_TEST)
y_TEST_pred_clipped = np.clip(y_TEST_pred, 0, 1)
# y_TEST_pred_clipped = reconstruct(y_TEST_pred_clipped)
m, n = y_TEST_pred_clipped.shape
print(m,n)

134673 23


### Creating submission file

In [22]:
columns = ["ID"] + list(y_test.columns)
print(columns)
print(len(columns))
C0 = np.array([i + 202933 for i in range(0,m)])
C0 = C0.reshape(-1, 1)
y_TEST_pred_new = np.hstack((C0, y_TEST_pred_clipped))
print(y_TEST_pred_new.shape)

['ID', 'c01', 'c02', 'c03', 'c04', 'c05', 'c06', 'c07', 'c08', 'c09', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16', 'c17', 'c18', 'c19', 'c20', 'c21', 'c22', 'c23']
24
(134673, 24)


In [23]:
submission = pd.DataFrame(y_TEST_pred_new, columns=columns)
submission["ID"] = submission["ID"].astype(int)

submission.to_csv('submission.csv', index=False, sep=',')
print(submission.shape)
print(submission.head())

(134673, 24)
       ID       c01       c02       c03       c04       c05       c06  \
0  202933  0.034743  0.007517  0.058500  0.058500  0.000007  0.000007   
1  202934  0.120045  0.003234  0.371503  0.371503  0.000000  0.000000   
2  202935  0.002018  0.020427  0.458733  0.458733  0.430761  0.430761   
3  202936  0.099553  0.015103  0.209191  0.209191  0.055341  0.055341   
4  202937  0.183000  0.032698  0.224145  0.224145  0.008445  0.008445   

        c07       c08       c09  ...       c14  c15       c16       c17  \
0  0.007517  0.000019  0.000144  ...  0.006833  0.0  0.002723  0.004982   
1  0.003234  0.000644  0.007625  ...  0.107810  0.0  0.007847  0.003074   
2  0.020427  0.000000  0.000000  ...  0.000000  0.0  0.000000  0.005527   
3  0.015103  0.000000  0.000000  ...  0.019935  0.0  0.000000  0.005698   
4  0.032698  0.000000  0.000000  ...  0.044163  0.0  0.000000  0.000000   

        c18       c19       c20       c21       c22       c23  
0  0.000153  0.002193  0.007517  

## Conclusion:

1) Supprimer les colonnes de y qui sont égales ou constantes n'est pas une bonne idée.
2) Accorder plus de poids (d'importance) aux lignes du train qui ressemblent à des lignes du TEST n'est également pas une bonne idée. Le problème vient peut être du fait que certaines lignes du TEST ne sont proches d'aucune ligne du train. Et donc les lignes du TEST qui sont proches d'une ou de plusieurs lignes du train sont bien prédites mais les autres sont très mal prédites. Cela s'explique par le fait que notre modèle ne généralise plus assez bien car en augmentant le poids de certaines lignes on a ajouté de la variance.