In [None]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score

In [None]:
# Load the competition table that all later cells work on.
data = pd.read_csv(
    r'/kaggle/input/playground-series-s3e15/data.csv',
)

In [None]:
# Render a ydata-profiling report of the raw table as the cell output.
ProfileReport(data)

In [None]:
# Show the original column names before they are overwritten with shorter ones.
data.columns

In [None]:
# Replace the original headers with short names that work as attributes.
# The assignment is positional: the list must have one entry per column,
# in the same order as the columns of the loaded CSV.
data.columns = [
    'id',
    'author',
    'geometry',
    'pressure',
    'mass_flux',
    'x_e_out',
    'D_e',
    'D_h',
    'length',
    'chf_exp',
]

In [None]:
# Preview the table with the renamed columns.
data

In [None]:
# Rows with a known `x_e_out` are used for training; rows where it is missing
# are the ones whose value has to be predicted.
has_target = data.x_e_out.notna()
train = data[has_target]
test = data[~has_target]

In [None]:
# Summary statistics of the rows whose target `x_e_out` is present.
train.describe()

In [None]:
# Summary statistics of the rows whose target `x_e_out` is missing.
test.describe()

### Sanity check

In [None]:
# Sanity-check setup: can a classifier tell train rows from test rows?
# Features: every column except the row id and the target itself.
# Label: 1.0 where the target is missing (test rows), 0.0 where it is present.
tmp = data.drop(columns=['id', 'x_e_out'])
tmp_target = data.x_e_out.isna().astype(float)

In [None]:
# Hold out part of the rows to evaluate the train-vs-test classifier.
# A fixed `random_state` makes the split (and every number derived from it)
# reproducible across Restart & Run All.
X_tmp_train, X_tmp_test, y_tmp_train, y_tmp_test = train_test_split(
    tmp,
    tmp_target,
    random_state=42,
)

In [None]:
# Count the missing values per feature in the sanity-check training split.
X_tmp_train.isna().sum()

In [None]:
# Per-column medians of the numeric features, used as fill values below.
# `numeric_only=True` is required because the frame still contains the string
# columns `author` and `geometry`; without it pandas >= 2.0 raises a TypeError
# (older versions only emitted a deprecation warning and skipped them).
fill_num = X_tmp_train.median(numeric_only=True)

In [None]:
# Preview only: count the missing values left after filling with `fill_num`.
# The filled frame is not stored here.
X_tmp_train.fillna(fill_num).isna().sum()

In [None]:
# Frequency of each `author` value in the sanity-check training split.
# NOTE(review): presumably the source of the 'Thompson' fill value used later — confirm.
X_tmp_train['author'].value_counts()

In [None]:
# Frequency of each `geometry` value in the sanity-check training split.
# NOTE(review): presumably the source of the 'tube' fill value used later — confirm.
X_tmp_train['geometry'].value_counts()

In [None]:
# Fill both splits with the same values, computed on the training split only.
X_tmp_train, X_tmp_test = (
    frame.fillna(fill_num) for frame in (X_tmp_train, X_tmp_test)
)

In [None]:
# Fill the missing categorical values with one fixed value per column,
# using the same values for both splits.
categorical_fill = {'author': 'Thompson', 'geometry': 'tube'}
for frame in (X_tmp_train, X_tmp_test):
    frame[['author', 'geometry']] = frame[['author', 'geometry']].fillna(categorical_fill)

In [None]:
# Inspect the training features after the numeric and categorical fills.
X_tmp_train

In [None]:
# Mean-target encoders: for every category, the mean of the label over the
# training rows that carry that category. Built from the training split only.
geometry_encoder = (
    X_tmp_train[['geometry']]
    .assign(target=y_tmp_train)
    .groupby('geometry')['target']
    .mean()
)

author_encoder = (
    X_tmp_train[['author']]
    .assign(target=y_tmp_train)
    .groupby('author')['target']
    .mean()
)

In [None]:
# Look at the `author` column before it is replaced by its numeric encoding.
X_tmp_train.author

In [None]:
# Replace each category with its mean label from the training split.
# A category that never occurred in the training split has no entry in the
# encoder, so `map` returns NaN for it; fall back to the overall training mean
# so the held-out split never reaches the classifier with missing values.
tmp_global_mean = y_tmp_train.mean()

X_tmp_train['author'] = X_tmp_train['author'].map(author_encoder).fillna(tmp_global_mean)
X_tmp_train['geometry'] = X_tmp_train['geometry'].map(geometry_encoder).fillna(tmp_global_mean)

X_tmp_test['author'] = X_tmp_test['author'].map(author_encoder).fillna(tmp_global_mean)
X_tmp_test['geometry'] = X_tmp_test['geometry'].map(geometry_encoder).fillna(tmp_global_mean)

In [None]:
# Inspect the training features after the categorical columns were encoded.
X_tmp_train

In [None]:
# Shallow forest that tries to tell test rows (label 1.0) from train rows (0.0).
# `random_state` is fixed so the scores below are reproducible between runs.
clf = RandomForestClassifier(max_depth=3, random_state=42)
clf.fit(X_tmp_train, y_tmp_train)


In [None]:
# Mean accuracy of the classifier: fitted split first, held-out split second.
for features, labels in ((X_tmp_train, y_tmp_train), (X_tmp_test, y_tmp_test)):
    print(clf.score(features, labels))

In [None]:
# Share of rows labelled 1.0 (target missing) in the sanity-check training split.
y_tmp_train.mean()

In [None]:
# Share of rows labelled 0.0 (target present).
# NOTE(review): presumably the majority-class baseline for the accuracies printed above — confirm.
1-y_tmp_train.mean()

In [None]:
# Column 1 of `predict_proba` is the probability of the second class in
# `clf.classes_`; show the mean and spread of those probabilities.
train_proba = clf.predict_proba(X_tmp_train)[:, 1]
(train_proba.mean(), train_proba.std())

In [None]:
# ROC AUC of the train-vs-test classifier on the split it was fitted on.
roc_auc_score(y_tmp_train, train_proba)

In [None]:
# ROC AUC of the train-vs-test classifier on the held-out split.
roc_auc_score(y_tmp_test, clf.predict_proba(X_tmp_test)[:,1])

### Conclusion
Train and test do not differ from each other:
they come from the same underlying distribution.
We can therefore assume that the target values are missing at random.

## Train the main model

In [None]:
# Split the labelled rows into a fitting part and a validation part.
# A fixed `random_state` keeps the split, and therefore the reported errors,
# reproducible across Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(['id', 'x_e_out'], axis=1),
    train.x_e_out,
    random_state=42,
)

In [None]:
# Fix NA

# Numeric columns: fill with the medians of the fitting split only, so no
# information from the validation rows leaks into the fill values.
# `numeric_only=True` is required because `author` and `geometry` are still
# strings here; without it pandas >= 2.0 raises a TypeError.
fill_num = X_train.median(numeric_only=True)
X_train = X_train.fillna(fill_num)
X_val = X_val.fillna(fill_num)


# Categorical columns: fill with one fixed value per column.
X_train[['author', 'geometry']] = X_train[['author', 'geometry']].fillna({'author': 'Thompson', 'geometry': 'tube'})
X_val[['author', 'geometry']] = X_val[['author', 'geometry']].fillna({'author': 'Thompson', 'geometry': 'tube'})

In [None]:
# Mean-target encoders: for every category, the mean of `x_e_out` over the
# fitting rows that carry that category. Built from the fitting split only.
geometry_encoder = pd.DataFrame({
    'geometry': X_train.geometry,
    'target': y_train
}).groupby('geometry').target.mean()

author_encoder = pd.DataFrame({
    'author': X_train.author,
    'target': y_train
}).groupby('author').target.mean()

# A category that never occurred in the fitting split has no entry in the
# encoder, so `map` returns NaN for it; fall back to the overall target mean
# so the validation rows never reach the model with missing values.
target_global_mean = y_train.mean()

X_train['author'] = X_train['author'].map(author_encoder).fillna(target_global_mean)
X_train['geometry'] = X_train['geometry'].map(geometry_encoder).fillna(target_global_mean)

X_val['author'] = X_val['author'].map(author_encoder).fillna(target_global_mean)
X_val['geometry'] = X_val['geometry'].map(geometry_encoder).fillna(target_global_mean)

In [None]:
# Count the missing values left in the fitting features after filling and encoding.
X_train.isna().sum()

In [None]:
# Count the missing values left in the validation features after filling and encoding.
X_val.isna().sum()

In [None]:
# Regress `x_e_out` on the prepared features with a depth-limited forest.
# `random_state` is fixed so the errors and the submission are reproducible.
model = RandomForestRegressor(max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Root-mean-squared error on the rows the model was fitted on.
train_residuals = model.predict(X_train) - y_train
np.mean(train_residuals ** 2) ** 0.5

In [None]:
# Root-mean-squared error on the validation rows.
val_residuals = model.predict(X_val) - y_val
np.mean(val_residuals ** 2) ** 0.5

In [None]:
# Prepare the rows with a missing target exactly like the fitting split:
# same categorical fill values, same numeric medians, same encoders.
X_test = test.drop(['id', 'x_e_out'],axis=1)
X_test[['author', 'geometry']] = X_test[['author', 'geometry']].fillna({'author': 'Thompson', 'geometry': 'tube'})
X_test = X_test.fillna(fill_num)

# A category that is absent from the encoders maps to NaN; fall back to the
# overall mean of the fitting target so `model.predict` never sees NaN.
test_fallback = y_train.mean()
X_test['author'] = X_test['author'].map(author_encoder).fillna(test_fallback)
X_test['geometry'] = X_test['geometry'].map(geometry_encoder).fillna(test_fallback)

In [None]:
# Preview the predictions for the rows with a missing target.
model.predict(X_test)

In [None]:
# Load the submission template that the predictions are written into.
submit = pd.read_csv(
    '/kaggle/input/playground-series-s3e15/sample_submission.csv',
)

In [None]:
# Write the predictions into the template's target column.
# NOTE(review): the values are assigned by position, which assumes the template
# rows are in the same order as the rows of `X_test` — confirm.
test_predictions = model.predict(X_test)
submit['x_e_out [-]'] = test_predictions

In [None]:
# Save the submission without the pandas row index.
submit.to_csv(
    'random_forest.csv',
    index=False,
)