In [None]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error, log_loss
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Offline setup for tabpfn: --no-index stops pip from contacting a package index,
# so the package is resolved only from the local --find-links directory.
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# Create the models_diff directory inside the installed tabpfn package.
# NOTE(review): the site-packages path hard-codes python3.10 — confirm it matches the kernel.
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# Copy the bundled checkpoint file into that directory.
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

## Order of Operations
1. Split the data into training and hold-out sets
2. Over-/undersample the training split
3. Normalize and clean up the data
4. Fit the models
5. Check the metrics

In [None]:
# Read the three competition CSVs in one pass: training data, test data and
# the sample submission file.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/icr-identify-age-related-conditions/train.csv",
        "/kaggle/input/icr-identify-age-related-conditions/test.csv",
        "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv",
    )
)

In [None]:
# Column names treated as categorical (presumably; 'EJ' is the only column
# that is one-hot encoded later in this notebook).
cats = [
    'EJ',
]

In [None]:
# Impute missing values with each column's most frequent value.
# `mode()` may return several rows when values tie; only the first row is used.
# The modes are taken from the training frame and reused for the test frame.
# NOTE(review): they are computed on the full training frame, i.e. before the
# train/hold-out split performed later in the notebook.
modes = train_df.mode().iloc[0]
train_df = train_df.fillna(modes)
test_df = test_df.fillna(modes)

In [None]:
# Quick look at the first five rows of the imputed training frame.
train_df.head(n=5)

In [None]:
# Separate the feature matrix from the target column. `drop` already returns
# a new frame, so no explicit copy of train_df is needed.
X = train_df.drop(columns=['Id', 'Class'])
y = train_df['Class']
# errors='ignore' keeps this cell re-runnable: once 'Id' has been removed from
# test_df, running the cell again is a no-op instead of raising KeyError.
test_df = test_df.drop(columns=['Id'], errors='ignore')

In [None]:
# Sanity check: (rows, columns) of the test features and the training features.
tuple(frame.shape for frame in (test_df, X))

In [None]:
# Stratified hold-out split: 25% of the rows are set aside, stratified on `y`,
# with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Row counts of the four pieces produced by the split, in the order
# X_train, X_test, y_train, y_test.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

In [None]:
# Random undersampler with a fixed seed so the resampling is reproducible.
undersampler = RandomUnderSampler(
    random_state=42,
)

In [None]:
# Resample the training split only; the hold-out split is left untouched.
resampled = undersampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
# Columns that get standardised. Several names ('BD ', 'CD ', 'CW ', 'FD ')
# are written with a trailing space; they must match the DataFrame's column
# labels exactly, so do not "tidy" them.
numeric_cols = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
    'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
    'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
    'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL',
]

# Standardise the numeric columns and one-hot encode 'EJ'; categories not seen
# during fitting are ignored at transform time instead of raising.
ct = make_column_transformer(
    (StandardScaler(), numeric_cols),
    (OneHotEncoder(handle_unknown='ignore'), ['EJ']),
)

In [None]:
# Fit the scaler/encoder on the resampled training data only, then apply the
# same fitted transformer to the hold-out split.
X_train_norm = ct.fit_transform(X_train_resampled)
X_test_norm = ct.transform(X_test)

In [None]:
# Show only the first few transformed rows instead of dumping the whole matrix.
X_train_norm[:5]

In [None]:
import xgboost

In [None]:
# The transformed matrix was built from the resampled training split, so its
# row count has to be compared with the resampled labels, not with `y_train`.
# `.shape[0]` is used because `len()` is not defined for sparse matrices,
# which a column transformer may return.
X_train_norm.shape[0], len(y_train_resampled)

In [None]:
# Gradient-boosted trees fitted on the resampled, transformed training data.
xgb_params = dict(
    learning_rate=.2,
    max_depth=5,
    n_estimators=125,
    random_state=42,
)
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train_norm, y_train_resampled)

In [None]:
# Hold-out evaluation of the XGBoost model.
# Mean absolute error is computed on the hard class predictions.
xgb_test_labels = xgb_clf.predict(X_test_norm)
# log_loss expects predicted probabilities. Feeding it hard labels (as the
# previous version did) clips them to ~0/1 and charges every mistake the
# maximum penalty, which does not reflect the model's actual confidence.
xgb_test_proba = xgb_clf.predict_proba(X_test_norm)
print(mean_absolute_error(y_test, xgb_test_labels))
print(log_loss(y_test, xgb_test_proba))

In [None]:
# Random forest baseline with 120 trees, fitted on the same resampled,
# transformed training data as the other models.
rf = RandomForestClassifier(
    n_estimators=120,
    max_depth=7,
    min_samples_leaf=5,
    random_state=42,
)
rf.fit(X_train_norm, y_train_resampled)

In [None]:
# Hold-out evaluation of the random forest.
# Mean absolute error is computed on the hard class predictions.
rf_test_labels = rf.predict(X_test_norm)
# log_loss expects predicted probabilities; hard labels would be clipped to
# ~0/1 and every mistake would receive the maximum penalty.
rf_test_proba = rf.predict_proba(X_test_norm)
print(mean_absolute_error(y_test, rf_test_labels))
print(log_loss(y_test, rf_test_proba))

In [None]:
from tabpfn.scripts.transformer_prediction_interface import TabPFNClassifier


In [None]:
# Stack XGBoost and TabPFN. The base models' `predict_proba` outputs are what
# the stacker's final estimator is trained on; no `final_estimator` is passed,
# so StackingClassifier's default is used.
base_estimators = [
    ('xgb', xgb.XGBClassifier(learning_rate=.2, max_depth=5, n_estimators=125, random_state=42)),
    ('tabfn', TabPFNClassifier(N_ensemble_configurations=64)),
]
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    stack_method="predict_proba",
)
stacking_clf.fit(X_train_norm, y_train_resampled)

In [None]:
# Hold-out evaluation of the stacked model.
# Mean absolute error is computed on the hard class predictions.
stacking_test_labels = stacking_clf.predict(X_test_norm)
# log_loss expects predicted probabilities; hard labels would be clipped to
# ~0/1 and every mistake would receive the maximum penalty.
stacking_test_proba = stacking_clf.predict_proba(X_test_norm)
print(mean_absolute_error(y_test, stacking_test_labels))
print(log_loss(y_test, stacking_test_proba))

In [None]:
# The stacked model was fitted on the output of the column transformer `ct`,
# so the raw test features must go through the same fitted transformer first.
# Predicting on the untransformed frame would feed unscaled values (and the
# un-encoded 'EJ' column) to the model.
test_norm = ct.transform(test_df)
pred_5 = pd.DataFrame(stacking_clf.predict_proba(test_norm))
pred_5

In [None]:
def ensemble():
    """Average the class probabilities of XGBoost and TabPFN on the test set.

    Both models are fitted on the notebook-level ``X_train_norm`` and
    ``y_train_resampled``. The notebook-level ``test_df`` is first passed
    through the fitted column transformer ``ct`` so the models receive test
    features in the same scaled/encoded form they were trained on; predicting
    on the raw frame would silently use mismatched inputs.

    Returns:
        The element-wise mean of the two models' ``predict_proba`` outputs
        (one row per test row, one column per class).
    """
    test_norm = ct.transform(test_df)

    # Seed fixed to match the other XGBoost models built in this notebook.
    clf = xgb.XGBClassifier(learning_rate=.2, max_depth=5, n_estimators=125, random_state=42)
    clf.fit(X_train_norm, y_train_resampled)

    clf2 = TabPFNClassifier(N_ensemble_configurations=64)
    clf2.fit(X_train_norm, y_train_resampled)

    probs1 = clf.predict_proba(test_norm)
    probs2 = clf2.predict_proba(test_norm)

    return (probs1 + probs2) / 2
    

In [None]:
# Averaged probabilities from `ensemble()` as a DataFrame whose columns are
# the integer positions of the probability columns.
ensemble_probs = ensemble()
pred_6 = pd.DataFrame(ensemble_probs)
pred_6

In [None]:
# Build the submission file: Ids come from a fresh read of test.csv, the two
# probability columns from the averaged ensemble output `pred_6`.
# Note that this rebinds `test_df` to the raw test file (with its 'Id' column).
test_df = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")
sample_submission = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv")

# A freshly read CSV already has a 0..n-1 index, so the 'Id' column can be
# assigned directly.
sample_submission['Id'] = test_df['Id']
for class_label in (0, 1):
    sample_submission[f"class_{class_label}"] = pred_6[class_label]

sample_submission.set_index('Id').to_csv('submission.csv')
sample_submission