In [1]:
import pandas as pd
df = pd.read_csv('./df_train2.csv')
df_t = pd.read_csv('./df_test2.csv')

In [2]:
def feature_engineering(df,df_test,target,n_jobs=4):
    from openfe import OpenFE, transform
    train_x,test_x,train_y,n_jobs = df.drop(target,axis=1),df_test,df[target],n_jobs
    ofe = OpenFE()
    features = ofe.fit(data=train_x, label=train_y, n_jobs=n_jobs)  # generate new features
    train_x, test_x = transform(train_x, test_x, features, n_jobs=n_jobs) # transform the train and test data according to generated features.
    return train_x, test_x

def ag_tuning(df_train,df_test,id_name='0620',label='Survived',loss='log_loss',presets='best_quality',dynamic_stacking=True,folds=10,time=60*3):
    from autogluon.tabular import TabularPredictor
    from autogluon.features.generators import AutoMLPipelineFeatureGenerator
    auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()
    from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config
    
    custom_hyperparameters = get_hyperparameter_config('default')
    
    def get_oof_ts(predictor,_train_x,_test_x,ids,label):
        predictions_oof = predictor.predict_oof()
        prdict_y = predictor.predict(_test_x)
        pd.DataFrame({'oofs': predictions_oof, 'predict_y': prdict_y}).to_csv(f'model_hc_systhis_{ids}.csv',index=False)
        _train_x[ids],_test_x[ids]  = predictions_oof,prdict_y
        _train_x.to_csv(f'model_hc_tr_{ids}.csv',index=False)
        _test_x.to_csv(f'model_hc_ts_{ids}.csv',index=False)
        return predictions_oof,prdict_y
    
    predictor = TabularPredictor(
        label=label,
        # problem_type=problem,#'binary',
        eval_metric=loss #'acc'
    ).fit(
        df_train, #df, #df_train,
        presets='best_quality',
    #     unlabeled_data = df_test2,   #df_test2.drop(columns='Survived'), #df_train[800:].drop(columns='Survived'),
        # auto_stack=True,
        feature_generator=auto_ml_pipeline_feature_generator,
        ag_args_ensemble = {
            'fold_fitting_strategy':'sequential_local',
            "stopping_metric": "log_loss",
        },
        dynamic_stacking=dynamic_stacking,
    #     num_stack_levels=1,
        num_bag_folds=folds,
        # time_limit=time,
        # included_model_types = ['XGB','NN_TORCH','CATBOOST','LIGHTGBM','FAST_AI'],
    #     excluded_model_types = ['NN_TORCH'],
        # use_bag_holdout=True,
        # hyperparameters=custom_hyperparameters,
        hyperparameter_tune_kwargs='auto',#hyperparameter_tune_kwargs,
        # keep_only_best = True,
    )
    oofs,predictions = get_oof_ts(predictor,df_train,df_test,id_name,label)
    return predictions,oofs,predictor
# show_info(predictor,train_x)

def show_info(predictor,df_train):
    print('++++++++++++++++++++++++++++++++++++++++++++++')
    print(predictor.fit_summary())
    print('++++++++++++++++++++++++++++++++++++++++++++++')
    print(predictor.leaderboard(df_train,silent=True))
    print('++++++++++++++++++++++++++++++++++++++++++++++')
    print(predictor.feature_importance(df_train,silent=True))
    print('++++++++++++++++++++++++++++++++++++++++++++++')

from sklearn.metrics import mean_squared_error,roc_auc_score
def hill_climbers(files,train_path,test_path,label='Survived',func=roc_auc_score):
    from hillclimbers import climb_hill, partial
    
    train,test = pd.read_csv(train_path),pd.read_csv(test_path)
    oofs,pred = pd.DataFrame(),pd.DataFrame()
    for i in range(len(files)):
        tmp = pd.read_csv(files[i])
        oofs[f'm{i}'],pred[f'm{i}'] = tmp['oofs'].dropna(),tmp['predict_y'].dropna()

    test_preds, oof_preds = climb_hill(
        train=train, 
        oof_pred_df=oofs, 
        test_pred_df=pred,
        target=label,
        objective="maximize",
        eval_metric=partial(func),
        negative_weights=True, 
        precision=0.001,
        plot_hill=True, 
        plot_hist=True,
        return_oof_preds=True
    )
    return test_preds, oof_preds
# a,b = hill_climbers(files,train_path,test_path=test_path,label='Survived')

def titanic_acc(predictions):
    from sklearn.metrics import accuracy_score
    acc= pd.read_csv('./acc.csv')
    acc_ans = acc.Survived
    print(accuracy_score(acc_ans,predictions)) 

def submission(predictions,test_x,cols,label,id_name):
    test_x[label] = predictions
    test_x[cols].to_csv(f'submission_{id_name}.csv',index=False)
    
    
def pipline(df1,df1_test,label='Survived',time=60*3,id_name='0620_v3',loss='log_loss',cols=['1','2']):
    # train_x, test_x = feature_engineering(df1,df1_test,label)
    # train_x[label] = df1[label]
    # train_x.to_csv('tr.csv',index=False)
    # test_x.to_csv('ts.csv',index=False)
    train_x,test_x = df1,df1_test
    predictions,oofs,predictor = ag_tuning(train_x,test_x,id_name=id_name,time=time,loss=loss,label=label)
    show_info(predictor,train_x)
    # titanic_acc(predictions)
    submission(predictions,test_x,cols,label,id_name)
    return predictions


In [3]:
import numpy as np
def transform(df):
    features = df.columns.tolist()
    sorted_new =  np.sort(df.values, axis=1)
    df['mean_features'] = 0.1 * df[features].mean(axis=1)
    df['std_features'] = df[features].std(axis=1)
    df['max_features'] = df[features].max(axis=1)
    df['min_features'] = df[features].min(axis=1)
    df['median_features'] = 0.1 * df[features].median(axis=1)
    df['sum_features'] = 0.1 * df[features].sum(axis=1)
    df1 = pd.concat([df, pd.DataFrame(sorted_new, index=df.index)], axis=1)
    df1 = df1.drop(features, axis=1)
    df1.columns = df1.columns.astype('str')
    return df

df_train = transform(df.drop(['Target'],axis=1))
df_test = transform(df_t.drop([],axis=1))
df_train

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,mean_features,std_features,max_features,min_features,median_features,sum_features
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,11.1,0.6,2.02,26.722913,1538.108351,9238.0,0.00,0.100,962.024857
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,11.1,0.6,2.02,26.775333,1538.010902,9238.0,0.00,0.100,963.912000
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,16.2,0.3,-0.92,26.759111,1540.869739,9254.0,-0.92,0.100,963.328000
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,11.1,0.6,2.02,27.503420,1581.694142,9500.0,0.00,0.201,990.123125
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7.6,2.6,0.32,27.595796,1581.530601,9500.0,0.00,0.230,993.448667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,...,0,13.9,-0.3,0.79,26.783769,1540.722942,9254.0,-0.30,0.100,964.215667
76514,1,1,6,9254,1,1,125.0,1,1,38,...,0,9.4,-0.8,-3.12,26.888764,1540.591116,9254.0,-3.12,0.100,967.995500
76515,5,17,1,9085,1,1,138.0,1,37,37,...,1,9.4,-0.8,-3.12,26.551889,1512.214797,9085.0,-3.12,0.450,955.868000
76516,1,1,3,9070,1,1,136.0,1,38,37,...,0,7.6,2.6,0.32,26.306444,1510.071542,9070.0,0.00,0.100,947.032000


In [4]:
df_cp = df_train.combine_first(df)
df_cp

Unnamed: 0,Admission grade,Age at enrollment,Application mode,Application order,Course,Curricular units 1st sem (approved),Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (grade),...,Scholarship holder,Target,Tuition fees up to date,Unemployment rate,max_features,mean_features,median_features,min_features,std_features,sum_features
0,122.6,18,1,1,9238,6,0,6,6,14.500000,...,1,Graduate,1,11.1,9238.0,26.722913,0.100,0.00,1538.108351,962.024857
1,119.8,18,17,1,9238,4,0,6,8,11.600000,...,0,Dropout,1,11.1,9238.0,26.775333,0.100,0.00,1538.010902,963.912000
2,144.7,18,17,2,9254,0,0,6,0,0.000000,...,0,Dropout,1,16.2,9254.0,26.759111,0.100,-0.92,1540.869739,963.328000
3,126.1,18,1,3,9500,7,0,7,9,12.591250,...,1,Enrolled,1,11.1,9500.0,27.503420,0.201,0.00,1581.694142,990.123125
4,120.1,18,1,2,9500,6,0,7,12,12.933333,...,0,Graduate,1,7.6,9500.0,27.595796,0.230,0.00,1581.530601,993.448667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,116.5,18,17,1,9254,6,0,6,9,10.666667,...,1,Graduate,1,13.9,9254.0,26.783769,0.100,-0.30,1540.722942,964.215667
76514,131.6,19,1,6,9254,4,0,6,22,13.000000,...,0,Graduate,1,9.4,9254.0,26.888764,0.100,-3.12,1540.591116,967.995500
76515,123.3,19,17,1,9085,4,0,5,13,12.500000,...,0,Enrolled,1,9.4,9085.0,26.551889,0.450,-3.12,1512.214797,955.868000
76516,124.8,18,1,3,9070,0,0,6,0,0.000000,...,0,Dropout,1,7.6,9070.0,26.306444,0.100,0.00,1510.071542,947.032000


In [5]:
df_test_cp  = df_test.combine_first(df_t)
df_test_cp

Unnamed: 0,Admission grade,Age at enrollment,Application mode,Application order,Course,Curricular units 1st sem (approved),Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (grade),...,Previous qualification (grade),Scholarship holder,Tuition fees up to date,Unemployment rate,max_features,mean_features,median_features,min_features,std_features,sum_features
0,152.1,18,1,1,9500,0,0,7,0,0.000000,...,141.0,0,0,13.9,9500.0,27.376361,0.0395,-0.30,1582.002355,985.549000
1,116.5,19,1,1,9238,6,0,6,7,14.857143,...,128.0,0,1,11.1,9238.0,26.712714,0.1000,0.00,1538.118351,961.657714
2,114.2,18,1,1,9238,6,0,6,11,12.000000,...,118.0,1,1,15.5,9238.0,26.698444,0.1000,-4.06,1538.117590,961.144000
3,130.0,23,44,1,9147,5,2,6,15,11.500000,...,130.0,1,1,8.9,9147.0,26.775861,0.3000,0.00,1522.457632,963.931000
4,106.0,26,39,1,9670,3,0,6,9,11.000000,...,110.0,0,1,7.6,9670.0,27.992185,0.2300,0.00,1609.929880,1007.718667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51007,124.7,19,1,2,171,0,0,0,0,0.000000,...,128.0,0,1,15.5,171.0,1.549833,0.0500,-4.06,40.045161,55.794000
51008,140.0,33,39,1,9119,0,0,5,6,0.000000,...,133.1,0,0,9.4,9119.0,26.640500,0.1000,-3.12,1517.926969,959.058000
51009,120.4,20,1,1,171,0,0,0,0,0.000000,...,127.0,0,0,15.5,171.0,1.321222,0.0000,-4.06,39.414092,47.564000
51010,126.3,18,1,3,9773,5,0,6,8,12.600000,...,132.0,0,0,7.6,9773.0,28.278944,0.2800,0.00,1627.160057,1018.042000


In [6]:
pipline(df_cp.sample(frac=0.7),df_test_cp,label='Target',loss='log_loss',time=60*60*6,id_name='academic_0624_v1',cols=['id','Target'])

2024-06-24 02:39:54,882	INFO timeout.py:54 -- Reached timeout of 14.73409137556347 seconds. Stopping all trials.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-06-24 02:39:54,900	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to 'E:/proj/my_nbooks/mycode/academic/AutogluonModels/ag-20240623_172750/models/NeuralNetTorch_r89_BAG_L1' in 0.0150s.
- fa6448c3: FileNotFoundError('Could not fetch metrics for fa6448c3: both result.json and progress.csv were not found at E:/proj/my_nbooks/mycode/academic/AutogluonModels/ag-20240623_172750/models/NeuralNetTorch_r89_BAG_L1/fa6448c3')
- eb08b5ba: FileNotFoundError('Could not fetch metrics for eb08b5ba: both result.json and progress.csv were not found at E:/

++++++++++++++++++++++++++++++++++++++++++++++
*** Summary of fit() ***
Estimated performance of each model:
                          model  score_val eval_metric  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           WeightedEnsemble_L3  -0.446944    log_loss       4.045482  80.381608                0.012000           7.781796            3       True         49
1           WeightedEnsemble_L2  -0.446944    log_loss       4.047482  80.499042                0.013999           7.899230            2       True         48
2            XGBoost_r95_BAG_L1  -0.449831    log_loss       0.406260  14.153855                0.406260          14.153855            1       True         46
3          CatBoost_r163_BAG_L1  -0.450737    log_loss       0.045991  13.938991                0.045991          13.938991            1       True         43
4          ExtraTrees_r4_BAG_L1  -0.458633    log_loss       1.390878  10.911598                



*** End of fit() summary ***
{'model_types': {'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN', 'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF', 'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF', 'CatBoost_BAG_L1\\T1': 'StackerEnsembleModel_CatBoost', 'XGBoost_BAG_L1\\T2': 'StackerEnsembleModel_XGBoost', 'LightGBMLarge_BAG_L1': 'StackerEnsembleModel_LGB', 'CatBoost_r177_BAG_L1': 'StackerEnsembleModel_CatBoost', 'LightGBM_r131_BAG_L1': 'StackerEnsembleModel_LGB', 'CatBoost_r9_BAG_L1': 'StackerEnsembleModel_CatBoost', 'LightGBM_r96_BAG_L1': 'StackerEnsembleModel_LGB', 'CatBoost_r137_BAG_L1': 'StackerEnsembleModel_CatBoost', 'CatBoost_r13_BAG_L1': 'StackerEnsembleModel_CatBoost', 'CatBoost_r50_BAG_L1': 'StackerEnsembleModel_CatBoost', 'CatBoost_r69_BAG_L1': 'StackerEnsembleModel_CatBoost', 'LightGBM_r161_BAG_L1': 'StackerEnsembleModel_LGB', 'NeuralNetFastAI_r143_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular', 'CatBoost_r70_BAG_L1': 'StackerEnsembleModel_CatBoost', 'NeuralNetFa

These features in provided data are not utilized by the predictor and will be ignored: ['academic_0624_v1']


                          model  score_test  score_val eval_metric  \
0         KNeighborsDist_BAG_L1   -0.000039  -1.690782    log_loss   
1       RandomForestEntr_BAG_L1   -0.110315  -0.519145    log_loss   
2       RandomForestGini_BAG_L1   -0.111313  -0.520388    log_loss   
3         ExtraTrees_r49_BAG_L1   -0.113509  -0.490311    log_loss   
4       RandomForest_r39_BAG_L1   -0.160980  -0.590270    log_loss   
5      RandomForest_r127_BAG_L1   -0.252110  -0.525006    log_loss   
6        ExtraTrees_r126_BAG_L1   -0.256666  -0.467616    log_loss   
7           WeightedEnsemble_L2   -0.309237  -0.446944    log_loss   
8           WeightedEnsemble_L3   -0.319793  -0.446944    log_loss   
9          ExtraTrees_r4_BAG_L1   -0.400508  -0.458633    log_loss   
10      RandomForest_r34_BAG_L1   -0.410376  -0.460673    log_loss   
11           XGBoost_r95_BAG_L1   -0.428276  -0.449831    log_loss   
12         CatBoost_r163_BAG_L1   -0.441499  -0.450737    log_loss   
13         CatBoost_

KeyError: "['id'] not in index"

In [8]:
# import 
tmp = pd.read_csv('./model_hc_ts_academic_0624_v1.csv')
test = pd.read_csv('df_test.csv')
test['Target'] = tmp['academic_0624_v1']
test[['id','Target']].to_csv('submission_academic_0624_v1.csv',index=False)

In [None]:
def submission(predictions,test_x,cols,label,id_name):
    test_x[label] = predictions
    test_x[cols].to_csv(f'submission_{id_name}.csv',index=False)
# test = pandas.read_csv('df_test.csv')
submission(pd,test,['id','Target'],'Target','academic_0624_v1_flaml')

In [9]:
!kaggle competitions submit -c playground-series-s4e6 -f submission_academic_0624_v1.csv -m "Message"

Successfully submitted to Classification with an Academic Success Dataset



  0%|          | 0.00/809k [00:00<?, ?B/s]
  1%|          | 8.00k/809k [00:00<00:42, 19.4kB/s]
100%|██████████| 809k/809k [00:03<00:00, 252kB/s]  
