In [0]:
import numpy as np
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql.functions import col
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler

In [0]:
# Immunization compliance (column "immunization_compliance") might introduce target leakage.

In [0]:
# Pull the whole source table from Spark into a pandas DataFrame.
df = spark.table("avengers.default.ai_generated_dataset_complete_v3").toPandas()

# Split each target class on its own so that train and test both receive
# 80% / 20% of the rows of that class.
Zeros = df[df['profile_next_90d'] == 0]
Ones = df[df['profile_next_90d'] == 1]

ones_train, ones_test = train_test_split(Ones, test_size=0.2, random_state=42)
zeros_train, zeros_test = train_test_split(Zeros, test_size=0.2, random_state=42)

Train = pd.concat([ones_train, zeros_train])
Test = pd.concat([ones_test, zeros_test])

# Columns carried into X_train / X_test. Defined once so the train and test
# frames cannot drift apart.
model_input_columns = [
    "pilot_id", "year_month",
    "gender", "aircraft", "aeromedical_class_current", "dental_readiness", "pha_status",
    "age", "flight_hours_last_12mo", "abnormal_labs_6mo", "encounters_6mo",
    "pha_overdue_flag", "flight_hours_total", "readiness_score",
    "result", "base",
]

# .copy() makes X_train / X_test independent frames rather than column subsets
# tied to Train / Test, so adding columns to them later does not raise
# pandas' SettingWithCopyWarning.
X_train = Train[model_input_columns].copy()
y_train = Train['profile_next_90d']

X_test = Test[model_input_columns].copy()
y_test = Test['profile_next_90d']


# Preprocessing: columns are routed by name to a scaler or a one-hot encoder.
numeric_features = [
    "age",
    "flight_hours_last_12mo",
    "abnormal_labs_6mo",
    "encounters_6mo",
    "pha_overdue_flag",
    "flight_hours_total",
    "readiness_score",
]
categorical_features = [
    "gender",
    "aircraft",
    "aeromedical_class_current",
    "dental_readiness",
    "pha_status",
]

# Step names ('scaler', 'onehot', 'num', 'cat') are looked up by later code,
# so they are part of this cell's contract.
scaling_step = ("scaler", StandardScaler())
encoding_step = ("onehot", OneHotEncoder(handle_unknown="ignore"))

numeric_transformer = Pipeline(steps=[scaling_step])
categorical_transformer = Pipeline(steps=[encoding_step])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Hyper-parameter grid; the "classifier__" prefix targets the pipeline step
# named "classifier" below.
param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__learning_rate": [0.01, 0.1, 0.2],
    "classifier__max_depth": [3, 5, 7],
}

gb_clf = GradientBoostingClassifier(random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", gb_clf),
    ]
)

# 3-fold cross-validated search scored on F1, run in a single process.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring="f1",
    n_jobs=1,
)

# Fit every grid combination on the training split.
grid_search.fit(X_train, y_train)

# Cross-validation summary for the winning combination.
print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Held-out evaluation: `score` uses the search's own scoring ('f1'),
# the report breaks the result down per class.
y_pred = grid_search.predict(X_test)
test_set_score = grid_search.score(X_test, y_test)
print("Test set score: ", test_set_score)
print(classification_report(y_true=y_test, y_pred=y_pred))


In [0]:
# Take the best pipeline found by the grid search and split it into its two steps.
best_pipeline = grid_search.best_estimator_
classifier = best_pipeline.named_steps['classifier']
preprocessor = best_pipeline.named_steps['preprocessor']
importances = classifier.feature_importances_

# numeric_features / categorical_features are the lists that were handed to the
# ColumnTransformer when the pipeline was built. They are reused here (not
# re-typed) so the labels cannot drift from what was actually fitted, and the
# one-hot output names get their own variable so `categorical_features` keeps
# meaning "raw input columns".
encoded_categorical_features = (
    preprocessor.named_transformers_['cat']['onehot']
    .get_feature_names_out(categorical_features)
)

# Names follow the transformer order declared in the ColumnTransformer:
# the 'num' block first, then the one-hot columns of the 'cat' block.
all_feature_names = np.concatenate([numeric_features, encoded_categorical_features])
feat_importances = pd.Series(importances, index=all_feature_names)

# Top 10 importances; sorted ascending so the largest bar is drawn at the top.
ax = feat_importances.nlargest(10).sort_values().plot(kind='barh')
ax.set(title='Top 10 feature importances (gradient boosting)', xlabel='Importance');

In [0]:
def _scored_frame(features, actual, predicted, probabilities, split_name):
    """Return a copy of `features` with the prediction columns appended.

    Parameters
    ----------
    features : pandas.DataFrame
        Input rows that were scored.
    actual : pandas.Series
        True labels, aligned to `features` by index.
    predicted : array-like
        Output of `predict` for `features`.
    probabilities : numpy.ndarray
        Output of `predict_proba` for `features`; column 1 is kept,
        rounded to 3 decimals.
    split_name : str
        Value written to the `set` column ('train' or 'test').

    Returns
    -------
    pandas.DataFrame
        New frame with `Predicted`, `Actual`, `Probability`, `set` added;
        `features` itself is left untouched.
    """
    return features.assign(
        Predicted=predicted,
        Actual=actual,
        Probability=np.round(probabilities[:, 1], 3),
        set=split_name,
    )


# Score both splits with the best pipeline.
y_pred_test = best_pipeline.predict(X_test)
y_pred_train = best_pipeline.predict(X_train)
y_prob_test = best_pipeline.predict_proba(X_test)
y_prob_train = best_pipeline.predict_proba(X_train)

# Build new result frames instead of adding columns to X_train / X_test:
# the inputs stay clean, the cell can be re-run safely, and no
# SettingWithCopyWarning is raised.
train_results = _scored_frame(X_train, y_train, y_pred_train, y_prob_train, 'train')
test_results = _scored_frame(X_test, y_test, y_pred_test, y_prob_test, 'test')

# Named `results_all` rather than `all` so the builtin all() is not shadowed.
results_all = pd.concat([train_results, test_results])

# Overwrite the Delta results table with the combined train + test scores.
(
    spark.createDataFrame(results_all)
    .write
    .format('delta')
    .option('maxRecordsPerFile', 2000)
    .option("overwriteSchema", "false")
    .mode('overwrite')
    .saveAsTable("avengers.default.gbtresults_with_ai_V3")
)


In [0]:
# Display the rows of the results table whose true label (`Actual`) is 1.
# `f` (pyspark.sql.functions) comes from the notebook's top import cell.
spark.table('avengers.default.gbtresults_with_ai_V3').filter(f.col('Actual') == 1).display()

In [0]:
# Display the scored rows for pilot P0286.
# `col` comes from the notebook's top import cell.
# NOTE(review): this reads the ..._V2 results table, while this notebook
# writes ..._V3 — confirm that V2 is the intended table.
spark.table('avengers.default.gbtresults_with_ai_V2').filter(col("pilot_id") == 'P0286').display()


In [0]:
# Row count of the v2 source dataset.
source_dataset_v2 = spark.table("avengers.default.ai_generated_dataset_complete_v2")
source_dataset_v2.count()

In [0]:
# Display the v2 source rows for pilot P0454.
is_selected_pilot = col('pilot_id') == 'P0454'
spark.table("avengers.default.ai_generated_dataset_complete_v2").filter(is_selected_pilot).display()