In [None]:
! pip install xgboost
! pip install pandas
! pip install scikit-learn
! pip install pyarrow

In [None]:
################################ Libraries ################################
from Input_Variables.read_vars import xgboost_regression_model_storage_location, \
                                      linear_regression_model_storage_location, \
                                      random_forest_regression_model_storage_location, \
                                      factorization_machines_regression_model_storage, \
                                      xgboost_classification_model_storage_location, \
                                      logistic_regression_classification_model_storage_location, \
                                      random_forest_classification_model_storage_location, \
                                      multilayer_perceptron_classification_model_storage_location, \
                                      naive_bayes_classification_model_storage_location, \
                                      factorization_machine_classification_model_storage_location, \
                                      random_seed
from Read_In_Data.read_data import Reading_Data
from Data_Pipeline.scaling_pipeline import Feature_Transformations
from Model_Creation.regression_models import Create_Regression_Models
from Model_Creation.classification_models import Create_Classification_Models
import os


################################ Read In Modules ################################
# One instance of each project helper used by this notebook.
reading_data = Reading_Data()
feature_transformations = Feature_Transformations()
create_regression_models = Create_Regression_Models()
create_classification_models = Create_Classification_Models()


################################ Regression, Classification, Or Both ################################
# Switches for which model family to train; no cell shown here reads them.
train_regression, train_classification = False, True


################################ Read In Data ################################
def _summary_stat_files(directory):
    """Return the full path of every entry in `directory`, skipping any path
    that contains '.crc' or 'SUCCESS'."""
    root = os.path.abspath(directory)
    candidate_paths = (os.path.join(root, entry) for entry in os.listdir(directory))
    return [path for path in candidate_paths
            if '.crc' not in path and 'SUCCESS' not in path]


# Training Summary Stats Data
training_files = _summary_stat_files('/cephfs/summary_stats/all_train_bool_updated')

# Cross Validation Summary Stats Data
val_files = _summary_stat_files('/cephfs/summary_stats/all_val_bool_updated')

# Calling DataFrames
summary_stats_train = reading_data.read_in_all_summary_stats(file_list=training_files)
summary_stats_val = reading_data.read_in_all_summary_stats(file_list=val_files)


################################ Combine Train and Cross Validation ################################
# Stack the validation rows under the training rows, then sanity-check the
# result: a two-row preview followed by (row count, column count).
df_train_val_combined = summary_stats_train.union(summary_stats_val)
df_train_val_combined.show(2)
combined_shape = (df_train_val_combined.count(), len(df_train_val_combined.columns))
print(combined_shape)


################################ Stages: Scaling Using Custom Transformer ################################
# Later cells pop from / append to this object and pass it to Pipeline(stages=...).
pipeline_transformation_stages = feature_transformations.numerical_scaling(df=df_train_val_combined)

In [None]:
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import FMClassifier

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [None]:
# Column names handed to the estimators and the evaluator in later cells.
features_col, label_name, prediction_column_name = "features", "target", "prediction"

# Modulus used to derive the fold id from NumId when building `foldCol`.
num_folds = 3

In [None]:
# Re-encode the `target` column: 0 and 1 are kept as they are, -1 becomes 2.
# Presumably needed because the classifiers expect non-negative class
# indices -- TODO confirm.
mapping = {
    0: 0,
    1: 1,
    -1: 2,
}
df_train_val_combined = df_train_val_combined.replace(mapping, subset=['target'])

In [None]:
# Presumably intended as the `layers` argument of a MultilayerPerceptronClassifier
# (4 inputs, one hidden layer of 2, 2 outputs) -- TODO confirm.
# No cell shown in this notebook reads this variable.
layers = [4, 2, 2]

In [None]:
# Factorization-machine classifier reading `features_col` / `label_name`,
# seeded with the project-wide `random_seed`.
# NOTE(review): the variable is called `mlp` although it holds an FMClassifier;
# the pipeline-building cell below refers to it by this name, so rename both together.
mlp=FMClassifier(featuresCol=features_col, 
               labelCol=label_name,
                seed=random_seed)

In [None]:
# Remove the last entry of the stage collection returned by numerical_scaling;
# the removed stage is echoed as this cell's output.
# NOTE(review): this mutates in place, so every re-run of the cell drops one
# more stage. Why the last stage is unwanted is not visible here -- confirm.
pipeline_transformation_stages.pop()

In [None]:
# Display the stages currently held, to check the effect of the pop() cell above.
pipeline_transformation_stages

In [None]:
# Build the pipeline from a *new* list (preprocessing stages + classifier)
# instead of appending to `pipeline_transformation_stages` in place.
# Appending mutated the shared list: re-running this cell stacked a second
# classifier onto the pipeline, and the classifier stayed in the list for
# every later cell that reuses the preprocessing stages.
pipeline=Pipeline(stages=list(pipeline_transformation_stages)+[mlp])

In [None]:
# Fit the pipeline built in the previous cell on the combined
# train + validation frame (no cross validation at this point).
model=pipeline.fit(df_train_val_combined)

In [None]:
# Fold id derived from NumId, so all rows sharing a NumId land in the same fold.
fold_index = df_train_val_combined["NumId"] % num_folds
ml_df = df_train_val_combined.withColumn("foldCol", fold_index)

# Multiclass log-loss evaluator comparing the prediction column to the label.
evaluator_logloss = MulticlassClassificationEvaluator(
    predictionCol=prediction_column_name,
    labelCol=label_name,
    metricName='logLoss',
)

# Nothing is added to the grid, so no hyperparameters are varied.
paramGrid = ParamGridBuilder().build()

In [None]:
# `xgb` is not defined in any cell shown in this notebook, so on a fresh kernel
# this cell raised NameError. It is built here with the same arguments as the
# 'XGBoost' entry of the `model_mapping` dict further down.
xgb=SparkXGBClassifier(features_col=features_col,
                       label_col=label_name,
                       random_state=random_seed,
                       use_gpu=True)

# foldCol is computed as NumId % num_folds, so the validator must be told the
# same number of folds; otherwise changing num_folds would silently disagree
# with CrossValidator's own default.
crossval=CrossValidator(estimator=xgb,
                        evaluator=evaluator_logloss,
                        estimatorParamMaps=paramGrid,
                        numFolds=num_folds,
                        foldCol='foldCol',
                        collectSubModels=False)

In [None]:
# Keep only the preprocessing stages: an earlier cell appends the FMClassifier
# to this same list, and appending the CrossValidator after it would put two
# estimators in one pipeline. Building a new list also makes this cell safe
# to re-run (the in-place append added one more CrossValidator each time).
preprocessing_stages=[stage for stage in pipeline_transformation_stages
                      if not isinstance(stage, (FMClassifier, CrossValidator))]
pipeline=Pipeline(stages=preprocessing_stages+[crossval])

# Fit on the frame that carries the user-defined fold column.
model=pipeline.fit(ml_df)

In [None]:
        # Cross-validate and save one pipeline per classifier type.
        #
        # This cell was pasted from a class method: it referenced `self.*`,
        # `stages` and `classification_models_storage_locations`, none of which
        # exist at notebook level. It is rewired here to the notebook's own names.
        model_types=['XGBoost', 'Logistic_Regression', 'Random_Forest', 'Multilayer_Perceptron', 'Naive_Bayes', 'Factorization_Machine']

        # Save locations imported from Input_Variables.read_vars.
        # NOTE(review): assumed to follow the order of `model_types` -- confirm
        # against the class this cell was copied from.
        classification_models_storage_locations=[xgboost_classification_model_storage_location,
                                                 logistic_regression_classification_model_storage_location,
                                                 random_forest_classification_model_storage_location,
                                                 multilayer_perceptron_classification_model_storage_location,
                                                 naive_bayes_classification_model_storage_location,
                                                 factorization_machine_classification_model_storage_location]

        # NOTE(review): MultilayerPerceptronClassifier is created without `layers`,
        # exactly as in the original; confirm that is sufficient before running it.
        model_mapping={'XGBoost': SparkXGBClassifier(features_col=features_col,
                                                     label_col=label_name,
                                                     random_state=random_seed,
                                                     use_gpu=True),

                       'Logistic_Regression': LogisticRegression(featuresCol=features_col,
                                                                 labelCol=label_name,
                                                                 standardization=False),

                       'Random_Forest': RandomForestClassifier(featuresCol=features_col,
                                                               labelCol=label_name,
                                                               seed=random_seed),

                       'Multilayer_Perceptron': MultilayerPerceptronClassifier(featuresCol=features_col,
                                                                               labelCol=label_name,
                                                                               seed=random_seed),

                       'Naive_Bayes': NaiveBayes(featuresCol=features_col,
                                                 labelCol=label_name),

                       'Factorization_Machine': FMClassifier(featuresCol=features_col,
                                                             labelCol=label_name,
                                                             seed=random_seed)
                      }

        # Fold id derived from NumId; derived from the combined frame so the
        # cell does not depend on an earlier definition of ml_df.
        ml_df=df_train_val_combined.withColumn("foldCol", df_train_val_combined.NumId % num_folds)

        evaluator_logloss=MulticlassClassificationEvaluator(metricName='logLoss',
                                                            labelCol=label_name,
                                                            predictionCol=prediction_column_name)
        paramGrid=ParamGridBuilder().build()

        # Preprocessing stages only. The shared list is never mutated here, so
        # each model gets a clean pipeline and the cell can be re-run; any
        # classifier / CrossValidator left in the list by earlier cells is dropped.
        base_stages=[stage for stage in pipeline_transformation_stages
                     if not isinstance(stage, (FMClassifier, CrossValidator))]

        for model_type, storage_location in zip(model_types, classification_models_storage_locations):
            print(f'Currently on {model_type} Model')
            # numFolds matches the modulus used to build foldCol above.
            crossval=CrossValidator(estimator=model_mapping[model_type],
                                    evaluator=evaluator_logloss,
                                    estimatorParamMaps=paramGrid,
                                    numFolds=num_folds,
                                    foldCol='foldCol',
                                    collectSubModels=False)

            print('Cross Validation Occuring')
            pipeline=Pipeline(stages=base_stages+[crossval])

            model=pipeline.fit(ml_df)

            # Overwrites whatever was previously saved at this location.
            model.write().overwrite().save(storage_location)
            print(f'Model Saved to {storage_location}')

In [17]:
from pyspark.ml import Pipeline, PipelineModel

In [1]:
################################ Libraries ################################
# from Input_Variables.read_vars import xgb_reg_model_storage_location, xgb_class_model_storage_location, random_seed, \
#                                       evaluation_metrics_output_storage, \
#                                       feature_importance_storage_location, \
#                                       overall_feature_importance_plot_location
from Read_In_Data.read_data import Reading_Data

from Model_Predictions.pyspark_model_preds import Model_Predictions
from Model_Evaluation.pyspark_model_eval import Evaluate_Model
from Feature_Importance.model_feature_importance import Feature_Importance
from Model_Plots.xgboost_classification_plots import XGBoost_Classification_Plot
import os


################################ Read In Modules ################################
# Only the reader is instantiated; the other helpers are left disabled below.
reading_data = Reading_Data()

# model_predictions=Model_Predictions()
# evaluate_model=Evaluate_Model()
# feature_importance=Feature_Importance()
# xgboost_classification_plot=XGBoost_Classification_Plot()


################################ Read In Data ################################
# Testing Summary Stats Data: full path of every directory entry, skipping any
# path that contains '.crc' or 'SUCCESS'.
test_dir = '/cephfs/summary_stats/all_test_bool_updated'
test_files = []
for entry in os.listdir(test_dir):
    full_path = os.path.join(os.path.abspath(test_dir), entry)
    if '.crc' in full_path or 'SUCCESS' in full_path:
        continue
    test_files.append(full_path)

# Calling DataFrames
summary_stats_test = reading_data.read_in_all_summary_stats(file_list=test_files)

# Two-row preview, then (row count, column count).
summary_stats_test.show(2)
test_shape = (summary_stats_test.count(), len(summary_stats_test.columns))
print(test_shape)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/31 20:58:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

23/05/31 20:58:33 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-----+-----+-----------------+----------------+-------------+-------------+------------------+------------------+------------------+------+-----+-----+--------------------+--------------------+-----------------+------------------+----------+----------+---------------+------------+------+----------+--------+------------------+-------------------------+------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|NumId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|SampleEntropy|PermutationEntropy|              Mean|            StdDev|Median|  Min|  Max|        AvgFirstDiff|          AvgSecDiff|     StdFirstDiff|        StdSecDiff|CountAbove|CountBelow|TotalOutOfRange|DiffPrevious|target|Sex_Female|Sex_Male|Treatment_yes_both|Tr

In [30]:
# Load the fitted pipeline saved under the Classification/XGBoost directory.
# NOTE(review): the name `pipelineModel` is also used for the regression pipeline
# in another cell, so which model it holds depends on cell execution order.
pipelineModel=PipelineModel.load("/cephfs/Saved_Models/No_Hyperparameter_Tuning/Classification/XGBoost")

In [31]:
# Run the loaded pipeline over the test summary stats.
# NOTE(review): the training notebook section re-encodes target -1 -> 2 before
# fitting; `summary_stats_test` is not re-encoded here -- confirm before comparing
# `prediction` with `target`.
preds=pipelineModel.transform(summary_stats_test)

In [34]:
# Preview the row identifiers next to the model scores, the predicted class and the true target.
preds.select('NumId', 'Chunk', 'rawPrediction', 'probability', 'prediction', 'target').show()

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



+-----+-----+--------------------+--------------------+----------+------+
|NumId|Chunk|       rawPrediction|         probability|prediction|target|
+-----+-----+--------------------+--------------------+----------+------+
|   12|    1|[1.14917623996734...|[0.51960194110870...|       0.0|     1|
|   12|    2|[1.18391394615173...|[0.51820838451385...|       0.0|     0|
|   12|    7|[0.92625272274017...|[0.43397551774978...|       0.0|     0|
|   12|   17|[1.78566992282867...|[0.77350419759750...|       0.0|     0|
|   12|   19|[1.63358390331268...|[0.78384935855865...|       0.0|     0|
|   12|   20|[1.21677529811859...|[0.59110909700393...|       0.0|     0|
|   12|   30|[1.44567775726318...|[0.66280370950698...|       0.0|     0|
|   12|   33|[0.45569711923599...|[0.35991567373275...|       1.0|     0|
|   12|   37|[1.68670558929443...|[0.83340883255004...|       0.0|     1|
|   12|   40|[1.02925384044647...|[0.60148757696151...|       0.0|     0|
|   12|   45|[1.28640818595886...|[0.6

                                                                                

In [33]:
# Preview two full rows of `preds`, i.e. the input columns plus everything the pipeline added.
preds.show(2)

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



+-----+-----+-----------------+----------------+-------------+-------------+------------------+------------------+------------------+------+----+-----+--------------------+--------------------+-----------------+-----------------+----------+----------+---------------+------------+------+----------+--------+------------------+-------------------------+------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------------------+-----------------------+--------------------+--------------------+-------------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+----------------------+--------------------+--------------------+----------+--------------------+
|NumId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|SampleEntrop

                                                                                

In [26]:
# Load the fitted pipeline saved under the Regression/XGBoost directory.
# NOTE(review): reuses the name `pipelineModel` that another cell binds to the
# classification pipeline; whichever cell ran last wins.
pipelineModel=PipelineModel.load("/cephfs/Saved_Models/No_Hyperparameter_Tuning/Regression/XGBoost")

In [27]:
# Run whichever pipeline `pipelineModel` currently holds over the test summary stats.
# NOTE(review): `preds` is also the name used for the classification predictions
# in another cell, so its contents depend on execution order.
preds=pipelineModel.transform(summary_stats_test)

In [29]:
# Preview predictions beside `DiffPrevious` (presumably the regression target -- TODO confirm).
preds.select('prediction', 'DiffPrevious').show()

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



+-------------------+------------+
|         prediction|DiffPrevious|
+-------------------+------------+
| 15.160192489624023|          54|
| 11.676578521728516|           0|
|  5.882169723510742|          -7|
| 3.9520092010498047|           0|
|  6.482405185699463|           3|
| 1.2447888851165771|          -5|
| 10.977035522460938|           3|
| 12.524152755737305|          -1|
|  6.483696937561035|          16|
| 3.5730857849121094|          -2|
|   7.81408166885376|           0|
|-16.302438735961914|         -18|
|  5.971994876861572|          18|
| 3.9098498821258545|          12|
| -34.38546371459961|         -60|
|-18.264915466308594|         -16|
| 10.801153182983398|          16|
| -23.75125503540039|         -35|
| 13.340191841125488|           0|
| 15.376773834228516|          12|
+-------------------+------------+
only showing top 20 rows



                                                                                