In [1]:
from pyspark.sql.functions import isnull,sum

In [2]:
# Importing pyspark and starting session to use sprak functionality 
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,when,isnan,count
from pyspark.sql.types import IntegerType, StructType,StructField, FloatType , DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler,StandardScaler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import regexp_replace
from pyspark.ml.regression import LinearRegression 
spark = SparkSession.builder.appName("hotel").getOrCreate()


In [3]:
# reading csv file
dataset = spark.read.csv('hotel_bookings.csv',inferSchema=True,header=True) # infer schema automatocally guess column datatypes
dataset.show(5)

+------------+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+-------+--------------+--------------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+------------+-----+-------+--------------------+-------------+----+---------------------------+-------------------------+------------------+-----------------------+
|       hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|country|market_segment|distribution_channel|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|deposit_type|agent|company|days_in_waiting_list|customer_type| adr|required_car_parking_spaces|total_

In [4]:
dataset.printSchema()

root
 |-- hotel: string (nullable = true)
 |-- is_canceled: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_date_year: integer (nullable = true)
 |-- arrival_date_month: string (nullable = true)
 |-- arrival_date_week_number: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: string (nullable = true)
 |-- babies: integer (nullable = true)
 |-- meal: string (nullable = true)
 |-- country: string (nullable = true)
 |-- market_segment: string (nullable = true)
 |-- distribution_channel: string (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- reserved_room_type: string (nullable = true)
 |-- assigned_room_type: string (nullab

In [5]:
dataset.count()

119390

In [6]:
distinct_counts = {}

for col in dataset.columns:
    distinct_counts[col] = dataset.select(col).distinct().count()

print(distinct_counts)

{'hotel': 2, 'is_canceled': 2, 'lead_time': 479, 'arrival_date_year': 3, 'arrival_date_month': 12, 'arrival_date_week_number': 53, 'arrival_date_day_of_month': 31, 'stays_in_weekend_nights': 17, 'stays_in_week_nights': 35, 'adults': 14, 'children': 6, 'babies': 5, 'meal': 5, 'country': 178, 'market_segment': 8, 'distribution_channel': 5, 'is_repeated_guest': 2, 'previous_cancellations': 15, 'previous_bookings_not_canceled': 73, 'reserved_room_type': 10, 'assigned_room_type': 12, 'booking_changes': 21, 'deposit_type': 3, 'agent': 334, 'company': 353, 'days_in_waiting_list': 128, 'customer_type': 4, 'adr': 8879, 'required_car_parking_spaces': 5, 'total_of_special_requests': 6, 'reservation_status': 3, 'reservation_status_date': 926}


In [7]:
from pyspark.sql.functions import count, countDistinct

# Create an empty dictionary to store the duplicate counts for each column
duplicate_counts = {}

# Loop over all columns in the DataFrame
for col in dataset.columns:
    # Count the total number of rows in the column
    total_count = dataset.select(col).count()
    # Count the number of distinct rows in the column
    distinct_count = dataset.select(col).distinct().count()
    # Count the number of duplicate rows in the column
    duplicate_count = total_count - distinct_count
    # Store the duplicate count for this column in the dictionary
    duplicate_counts[col] = duplicate_count

# Print the duplicate counts for all columns
print(duplicate_counts)


{'hotel': 119388, 'is_canceled': 119388, 'lead_time': 118911, 'arrival_date_year': 119387, 'arrival_date_month': 119378, 'arrival_date_week_number': 119337, 'arrival_date_day_of_month': 119359, 'stays_in_weekend_nights': 119373, 'stays_in_week_nights': 119355, 'adults': 119376, 'children': 119384, 'babies': 119385, 'meal': 119385, 'country': 119212, 'market_segment': 119382, 'distribution_channel': 119385, 'is_repeated_guest': 119388, 'previous_cancellations': 119375, 'previous_bookings_not_canceled': 119317, 'reserved_room_type': 119380, 'assigned_room_type': 119378, 'booking_changes': 119369, 'deposit_type': 119387, 'agent': 119056, 'company': 119037, 'days_in_waiting_list': 119262, 'customer_type': 119386, 'adr': 110511, 'required_car_parking_spaces': 119385, 'total_of_special_requests': 119384, 'reservation_status': 119387, 'reservation_status_date': 118464}


In [8]:
print("Number of rows in the original DataFrame:", dataset.count())
dataset = dataset.distinct() # droping a distinct/duplicates values
dataset.count()
print("Number of rows after droping duplicates DataFrame:", dataset.count())

Number of rows in the original DataFrame: 119390
Number of rows after droping duplicates DataFrame: 87396


In [9]:
null_count = dataset.select([sum(isnull(c).cast("int")).alias(c) for c in dataset.columns])
null_count.show()

+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+-------+--------------+--------------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+------------+-----+-------+--------------------+-------------+---+---------------------------+-------------------------+------------------+-----------------------+
|hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|country|market_segment|distribution_channel|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|deposit_type|agent|company|days_in_waiting_list|customer_type|adr|required_car_parking_spaces|total_of_special_reque

In [10]:
df = dataset.drop("adr", "country", "distribution_channel", "deposit_type", "agent", "company","arrival_date_week_number")
df.show()

+------------+-----------+---------+-----------------+------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+--------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+--------------------+---------------+---------------------------+-------------------------+------------------+-----------------------+
|       hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|market_segment|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|days_in_waiting_list|  customer_type|required_car_parking_spaces|total_of_special_requests|reservation_status|reservation_status_date|
+------------+-----------+---------+-----------------+------------------+-------------------------+-----

In [11]:
df = df.withColumnRenamed("arrival_date_year", "arrival_year").withColumnRenamed("arrival_date_month", "arrival_month").withColumnRenamed("arrival_date_day_of_month", "arrival_day")
df.show(5)

+------------+-----------+---------+------------+-------------+-----------+-----------------------+--------------------+------+--------+------+----+--------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+--------------------+-------------+---------------------------+-------------------------+------------------+-----------------------+
|       hotel|is_canceled|lead_time|arrival_year|arrival_month|arrival_day|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|market_segment|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|days_in_waiting_list|customer_type|required_car_parking_spaces|total_of_special_requests|reservation_status|reservation_status_date|
+------------+-----------+---------+------------+-------------+-----------+-----------------------+--------------------+------+--------+------+----+--------

In [12]:
column_names = df.columns

# print the column names
print(column_names)

['hotel', 'is_canceled', 'lead_time', 'arrival_year', 'arrival_month', 'arrival_day', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'market_segment', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'days_in_waiting_list', 'customer_type', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'reservation_status_date']


In [13]:
df.toPandas().to_excel('final.xlsx', sheet_name = 'Sheet1', index = False)

In [14]:
df.printSchema()

root
 |-- hotel: string (nullable = true)
 |-- is_canceled: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_year: integer (nullable = true)
 |-- arrival_month: string (nullable = true)
 |-- arrival_day: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: string (nullable = true)
 |-- babies: integer (nullable = true)
 |-- meal: string (nullable = true)
 |-- market_segment: string (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- reserved_room_type: string (nullable = true)
 |-- assigned_room_type: string (nullable = true)
 |-- booking_changes: integer (nullable = true)
 |-- days_in_waiting_list: integer (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- required_c

In [15]:
strinx = StringIndexer(inputCols = ["hotel","arrival_month","children","meal","market_segment","reserved_room_type","assigned_room_type","customer_type","reservation_status"],outputCols =["hotel_trans","arrival_month_trans","children_trans","meal_trans","market_segment_trans","reserved_room_type_trans","assigned_room_type_trans","customer_type_trans","reservation_status_trans"] )  

In [16]:
df = strinx.fit(df).transform(df)

In [17]:
df.show()

+------------+-----------+---------+------------+-------------+-----------+-----------------------+--------------------+------+--------+------+----+--------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+--------------------+---------------+---------------------------+-------------------------+------------------+-----------------------+-----------+-------------------+--------------+----------+--------------------+------------------------+------------------------+-------------------+------------------------+
|       hotel|is_canceled|lead_time|arrival_year|arrival_month|arrival_day|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|market_segment|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|days_in_waiting_list|  customer_type|required_car_parking_spaces|total_of_special_requests|reservation_status|res

In [18]:
from pyspark.sql.functions import *
from pyspark.sql.functions import year, month, dayofweek
df = df.withColumn('dayOfWeek', dayofweek(col('reservation_status_date')))
df = df.withColumn('month', month(col('reservation_status_date')))
df = df.withColumn('year', year(col('reservation_status_date')))

In [19]:
df.show()

+------------+-----------+---------+------------+-------------+-----------+-----------------------+--------------------+------+--------+------+----+--------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+--------------------+---------------+---------------------------+-------------------------+------------------+-----------------------+-----------+-------------------+--------------+----------+--------------------+------------------------+------------------------+-------------------+------------------------+---------+-----+----+
|       hotel|is_canceled|lead_time|arrival_year|arrival_month|arrival_day|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|market_segment|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|days_in_waiting_list|  customer_type|required_car_parking_spaces|total_of_special_requests|r

In [20]:
onenc = OneHotEncoder(inputCols = ["hotel_trans","arrival_month_trans","children_trans","meal_trans","market_segment_trans","reserved_room_type_trans","assigned_room_type_trans","customer_type_trans","reservation_status_trans"],
                      outputCols =["hotel_VEC","arrival_month_VEC","children_VEC","meal_VEC","market_segment_VEC","reserved_room_type_VEC","assigned_room_type_VEC","customer_type_VEC","reservation_status_VEC"] )
df = onenc.fit(df).transform(df)

In [21]:
df.show(5,vertical=True)

-RECORD 0----------------------------------------
 hotel                          | Resort Hotel   
 is_canceled                    | 0              
 lead_time                      | 9              
 arrival_year                   | 2015           
 arrival_month                  | July           
 arrival_day                    | 13             
 stays_in_weekend_nights        | 1              
 stays_in_week_nights           | 1              
 adults                         | 2              
 children                       | 0              
 babies                         | 0              
 meal                           | HB             
 market_segment                 | Online TA      
 is_repeated_guest              | 0              
 previous_cancellations         | 0              
 previous_bookings_not_canceled | 0              
 reserved_room_type             | A              
 assigned_room_type             | A              
 booking_changes                | 0              


In [22]:
column_names = df.columns

# print the column names
print(column_names)

['hotel', 'is_canceled', 'lead_time', 'arrival_year', 'arrival_month', 'arrival_day', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'market_segment', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'days_in_waiting_list', 'customer_type', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'reservation_status_date', 'hotel_trans', 'arrival_month_trans', 'children_trans', 'meal_trans', 'market_segment_trans', 'reserved_room_type_trans', 'assigned_room_type_trans', 'customer_type_trans', 'reservation_status_trans', 'dayOfWeek', 'month', 'year', 'hotel_VEC', 'arrival_month_VEC', 'children_VEC', 'meal_VEC', 'market_segment_VEC', 'reserved_room_type_VEC', 'assigned_room_type_VEC', 'customer_type_VEC', 'reservation_status_VEC']


In [23]:
df_dropped = df.drop("hotel","arrival_month","children","meal","market_segment","reserved_room_type","assigned_room_type","customer_type","reservation_status","reservation_status_date")
df_dropped.show()

+-----------+---------+------------+-----------+-----------------------+--------------------+------+------+-----------------+----------------------+------------------------------+---------------+--------------------+---------------------------+-------------------------+-----------+-------------------+--------------+----------+--------------------+------------------------+------------------------+-------------------+------------------------+---------+-----+----+---------+-----------------+-------------+-------------+------------------+----------------------+----------------------+-----------------+----------------------+
|is_canceled|lead_time|arrival_year|arrival_day|stays_in_weekend_nights|stays_in_week_nights|adults|babies|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|booking_changes|days_in_waiting_list|required_car_parking_spaces|total_of_special_requests|hotel_trans|arrival_month_trans|children_trans|meal_trans|market_segment_trans|reserved_room_type_tran

In [24]:
new_column_names = df_dropped.columns


In [25]:
Va = VectorAssembler()
vA = Va.setParams(inputCols=new_column_names, outputCol='features')
df_dropped = vA.transform(df_dropped)
df.show(5,vertical=True)

-RECORD 0----------------------------------------
 hotel                          | Resort Hotel   
 is_canceled                    | 0              
 lead_time                      | 9              
 arrival_year                   | 2015           
 arrival_month                  | July           
 arrival_day                    | 13             
 stays_in_weekend_nights        | 1              
 stays_in_week_nights           | 1              
 adults                         | 2              
 children                       | 0              
 babies                         | 0              
 meal                           | HB             
 market_segment                 | Online TA      
 is_repeated_guest              | 0              
 previous_cancellations         | 0              
 previous_bookings_not_canceled | 0              
 reserved_room_type             | A              
 assigned_room_type             | A              
 booking_changes                | 0              


In [26]:
from pyspark.ml.feature import MinMaxScaler
# scaler = MinMaxScaler(inputCol="features", outputCol="features")
# scalerModel = scaler.fit(df_dropped)

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
# rescale each feature to range [min, max].
scaledData = scaler.fit(df_dropped).transform(df_dropped)
scaledData.show()


+-----------+---------+------------+-----------+-----------------------+--------------------+------+------+-----------------+----------------------+------------------------------+---------------+--------------------+---------------------------+-------------------------+-----------+-------------------+--------------+----------+--------------------+------------------------+------------------------+-------------------+------------------------+---------+-----+----+---------+-----------------+-------------+-------------+------------------+----------------------+----------------------+-----------------+----------------------+--------------------+--------------------+
|is_canceled|lead_time|arrival_year|arrival_day|stays_in_weekend_nights|stays_in_week_nights|adults|babies|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|booking_changes|days_in_waiting_list|required_car_parking_spaces|total_of_special_requests|hotel_trans|arrival_month_trans|children_trans|meal_trans|ma

In [27]:
splt = scaledData.randomSplit([0.7,0.3])
train_df= splt[0]
test_df = splt[1]

In [28]:
#logistic Regression
from pyspark.ml.classification import LogisticRegression
logic = LogisticRegression(featuresCol = 'features',labelCol = 'is_canceled', maxIter = 500)
logicModel = logic.fit(train_df)
predictions = logicModel.transform(test_df)
predictions.select('is_canceled', 'rawPrediction', 'prediction', 'probability').show(10)

+-----------+--------------------+----------+--------------------+
|is_canceled|       rawPrediction|prediction|         probability|
+-----------+--------------------+----------+--------------------+
|          0|[19.6068810229503...|       0.0|[0.99999999694620...|
|          0|[19.7669960119323...|       0.0|[0.99999999739802...|
|          0|[20.0012668535961...|       0.0|[0.99999999794145...|
|          0|[19.7943833416078...|       0.0|[0.99999999746832...|
|          0|[19.5039269256352...|       0.0|[0.99999999661505...|
|          0|[19.8820073592314...|       0.0|[0.99999999768071...|
|          0|[19.4659800180892...|       0.0|[0.99999999648413...|
|          0|[19.0605170966538...|       0.0|[0.99999999472621...|
|          0|[19.4711047502582...|       0.0|[0.99999999650210...|
|          0|[19.0824349249650...|       0.0|[0.99999999484054...|
+-----------+--------------------+----------+--------------------+
only showing top 10 rows



In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("is_canceled")
evaluator.setRawPredictionCol("prediction")
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 1.0


In [30]:
accu = logicModel.summary
accu.accuracy

1.0

In [31]:
#Decision Tree
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'scaledFeatures', labelCol = 'is_canceled', maxDepth = 3)
dtModel = dt.fit(train_df)
predictions = dtModel.transform(test_df)
predictions.select('is_canceled', 'rawPrediction', 'prediction', 'probability').show(10)


+-----------+-------------+----------+-----------+
|is_canceled|rawPrediction|prediction|probability|
+-----------+-------------+----------+-----------+
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
+-----------+-------------+----------+-----------+
only showing top 10 rows



In [32]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("is_canceled")
evaluator.setRawPredictionCol("prediction")
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 1.0


In [33]:
#Random Forrest
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'scaledFeatures', labelCol = 'is_canceled')
rfModel = rf.fit(train_df)
predictions = dtModel.transform(test_df)
predictions.select('is_canceled', 'rawPrediction', 'prediction', 'probability').show(10)

+-----------+-------------+----------+-----------+
|is_canceled|rawPrediction|prediction|probability|
+-----------+-------------+----------+-----------+
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
|          0|[44437.0,0.0]|       0.0|  [1.0,0.0]|
+-----------+-------------+----------+-----------+
only showing top 10 rows



In [34]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("is_canceled")
evaluator.setRawPredictionCol("prediction")
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 1.0


In [35]:
#Gradient Boosting
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(featuresCol = 'scaledFeatures', labelCol = 'is_canceled', maxIter=10)
gbtModel = gbt.fit(train_df)
predictions = gbtModel.transform(test_df)
predictions.select('is_canceled', 'rawPrediction', 'prediction', 'probability').show(10)

+-----------+--------------------+----------+--------------------+
|is_canceled|       rawPrediction|prediction|         probability|
+-----------+--------------------+----------+--------------------+
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
|          0|[1.32590267922034...|       0.0|[0.93412217565278...|
+-----------+--------------------+----------+--------------------+
only showing top 10 rows



In [36]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("is_canceled")
evaluator.setRawPredictionCol("prediction")
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 1.0
