## Load the csv file

In [16]:
# Finds the spark path 
import findspark
findspark.init()

import pyspark
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

spark = SparkSession.builder \
     .master("local") \
     .appName("hotels") \
     .getOrCreate()

hotels_df = spark.read.csv("../input/Hotels_data_Changed.csv", header=True)

### Get the highest discount code for features

In [17]:
from pyspark.sql import Row

def rowToKeyValue(row):
    key = (row['WeekDay'], row["Snapshot Date"], row["Checkin Date"], row["DayDiff"], row["Hotel Name"])
    val = ([row["Discount Code"]], row['DiscountPerc'])
    return (key,val)

def reduceToMaxDiscountPerKey(val1, val2):
    codes1, discount1 = val1
    codes2, discount2 = val2
    if (discount1 > discount2):
        return val1
    elif(discount2 > discount1):
        return val2
    else: # In case the discounts are equals, merge the prices to same array
        return (codes1+ codes2, discount1)

def flatMapDiscountCodes(row):
    key, val = row
    codes = val[0]
    # Return list of key & code
    return [(key, code) for code in codes]

def rddToRow(rddRow):
    return Row(WeekDay=rddRow[0][0], SnapshotDate=rddRow[0][1], CheckinDate=rddRow[0][2],\
                DayDiff=rddRow[0][3], HotelName=rddRow[0][4], DiscountCode=rddRow[1])

hotelsBestDiscountCode_df = hotels_df.rdd\
                .map(rowToKeyValue)\
                .reduceByKey(reduceToMaxDiscountPerKey)\
                .flatMap(flatMapDiscountCodes)\
                .map(rddToRow).toDF()
hotelsBestDiscountCode_df.show()

+-----------+-------+------------+--------------------+------------+-------+
|CheckinDate|DayDiff|DiscountCode|           HotelName|SnapshotDate|WeekDay|
+-----------+-------+------------+--------------------+------------+-------+
| 2015-08-12|     26|           2|Best Western Plus...|  2015-07-17|    Wed|
| 2015-08-19|     33|           2|Best Western Plus...|  2015-07-17|    Wed|
| 2015-08-13|     27|           2|The Peninsula New...|  2015-07-17|    Thu|
| 2015-07-26|      9|           1|Eventi Hotel a Ki...|  2015-07-17|    Sun|
| 2015-08-12|     26|           2|Eventi Hotel a Ki...|  2015-07-17|    Wed|
| 2015-08-07|     21|           1|Grand Hyatt New York|  2015-07-17|    Fri|
| 2015-08-09|     23|           1|Grand Hyatt New York|  2015-07-17|    Sun|
| 2015-08-12|     26|           1|Grand Hyatt New York|  2015-07-17|    Wed|
| 2015-08-13|     27|           3|Grand Hyatt New York|  2015-07-17|    Thu|
| 2015-07-22|      5|           2|Hilton New York F...|  2015-07-17|    Wed|

## Normalize the data

In [18]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import pandas as pd

# Transform string values to numeric
indexers = [StringIndexer(inputCol="WeekDay", outputCol="WeekDayIndex"),
            StringIndexer(inputCol="HotelName", outputCol="HotelNameIndex"),]
pipeline = Pipeline(stages=indexers)
hotelsWithIndexedStrings_df = pipeline.fit(hotelsBestDiscountCode_df).transform(hotelsBestDiscountCode_df)

# Extract date values
dateYearValue = udf(lambda x: pd.to_datetime(x).year, IntegerType())
dateDayValue = udf(lambda x: pd.to_datetime(x).day, IntegerType())
dateMonthValue = udf(lambda x: pd.to_datetime(x).month, IntegerType())

hotelsWithDateIndexed_df = hotelsWithIndexedStrings_df\
                     .withColumn('SnapshotDateYear', dateYearValue(col('SnapshotDate')))\
                     .withColumn('SnapshotDateMonth', dateMonthValue(col('SnapshotDate')))\
                     .withColumn('SnapshotDateDay', dateDayValue(col('SnapshotDate')))\
                     .withColumn('CheckinDateYear', dateYearValue(col('CheckinDate')))\
                     .withColumn('CheckinDateMonth', dateMonthValue(col('CheckinDate')))\
                     .withColumn('CheckinDateDay', dateDayValue(col('CheckinDate')))
# Convert string column to int
hotelsWithIntCoulmn_df = hotelsWithDateIndexed_df.withColumn("DayDiff",\
                                   hotelsWithDateIndexed_df["DayDiff"].cast("integer"))\
                                .withColumn('DiscountCode', hotelsWithDateIndexed_df['DiscountCode'].cast('integer'))

# Remove unneccesary columns
hotelsWithoutColumns_df = hotelsWithIntCoulmn_df.drop('SnapshotDate').drop('CheckinDate')\
                         .drop('HotelName').drop('WeekDay')
hotelsWithoutColumns_df.show()

+-------+------------+------------+--------------+----------------+-----------------+---------------+---------------+----------------+--------------+
|DayDiff|DiscountCode|WeekDayIndex|HotelNameIndex|SnapshotDateYear|SnapshotDateMonth|SnapshotDateDay|CheckinDateYear|CheckinDateMonth|CheckinDateDay|
+-------+------------+------------+--------------+----------------+-----------------+---------------+---------------+----------------+--------------+
|     26|           2|         0.0|         144.0|            2015|                7|             17|           2015|               8|            12|
|     33|           2|         0.0|         144.0|            2015|                7|             17|           2015|               8|            19|
|     27|           2|         1.0|          73.0|            2015|                7|             17|           2015|               8|            13|
|      9|           1|         6.0|          83.0|            2015|                7|             17

### Distinct values

In [4]:
from pyspark.sql.functions import col, countDistinct

def transposeDF(df):
    pandas_df = df.toPandas().transpose().reset_index()
    return spark.createDataFrame(pandas_df)
    
# Show distinct values count
distinctValuesDF = hotelsWithoutColumns_df.agg(*(countDistinct(col(c)).alias(c) for c in hotelsWithoutColumns_df.columns));
transposeDF(distinctValuesDF).show()

+-----------------+---+
|            index|  0|
+-----------------+---+
|          DayDiff| 34|
|     DiscountCode|  4|
|     WeekDayIndex|  7|
|   HotelNameIndex|554|
| SnapshotDateYear|  2|
|SnapshotDateMonth|  7|
|  SnapshotDateDay| 31|
|  CheckinDateYear|  2|
| CheckinDateMonth|  8|
|   CheckinDateDay| 31|
+-----------------+---+



## Columns statsitics

In [5]:
# Data exploration
describe_df = hotelsWithoutColumns_df.describe(hotelsWithoutColumns_df.columns);
transposeDF(describe_df).show()

+-----------------+------+------------------+------------------+----+-----+
|            index|     0|                 1|                 2|   3|    4|
+-----------------+------+------------------+------------------+----+-----+
|          summary| count|              mean|            stddev| min|  max|
|          DayDiff|115580|17.473083578473783|10.042081429292136|   1|   34|
|     DiscountCode|115580| 2.410892888042914|1.0198765486288277|   1|    4|
|     WeekDayIndex|115580| 2.696954490396262| 1.998808961886477| 0.0|  6.0|
|   HotelNameIndex|115580| 65.97189825229279| 80.55926349229914| 0.0|553.0|
| SnapshotDateYear|115580|2015.0051046893927|0.0712648263844711|2015| 2016|
|SnapshotDateMonth|115580| 9.564544038761031|1.6544450517640836|   1|   12|
|  SnapshotDateDay|115580| 16.81938051566015| 8.636772522445504|   1|   31|
|  CheckinDateYear|115580|2015.0822028032533|0.2746746350709511|2015| 2016|
| CheckinDateMonth|115580|  9.22885447309223|2.7938784591523005|   1|   12|
|   CheckinD

### Column correlations

In [6]:
for column in hotelsWithoutColumns_df.columns:
    corr = hotelsWithoutColumns_df.corr('DiscountCode', column)
    print("Column %s correlation: %s" % (column, corr))


Column DayDiff correlation: 0.03384978595429131
Column DiscountCode correlation: 1.0
Column WeekDayIndex correlation: -0.0444292123126588
Column HotelNameIndex correlation: -0.05653999461890728
Column SnapshotDateYear correlation: 0.0012586486536448303
Column SnapshotDateMonth correlation: 0.010092504674174543
Column SnapshotDateDay correlation: -0.0065184066646494674
Column CheckinDateYear correlation: 0.0017019981329915415
Column CheckinDateMonth correlation: 0.008843572569188144
Column CheckinDateDay correlation: -0.010942946419650216


## Run decision tree

In [7]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler

# Create vector of all features expect the label
assembler = VectorAssembler(
    inputCols=[x for x in hotelsWithoutColumns_df.columns if x != 'DiscountCode'],
    outputCol='features')

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = hotelsWithoutColumns_df.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol='DiscountCode', featuresCol='features',\
                            impurity='entropy', maxDepth=20, maxBins=554)

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[assembler, dt])

# Train model.  This also runs the indexers.
tree_model = pipeline.fit(trainingData)

## Evaluate the model

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions.
predictions = tree_model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "DiscountCode", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="DiscountCode", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuaracy = %g " %  accuracy)

+----------+------------+--------------------+
|prediction|DiscountCode|            features|
+----------+------------+--------------------+
|       4.0|           1|[1.0,0.0,1.0,2015...|
|       2.0|           1|[1.0,0.0,2.0,2015...|
|       1.0|           1|[1.0,0.0,2.0,2015...|
|       3.0|           1|[1.0,0.0,2.0,2015...|
|       1.0|           1|[1.0,0.0,3.0,2015...|
+----------+------------+--------------------+
only showing top 5 rows

Test Accuaracy = 0.654097 


### Decision Tree AUC using Spark

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F

# Binary classifications
for label in range(1,5):
    # Filter only relevanot to class predictions
    predictions.where("prediction == %s OR DiscountCode == %s" % (label, label)).select('prediction', 'DiscountCode')
                .withColumn('binaryPrediction', F.when(col('prediction')==label,1.0).otherwise(0.0))\
                .withColumn('binaryDiscountCode', F.when(col('DiscountCode')==label,1.0).otherwise(0.0))
            
    binaryPredictions.select('prediction', 'DiscountCode', 'binaryPrediction', 'binaryDiscountCode').show(5)
    
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="binaryPrediction", labelCol="binaryDiscountCode")
    auc = evaluator.evaluate(binaryPredictions)
    
    print("Class %s area under roc = %s" % (label, auc))


IndentationError: unexpected indent (<ipython-input-19-73b7ed4a1766>, line 8)

### Decision Tree ROC

In [None]:
from sklearn.metrics import roc_curve, auc
from scipy import interp
from itertools import cycle

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for label in range(1, 5):
    labelPredictions = predictions.where("prediction == %s OR DiscountCode == %s" % (label, label))\
            .select('prediction', 'DiscountCode').collect()
    
    expected = labelPredictions.map(lambda x : x[0])
    predicted = labelPredictions.map(lambda x : x[1])

    fpr[i], tpr[i], _ = roc_curve(expected, predicted, pos_label=label)
    roc_auc[i] = auc(fpr[i], tpr[i])

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(1, 5)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(1, 5):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= 4

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
lw=2
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red'])
for i, color in zip(range(1,5), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i+1.0, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Decision Tree ROC')
plt.legend(loc="lower right")
plt.show()

## Run naive bayes - old

In [None]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

model = NaiveBayes.train(training_data, 1.0)

NaiveBayes_predictionAndLabel = test_data.map(lambda p: (float(model.predict(p.features)), p.label))

naive_metrics = MulticlassMetrics(NaiveBayes_predictionAndLabel)

print('Accuracy {}'.format(naive_metrics.accuracy))
print('False positive rate {}'.format(naive_metrics.weightedFalsePositiveRate))
print(naive_metrics.confusionMatrix())

## Run naive bayes - new

In [None]:
from pyspark.ml.classification import NaiveBayes

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = hotelsWithoutColumns_df.randomSplit([0.9, 0.1])

# Train a NaiveBayes model.
dt = NaiveBayes(labelCol='DiscountCode', featuresCol='features')

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[assembler, dt])

# Train model.  This also runs the indexers.
naive_model = pipeline.fit(trainingData)

## Naive bayes model evaluation

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions.
naive_predictions = naive_model.transform(testData)

# Select example rows to display.
naive_predictions.select("prediction", "DiscountCode", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="DiscountCode", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuaracy = %g " %  accuracy)

## Print naive bayes auc

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F

# Binary classifications
for label in range(1,5):
    # Filter only relevanot to class predictions
    binaryPredictions = naive_predictions.where("prediction == %s OR DiscountCode == %s" % (label, label))\
                .withColumn('binaryPrediction', F.when(col('prediction')==label,1.0).otherwise(0.0))\
                .withColumn('binaryDiscountCode', F.when(col('DiscountCode')==label,1.0).otherwise(0.0))
            
    binaryPredictions.select('prediction', 'DiscountCode', 'binaryPrediction', 'binaryDiscountCode').show(5)
    
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="binaryPrediction", labelCol="binaryDiscountCode")
    auc = evaluator.evaluate(binaryPredictions)
    
    print("Class %s area under roc = %s" % (label, auc))


## Baive bayes ROC

In [None]:
from sklearn.metrics import roc_curve, auc
from scipy import interp
from itertools import cycle

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for label in range(1, 5):
    labelPredictions = naive_predictions.where("prediction == %s OR DiscountCode == %s" % (label, label))\
            .select('prediction', 'DiscountCode').collect()
    
    expected = labelPredictions.map(lambda x : x[0])
    predicted = labelPredictions.map(lambda x : x[1])

    fpr[i], tpr[i], _ = roc_curve(expected, predicted, pos_label=label)
    roc_auc[i] = auc(fpr[i], tpr[i])

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(1, 5)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(1, 5):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= 4

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
lw=2
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red'])
for i, color in zip(range(1,5), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i+1.0, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Decision Tree ROC')
plt.legend(loc="lower right")
plt.show()