In [None]:
# Where the uploaded cruise-ship CSV lives and which reader format to use.
file_location = "/FileStore/tables/cruise_ship_info.csv"
file_type = "csv"

# CSV reader settings: let Spark infer column types, treat the first row as
# the header, and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options only matter for CSV input; other formats ignore them.
reader = spark.read.format(file_type).options(
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)
df = reader.load(file_location)


In [None]:
# Preview the loaded data to sanity-check the parsed columns and values.
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Number of rows (ships) per cruise line.
df.groupby('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [None]:
# One row per cruise line (with its ship count) ...
group_Cruise_line = df.groupBy("Cruise_line").count()

# ... then count those rows to get the number of distinct cruise lines.
cruise_line_total = group_Cruise_line.agg({'Cruise_line': 'Count'})
cruise_line_total.show()

+------------------+
|count(Cruise_line)|
+------------------+
|                20|
+------------------+



In [None]:
# Encode the string column Cruise_line as a numeric column so it can be fed
# into the feature vector.
# NOTE(review): the index is just a label id (by default ordered by label
# frequency); a linear model will treat it as an ordinal quantity — confirm
# that is acceptable or consider one-hot encoding.
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_line_number')

# Learn the label -> index mapping from df, then apply it to the same data.
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
df2 = indexed

In [None]:
# Preview the data with the added Cruise_line_number column.
df2.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_number|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|              16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|              16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|               1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|               1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|               1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2

In [None]:
# Sanity check: the number of distinct index values should match the number
# of distinct cruise lines.
group_Cruise_line_indexed = df2.groupBy("Cruise_line_number").count()
cruise_line_index_total = group_Cruise_line_indexed.agg({'Cruise_line_number': 'Count'})
cruise_line_index_total.show()

+-------------------------+
|count(Cruise_line_number)|
+-------------------------+
|                       20|
+-------------------------+



In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# Predictor columns packed into the single vector column the regressor reads.
# 'crew' is deliberately absent: it is the label being predicted.
feature_columns = [
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "Cruise_line_number",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
# Add the assembled 'features' vector column to the indexed data.
df3 = assembler.transform(df2)

In [None]:
# Preview the feature vectors next to the 'crew' values they will predict.
df3.select('features', 'crew').show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [None]:
# Keep only what the model needs: the feature vector and the 'crew' label.
df_final = df3.select('features', 'crew')

In [None]:
# Split roughly 70% / 30% into training and held-out test data.
# A fixed seed keeps the split (and therefore every metric computed below)
# repeatable across "Restart & Run All"; without it each run samples a
# different split. Metrics captured from earlier unseeded runs will differ.
train_data, test_data = df_final.randomSplit([0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Linear regression predicting 'crew' from the assembled 'features' vector.
# featuresCol is spelled out for readability; 'features' is also the default.
lr = LinearRegression(featuresCol='features', labelCol='crew')

In [None]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [None]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the residuals and error metrics inspected below.
test_results = lr_model.evaluate(test_data)

In [None]:
# Per-row residuals on the test split (actual crew minus predicted crew).
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| 0.37318926575133027|
|  1.8576930828460005|
| -0.7315006153829602|
| 0.48901318228686286|
| 0.09494131776075498|
|  0.9001709838415124|
| -0.5938143277508452|
|  1.1204247691339368|
|-0.00352002163126...|
|-0.17067038344779117|
|   2.060334550557487|
|-0.21404498227257207|
|  0.9087840486075827|
| -0.9443759918752477|
|  0.4697212928768616|
|-0.26341288566246757|
|  0.8263333842975022|
|  -1.229560674842956|
|   0.261279034284021|
|  -0.452316834697136|
+--------------------+
only showing top 20 rows



In [None]:
# Report the test-set error metrics, one "<name>: <value>" line each.
metric_lines = (
    ("RMSE", test_results.rootMeanSquaredError),
    ("MSE", test_results.meanSquaredError),
    ("R2", test_results.r2),
)
for metric_name, metric_value in metric_lines:
    print("{}: {}".format(metric_name, metric_value))

RMSE: 0.7512827679788865
MSE: 0.5644257974620174
R2: 0.9566660117142847


In [None]:
# Drop the label so the test features stand in for new, unlabeled data.
new_data = test_data.select('features')

In [None]:
# Apply the fitted model; the result carries a 'prediction' column alongside 'features'.
predictions = lr_model.transform(new_data)

In [None]:
# Preview the predicted crew values for the unlabeled feature vectors.
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|[5.0,115.0,35.74,...| 11.826810734248669|
|[5.0,122.0,28.5,1...|     4.842306917154|
|[6.0,30.276999999...|   4.28150061538296|
|[6.0,93.0,23.94,9...| 10.600986817713137|
|[7.0,158.0,43.7,1...| 13.505058682239245|
|[8.0,91.0,22.44,9...| 10.099829016158488|
|[9.0,85.0,19.68,9...|  9.283814327750845|
|[9.0,113.0,26.74,...| 11.259575230866064|
|[9.0,116.0,26.0,9...|  11.00352002163127|
|[10.0,68.0,10.8,7...| 6.5306703834477915|
|[10.0,151.4,26.2,...| 10.469665449442513|
|[11.0,90.09,25.01...|  8.694044982272572|
|[11.0,91.0,20.32,...|  9.081215951392418|
|[11.0,138.0,31.14...| 12.794375991875247|
|[12.0,2.329,0.94,...|0.13027870712313838|
|[12.0,50.0,7.0,7....|  4.713412885662468|
|[12.0,88.5,21.24,...|  9.463666615702497|
|[12.0,88.5,21.24,...| 10.529560674842957|
|[12.0,108.865,27....| 10.738720965715979|
|[13.0,61.0,13.8,7...|  6.452316834697136|
+----------