# Consulting Project: Cruise Ship Crew Estimations
Estimate how many crew members a Hyundai Heavy Industries ship will require, taking into account the minimum crew size that each cruise line treats as acceptable.

In [1]:
#libraries
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session named "Cruise" (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("Cruise")
    .getOrCreate()
)

In [3]:
# Load the cruise ship CSV: first row is the header, column types are inferred
cruiseData = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("cruise_ship_info.csv")
)

In [4]:
# Peek at the first five rows, each preceded by two blank lines
for row in cruiseData.head(5):
    print("\n\n" + str(row))



Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7)


Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1)


Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0)


In [5]:
# Print each column's name, inferred type and nullability as a tree
cruiseData.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [6]:
# Find the smallest tonnage and smallest crew size within each cruise line.
# The dict form of agg() names each aggregate by string, so no function import
# is needed. Deliberately NOT doing `from pyspark.sql.functions import min, max`:
# those names would shadow Python's built-in min()/max() for every later cell.
lines = cruiseData.groupBy("Cruise_line")
lines.agg({"Tonnage": "min", "crew": "min"}).show()

+-----------------+------------------+---------+
|      Cruise_line|      min(Tonnage)|min(crew)|
+-----------------+------------------+---------+
|            Costa|              25.0|     3.85|
|              P&O|              45.0|      5.2|
|           Cunard|            70.327|      9.0|
|Regent_Seven_Seas|              12.5|     1.46|
|              MSC|            16.852|     2.97|
|         Carnival|            46.052|      6.6|
|          Crystal|            51.004|     5.45|
|           Orient|             22.08|      3.5|
|         Princess|30.276999999999997|     3.73|
|        Silversea|              16.8|     1.97|
|         Seabourn|              10.0|      1.6|
| Holland_American|             33.92|      4.6|
|         Windstar|              5.35|     0.88|
|           Disney|              83.0|     9.45|
|        Norwegian|              28.0|      3.8|
|          Oceania|30.276999999999997|      4.0|
|          Azamara|30.276999999999997|     3.55|
|        Celebrity| 

In [7]:
#Set-up the Linear Regression
from pyspark.ml.feature import StringIndexer

In [8]:
# Encode each cruise line name as a numeric index in a new "clIndex" column
indexer = StringIndexer(inputCol="Cruise_line", outputCol="clIndex")
indexer_model = indexer.fit(cruiseData)
indexed = indexer_model.transform(cruiseData)

# Display the indexed frame to confirm the new column
indexed.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|clIndex|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|   16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|   16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|    1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|    1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|    1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|    1.0|
|    Elation|   Carnival| 15|            70.367|     20

In [9]:
# Pack the predictor columns into the single "features" vector column
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# NOTE(review): the "clIndex" column built from Cruise_line is not listed here,
# so the cruise line does not enter the model - confirm this is intentional.
feature_cols = ["cabins", "passenger_density", "passengers"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
output = assembler.transform(indexed)

In [10]:
# Keep only what the regression needs: the feature vector and the "crew" label
cruise_final = output.select(["features", "crew"])

In [11]:
# Train-test split: roughly 70% of rows for training, 30% held out for testing.
# A fixed seed keeps the split - and therefore the fitted coefficients and the
# reported metrics - reproducible across Restart & Run All; without it every
# run samples a different split.
train_data, test_data = cruise_final.randomSplit([.7, .3], seed=42)

In [12]:
# Fit a linear regression that predicts "crew" from the assembled feature vector
from pyspark.ml.regression import LinearRegression

LinReg = LinearRegression(
    featuresCol="features",  # the library default, spelled out for the reader
    labelCol="crew",
)
model = LinReg.fit(train_data)

# Coefficients follow the order of the assembler's input columns
print("Coeff: ", model.coefficients)
print("y-Intercept: ", model.intercept)

Coeff:  [0.871458810877878,0.03595689225815899,-0.0560746679363631]
y-Intercept:  -0.3604215913717642


In [13]:
# Score the fitted model on the held-out test rows: RMSE and R-squared
test_results = model.evaluate(test_data)
rmse, r2 = test_results.rootMeanSquaredError, test_results.r2
print("RMSE: ", rmse)
print("r2 :", r2)

RMSE:  1.0269055637529605
r2 : 0.9024514938696815


In [14]:
# For reference: summary statistics (count, mean, stddev, min, max) of the
# training split, to put the RMSE above in context of the typical crew value
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               105|
|   mean| 7.855142857142858|
| stddev|3.6068455518866944|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [15]:
# Check how strongly the number of cabins and the crew size are correlated
from pyspark.sql.functions import corr

cabins_vs_crew = corr("cabins", "crew")
indexed.select(cabins_vs_crew).show()

+------------------+
|corr(cabins, crew)|
+------------------+
|0.9508226063578497|
+------------------+



## Finding
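So, based on this model (test R² ≈ 0.90, RMSE ≈ 1.03 in the run shown above), cabins and passenger density drive how many crew members beyond the minimum each line should have. Cabins is by far the dominant factor: its coefficient is ≈ 0.87 and its correlation with crew is ≈ 0.95. Passenger density adds a small positive effect (coefficient ≈ 0.04), while passengers has a slightly negative coefficient (≈ −0.06). Note that the features are not standardized, so coefficient sizes are only a rough guide to relative importance.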