In [0]:
#import Data
df1 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/komalpatil1820@gwmail.gwu.edu/cruise_ship_info.csv")
df1.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [0]:
df1.printSchema()


root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Tonnage: string (nullable = true)
 |-- passengers: string (nullable = true)
 |-- length: string (nullable = true)
 |-- cabins: string (nullable = true)
 |-- passenger_density: string (nullable = true)
 |-- crew: string (nullable = true)



In [0]:
df1.columns

Out[98]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [0]:
# File location and type
file_location = "dbfs:/FileStore/shared_uploads/komalpatil1820@gwmail.gwu.edu/cruise_ship_info.csv"
file_type = "csv"
 
# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","
 
# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)
 
display(df)

Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0
Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2
Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2
Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2
Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2
Freedom,Carnival,6,110.239,37.0,9.51,14.87,29.79,11.5


In [0]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [0]:
# Change the Column "Cruise_Line" to Categorical Variable
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='Cruise_line',outputCol='Category')
indexed=indexer.fit(df).transform(df)
indexed.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Category|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|    16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|    16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|     1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|     1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|     1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|     1.0|
|    Elation|   Carnival| 15|            70.36

In [0]:
indexed.columns

Out[102]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Category']

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
#Creating a column with all the features as a dense vector
assembler = VectorAssembler(inputCols=['Category', 'Age', 'Tonnage', 'passengers','length','cabins','passenger_density'], outputCol='features')

In [0]:
output = assembler.transform(indexed)
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Category|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|    16.0|[16.0,6.0,30.2769...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|    16.0|[16.0,6.0,30.2769...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|     1.0|[1.0,26.0,47.262,...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|     1.0|[1.0,11.0,110.0,2...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|     1.0|[1.0,1

In [0]:
final_data = output.select('features', 'crew')

final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



In [0]:
#Randomly dividing the data to Train and Test data, 70% and 30% repectively
train_data, test_data = final_data.randomSplit([0.7, 0.3])

print("Train data:", train_data.describe().show())
print("Test data:", test_data.describe().show())

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              114|
|   mean|7.904385964912283|
| stddev| 3.65303196536497|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+

Train data: None
+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                44|
|   mean| 7.508636363636364|
| stddev|3.1035123320080444|
|    min|               0.6|
|    max|             12.38|
+-------+------------------+

Test data: None


In [0]:
# Build linear regression model
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(labelCol='crew', featuresCol='features')
# Fit the model
lr_model = lr.fit(train_data)

In [0]:
# Cross Validation    
check_data = test_data.select('features')
pred = lr_model.transform(check_data)
pred.show(5)
pred.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[0.0,11.0,138.0,3...|12.905521778646586|
|[0.0,16.0,78.491,...| 8.222910194871663|
|[0.0,17.0,70.0,20...| 7.592958948966413|
|[0.0,18.0,70.0,18...| 7.930006059845462|
|[1.0,10.0,110.0,2...|12.150923479783906|
+--------------------+------------------+
only showing top 5 rows

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[0.0,11.0,138.0,3...|12.905521778646586|
|[0.0,16.0,78.491,...| 8.222910194871663|
|[0.0,17.0,70.0,20...| 7.592958948966413|
|[0.0,18.0,70.0,18...| 7.930006059845462|
|[1.0,10.0,110.0,2...|12.150923479783906|
|[2.0,9.0,113.0,26...|11.350433980753552|
|[2.0,9.0,116.0,26...|11.139072317061979|
|[2.0,10.0,91.6270...| 9.227582817710738|
|[2.0,15.0,108.806...|11.022898132697833|
|[2.0,18.0,77.499,...| 8.487478679061988|
|[2.0,29.0,44.348,...| 5.497139921166762|
|[3.0,16.

In [0]:
def modelsummary(model, param_names):
    import numpy as np
    print ("Note: the last rows are the information for Intercept")
    print ("##","-------------------------------------------------")
    print ("##","  Estimate   |   Std.Error | t Values  |  P-value")
    coef = np.append(list(model.coefficients), model.intercept)
    Summary=model.summary
    param_names.append('intercept')
 
    for i in range(len(Summary.pValues)):
        print ("##",'{:10.6f}'.format(coef[i]),\
        '{:14.6f}'.format(Summary.coefficientStandardErrors[i]),\
        '{:12.3f}'.format(Summary.tValues[i]),\
        '{:12.6f}'.format(Summary.pValues[i]), \
        param_names[i])
 
    print ("##",'---')
    print ("##","Mean squared error: % .6f" \
           % Summary.meanSquaredError, ", RMSE: % .6f" \
           % Summary.rootMeanSquaredError )
    print ("##","Multiple R-squared: %f" % Summary.r2, "," )
    print ("##","Multiple Adjusted R-squared: %f" % Summary.r2adj, ", \
            Total iterations: %i"% Summary.totalIterations)

In [0]:
print("Intercept :{}".format(lr_model.intercept))
print("Coefficients :{}".format(lr_model.coefficients))

Intercept :-1.3021999624381049
Coefficients :[0.06846019489552832,-0.015384534326733592,0.0047852716280720005,-0.14112226794713478,0.467217726228977,0.8679796404213674,-0.0038116482630337085]


In [0]:
print("Coefficients: {} Intercept: {}".format(lr_model.coefficients,lr_model.intercept))
param_names = ['Category', 'Age', 'Tonnage', 'passengers','length','cabins','passenger_density']
modelsummary(lr_model, param_names)

Coefficients: [0.06846019489552832,-0.015384534326733592,0.0047852716280720005,-0.14112226794713478,0.467217726228977,0.8679796404213674,-0.0038116482630337085] Intercept: -1.3021999624381049
Note: the last rows are the information for Intercept
## -------------------------------------------------
##   Estimate   |   Std.Error | t Values  |  P-value
##   0.068460       0.034571        1.980     0.050265 Category
##  -0.015385       0.020817       -0.739     0.461519 Age
##   0.004785       0.015611        0.307     0.759806 Tonnage
##  -0.141122       0.061670       -2.288     0.024101 passengers
##   0.467218       0.163422        2.859     0.005119 length
##   0.867980       0.107006        8.112     0.000000 cabins
##  -0.003812       0.022117       -0.172     0.863501 passenger_density
##  -1.302200       1.792135       -0.727     0.469061 intercept
## ---
## Mean squared error:  1.103090 , RMSE:  1.050281
## Multiple R-squared: 0.916607 ,
## Multiple Adjusted R-squared: 0.911100 ,

In [0]:
# Cross Validation
pred_test = lr_model.transform(train_data)
pred_test.show(5)
pred_test.show()

+--------------------+----+------------------+
|            features|crew|        prediction|
+--------------------+----+------------------+
|[0.0,4.0,220.0,54...|21.0|20.871096454452957|
|[0.0,5.0,160.0,36...|13.6|15.056961918321965|
|[0.0,6.0,158.0,43...|13.6|13.936526386016387|
|[0.0,7.0,158.0,43...|13.6|13.860403547279885|
|[0.0,9.0,90.09,25...|8.69| 9.314012650334503|
+--------------------+----+------------------+
only showing top 5 rows

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[0.0,4.0,220.0,54...| 21.0|20.871096454452957|
|[0.0,5.0,160.0,36...| 13.6|15.056961918321965|
|[0.0,6.0,158.0,43...| 13.6|13.936526386016387|
|[0.0,7.0,158.0,43...| 13.6|13.860403547279885|
|[0.0,9.0,90.09,25...| 8.69| 9.314012650334503|
|[0.0,10.0,90.09,2...| 8.58| 8.916717074222367|
|[0.0,10.0,138.0,3...|11.85| 12.92090631297332|
|[0.0,11.0,90.09,2...| 8.48| 8.901332539895632|
|[0.0,12.0,90.09,2...| 8

In [0]:
#Predictions on Test Data
 
pred_test = lr_model.transform(test_data)
pred_test.show(5)

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[0.0,11.0,138.0,3...|11.85|12.905521778646586|
|[0.0,16.0,78.491,...| 7.65| 8.222910194871663|
|[0.0,17.0,70.0,20...|  7.2| 7.592958948966413|
|[0.0,18.0,70.0,18...|  7.2| 7.930006059845462|
|[1.0,10.0,110.0,2...| 11.6|12.150923479783906|
+--------------------+-----+------------------+
only showing top 5 rows



In [0]:
#Printing the insights that we receive from the testing of the model
test_model = lr_model.evaluate(test_data)
print("R-Squared :{}".format(test_model.r2))
print("RMSE :{}".format(test_model.rootMeanSquaredError))

R-Squared :0.9614161409868212
RMSE :0.6026486610080325


In [0]:
#Evaluation of the Linear Regression Model
 
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="crew",
                                predictionCol="prediction",
                                metricName="rmse")
# metricName Supports: - "rmse" (default): root mean squared error - 
# "mse": mean squared error - 
# "r2": R^2^ metric - 
# "mae": mean absolute error
 
rmse = evaluator.evaluate(pred_test)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
 

Root Mean Squared Error (RMSE) on test data = 0.602649


In [0]:
from pyspark.sql.functions import corr

df.select(corr('Tonnage', 'crew')).show()
df.select(corr('Age', 'crew')).show()
df.select(corr('passengers', 'crew')).show()
df.select(corr('length', 'crew')).show()
df.select(corr('cabins', 'crew')).show()

+-------------------+
|corr(Tonnage, crew)|
+-------------------+
| 0.9275688115449388|
+-------------------+

+-------------------+
|    corr(Age, crew)|
+-------------------+
|-0.5306565039638852|
+-------------------+

+----------------------+
|corr(passengers, crew)|
+----------------------+
|    0.9152341306065384|
+----------------------+

+------------------+
|corr(length, crew)|
+------------------+
| 0.895856627101658|
+------------------+

+------------------+
|corr(cabins, crew)|
+------------------+
|0.9508226063578497|
+------------------+



In [0]:
#Bonus quetion:
#This presumption indicates that the ship's properties (such as its tonnage, passenger capacity, length, etc.) are all independent of one another in the context of this cruise crew prediction problem. Due to the assumption, there is no chance of overestimating or underestimating the effect of any feature on the outcome variable (staff size) as a result of feature correlations.
#The strong R-squared value of the model further implies that the selected variables are highly predictive of the crew size, demonstrating that they are pertinent and instructive in predicting the outcome variable. Also, the low RMSE value shows that the projected values of the model are reasonably near to the actual values, indicating that the model is capable of reliably estimating the crew size for new ships based on their specific characteristics.
#The final factor that might have affected the performance of the model is the algorithm (linear regression) and implementation (pyspark). For forecasting continuous target variables, linear regression is a straightforward but effective approach, and pyspark is a well-liked distributed computing system that can effectively handle huge datasets.
#In general, the model's strong performance may be attributed to the combination of pertinent and instructive features as well as the assumption of observational independence, which enables the model to precisely estimate each feature's impact on the outcome variable.