<a href="https://colab.research.google.com/github/immik26/Model-cruise-data/blob/main/cruise_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
!pip install pyarrow



In [137]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz
!tar xf spark-3.3.3-bin-hadoop3.tgz
!pip install -q findspark

In [138]:
!ls /usr/lib/jvm/

java-1.11.0-openjdk-amd64  java-1.8.0-openjdk-amd64
java-11-openjdk-amd64	   java-8-openjdk-amd64


In [139]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.3-bin-hadoop3"

In [140]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [141]:
from pyspark.sql.functions import *

In [142]:
import pyspark

In [143]:
df=spark.read.csv("/content/cruise_ship_info (1).csv",inferSchema=True,header=True)

In [144]:
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [145]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [146]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [147]:
feat = VectorAssembler(inputCols=["Age","Tonnage","passengers","length","cabins","passenger_density"],outputCol="Member")

In [148]:
output=feat.transform(df)

In [149]:
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|              Member|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|[17.0,101.353,26....|
|    Ecstasy|   Carnival| 22|            70.367|     20.

In [150]:
final_df=output.select(["Member","crew"])

In [151]:
final_df.show()

+--------------------+----+
|              Member|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [152]:
train,test=final_df.randomSplit([0.79,0.21])

In [153]:
train.show()

+--------------------+-----+
|              Member| crew|
+--------------------+-----+
|[4.0,220.0,54.0,1...| 21.0|
|[5.0,115.0,35.74,...| 12.2|
|[5.0,122.0,28.5,1...|  6.7|
|[5.0,133.5,39.59,...|13.13|
|[5.0,160.0,36.34,...| 13.6|
|[6.0,30.276999999...| 3.55|
|[6.0,30.276999999...| 3.55|
|[6.0,90.0,20.0,9....|  9.0|
|[6.0,93.0,23.94,9...|11.09|
|[6.0,110.23899999...| 11.5|
|[6.0,112.0,38.0,9...| 10.9|
|[6.0,113.0,37.82,...| 12.0|
|[6.0,158.0,43.7,1...| 13.6|
|[7.0,89.6,25.5,9....| 9.87|
|[7.0,116.0,31.0,9...| 12.0|
|[7.0,158.0,43.7,1...| 13.6|
|[8.0,91.0,22.44,9...| 11.0|
|[8.0,110.0,29.74,...| 11.6|
|[9.0,59.058,17.0,...|  7.4|
|[9.0,85.0,19.68,9...| 8.69|
+--------------------+-----+
only showing top 20 rows



In [154]:
test.show()

+--------------------+----+
|              Member|crew|
+--------------------+----+
|[5.0,86.0,21.04,9...| 8.0|
|[8.0,77.499,19.5,...| 9.0|
|[9.0,81.0,21.44,9...|10.0|
|[9.0,110.0,29.74,...|11.6|
|[10.0,58.825,15.6...| 7.0|
|[10.0,77.0,20.16,...| 9.0|
|[10.0,81.76899999...|8.42|
|[11.0,86.0,21.24,...| 9.3|
|[11.0,90.0,22.4,9...|11.0|
|[11.0,91.62700000...| 9.0|
|[11.0,110.0,29.74...|19.1|
|[12.0,42.0,14.8,7...| 6.8|
|[12.0,58.6,15.66,...| 7.0|
|[13.0,61.0,13.8,7...| 6.0|
|[14.0,30.27699999...|3.73|
|[15.0,83.338,17.5...|9.45|
|[17.0,70.367,20.5...| 9.2|
|[17.0,74.137,19.5...| 7.6|
|[17.0,75.166,19.2...|7.66|
|[18.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [155]:
from pyspark.ml.regression import LinearRegression

In [156]:
reg = LinearRegression(featuresCol="Member",labelCol="crew")

In [157]:
reg=reg.fit(train)

In [158]:
reg.coefficients

DenseVector([-0.0157, 0.0137, -0.14, 0.4593, 0.7399, -0.0039])

In [159]:
reg.intercept

-0.48338745676169725

In [160]:
train_sum=reg.summary

In [161]:
train_sum.rootMeanSquaredError

0.809913131210546

In [162]:
train_sum.r2

0.9496410327496956

In [163]:
train_sum.r2adj

0.9471230843871804

In [164]:
pred=reg.evaluate(test)

In [165]:
pred.predictions.show()

+--------------------+----+------------------+
|              Member|crew|        prediction|
+--------------------+----+------------------+
|[5.0,86.0,21.04,9...| 8.0| 9.366812608977343|
|[8.0,77.499,19.5,...| 9.0| 8.708609185245153|
|[9.0,81.0,21.44,9...|10.0| 9.493096226418313|
|[9.0,110.0,29.74,...|11.6|11.943041389746242|
|[10.0,58.825,15.6...| 7.0| 7.270983924302795|
|[10.0,77.0,20.16,...| 9.0|  8.58416172665555|
|[10.0,81.76899999...|8.42| 8.956380124759638|
|[11.0,86.0,21.24,...| 9.3| 9.666287939514127|
|[11.0,90.0,22.4,9...|11.0| 9.998037787441222|
|[11.0,91.62700000...| 9.0| 9.379456547687653|
|[11.0,110.0,29.74...|19.1| 11.92370294377876|
|[12.0,42.0,14.8,7...| 6.8| 6.468445090357514|
|[12.0,58.6,15.66,...| 7.0| 7.367096628522447|
|[13.0,61.0,13.8,7...| 6.0| 6.712931561296372|
|[14.0,30.27699999...|3.73|3.8455347845581187|
|[15.0,83.338,17.5...|9.45| 8.683836526365901|
|[17.0,70.367,20.5...| 9.2| 8.677251329472472|
|[17.0,74.137,19.5...| 7.6| 8.804077578886552|
|[17.0,75.166

In [166]:
from pyspark.ml.evaluation import RegressionEvaluator

In [167]:
pred_eval=RegressionEvaluator(predictionCol="prediction",labelCol="crew",metricName="r2")

In [168]:
pred_eval.evaluate(pred.predictions)

0.766077494617274

In [169]:
from pyspark.ml.regression import DecisionTreeRegressor

In [170]:
dt = DecisionTreeRegressor(featuresCol="Member",labelCol="crew")

In [171]:
dt=dt.fit(train)

In [172]:
dtpred_test = dt.transform(test)

In [173]:
dtpred.show()

+--------------------+-----+------------------+
|              Member| crew|        prediction|
+--------------------+-----+------------------+
|[4.0,220.0,54.0,1...| 21.0|              21.0|
|[5.0,86.0,21.04,9...|  8.0|  9.50409090909091|
|[5.0,115.0,35.74,...| 12.2|11.993333333333334|
|[5.0,122.0,28.5,1...|  6.7|6.9750000000000005|
|[5.0,133.5,39.59,...|13.13|12.521249999999998|
|[5.0,160.0,36.34,...| 13.6|12.521249999999998|
|[6.0,30.276999999...| 3.55| 3.663333333333333|
|[6.0,30.276999999...| 3.55| 3.663333333333333|
|[6.0,90.0,20.0,9....|  9.0|  9.50409090909091|
|[6.0,93.0,23.94,9...|11.09|10.823333333333332|
|[6.0,112.0,38.0,9...| 10.9|11.466666666666669|
|[6.0,113.0,37.82,...| 12.0|11.993333333333334|
|[6.0,158.0,43.7,1...| 13.6|12.521249999999998|
|[7.0,89.6,25.5,9....| 9.87|  9.50409090909091|
|[7.0,116.0,31.0,9...| 12.0|11.993333333333334|
|[8.0,91.0,22.44,9...| 11.0|10.823333333333332|
|[8.0,110.0,29.74,...| 11.6|11.466666666666669|
|[9.0,59.058,17.0,...|  7.4|6.9750000000

In [174]:
dtpred_test= dt.transform(train)

In [175]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(dtpred)
print ("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

Root Mean Squared Error (RMSE) on train data = 0.616493


In [176]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(dtpred)
print ("r2 on train data = %g" % r2)

r2 on train data = 0.97118


In [177]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(dtpred_test)
print ("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.575119


In [178]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(dtpred_test)
print ("r2 on test data = %g" % r2)

r2 on test data = 0.974607


In [179]:
from pyspark.ml.regression import RandomForestRegressor

In [180]:
rf = RandomForestRegressor(featuresCol="Member",labelCol="crew" )

In [181]:
rfmodel = rf.fit(train)

In [182]:
predictions = rfmodel.transform(test)

In [183]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print ("r2 on test data = %g" % r2)

r2 on test data = 0.742583


In [184]:
predictions_train = rfmodel.transform(train)

In [185]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions_train)
print ("r2 on train data = %g" % r2)

r2 on train data = 0.964053
