In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

Load the data

In [None]:
# Read the cruise-ship CSV into a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
df = spark.read.csv(
    "drive/MyDrive/Colab Notebooks/cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

Check the columns and their types

In [None]:
# Cruise_line will be used as a feature, but it is a string column, so it must
# first be converted to a numeric type (strings are not valid feature values).
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



To do that, I will use StringIndexer, which assigns a numeric index to each cruise line.

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode each cruise line as a numeric index so it can be used as a feature.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_Index")
# Drop any Cruise_Index left over from a previous run of this cell: fitting or
# transforming a frame that already contains the output column raises an error.
# DataFrame.drop is a no-op when the column is absent, so a first run is unaffected.
df_unindexed = df.drop("Cruise_Index")
df = indexer.fit(df_unindexed).transform(df_unindexed)

In [None]:
# The schema now contains a new numeric column, Cruise_Index.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- Cruise_Index: double (nullable = false)



In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# converted to pandas for a nicer table rendering.
df.describe().toPandas()

Unnamed: 0,summary,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew,Cruise_Index
0,count,158,158,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
1,mean,Infinity,,15.689873417721518,71.28467088607599,18.45740506329114,8.130632911392404,8.830000000000005,39.90094936708861,7.794177215189873,5.063291139240507
2,stddev,,,7.615691058751413,37.229540025907866,9.677094775143416,1.793473548054825,4.4714172221480615,8.63921711391542,3.503486564627034,4.758744608182735
3,min,Adventure,Azamara,4.0,2.329,0.66,2.79,0.33,17.7,0.59,0.0
4,max,Zuiderdam,Windstar,48.0,220.0,54.0,11.82,27.0,71.43,21.0,19.0


In [None]:
# List all column names to pick the feature columns from.
df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_Index']

To make the predictions, I will use the columns Age, Tonnage, passengers, length, cabins, passenger_density and Cruise_Index as features.

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# With assembler we can join all columns in one.
assembler = VectorAssembler(inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins', 
                                       'passenger_density', 'Cruise_Index'], outputCol='features', 
                            handleInvalid='skip')
output = assembler.transform(df)

In [None]:
output.show(10) # Check that the last column is the new one holding the data we are interested in

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_Index|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|        16.0|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|        16.0|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|         1.0|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|         1.0|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|          

In [None]:
# Number of ships belonging to each cruise line.
df.groupBy('Cruise_line').count().toPandas()

Unnamed: 0,Cruise_line,count
0,Costa,11
1,P&O,6
2,Cunard,3
3,Regent_Seven_Seas,5
4,MSC,8
5,Carnival,22
6,Crystal,2
7,Orient,1
8,Princess,17
9,Silversea,4


In [None]:
# Keep just the model inputs: the assembled feature vector and the target (crew).
final_data = output.select('features', 'crew')
# Preview as a pandas frame: each feature vector next to the crew value to predict.
final_data.toPandas()

Unnamed: 0,features,crew
0,"[6.0, 30.276999999999997, 6.94, 5.94, 3.55, 42...",3.55
1,"[6.0, 30.276999999999997, 6.94, 5.94, 3.55, 42...",3.55
2,"[26.0, 47.262, 14.86, 7.22, 7.43, 31.8, 1.0]",6.70
3,"[11.0, 110.0, 29.74, 9.53, 14.88, 36.99, 1.0]",19.10
4,"[17.0, 101.353, 26.42, 8.92, 13.21, 38.36, 1.0]",10.00
...,...,...
153,"[22.0, 3.341, 0.66, 2.79, 0.33, 50.62, 9.0]",0.59
154,"[14.0, 76.8, 19.6, 8.79, 9.67, 39.18, 9.0]",12.00
155,"[25.0, 5.35, 1.58, 4.4, 0.74, 33.86, 15.0]",0.88
156,"[27.0, 5.35, 1.67, 4.4, 0.74, 32.04, 15.0]",0.88


As in any supervised machine learning workflow, we need to split our dataset into a training set and a test set.

In [None]:
# 75% / 25% split into training and test sets, with a fixed seed.
train_data, test_data = final_data.randomSplit(weights=[0.75, 0.25], seed=43)

Check the distribution of our data: 75% for train set, 25% for test set.

In [None]:
# Crew statistics over the full dataset, as a baseline for comparing the two splits.
final_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [None]:
# Crew statistics over the training split.
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               119|
|   mean| 7.809747899159674|
| stddev|3.3790961237988872|
|    min|               0.6|
|    max|              19.1|
+-------+------------------+



In [None]:
# Crew statistics over the test split.
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                39|
|   mean| 7.746666666666667|
| stddev|3.9054902875266393|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



Let's create our Linear Regression model:

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# 'features' is already the default input column, so naming it is optional; it
# is spelled out here for clarity. 'crew' is the column we want to predict.
lr = LinearRegression(featuresCol='features', labelCol='crew')

In [None]:
# Fit the linear regression on the training split only.
lr_model = lr.fit(train_data)

In [None]:
# Evaluate the fitted model on the held-out test split.
test_results = lr_model.evaluate(test_data)
test_results.residuals.show() # Residuals: difference between observed and predicted crew

+--------------------+
|           residuals|
+--------------------+
|0.038051460486979494|
| -1.5369843545120911|
| -1.0324781643054175|
| -1.0324781643054175|
| -1.4192667734414997|
|  0.7875724902029901|
| 0.34574838288981624|
| -0.6882628279703322|
|  0.9873848553712588|
| -0.3219109578666446|
| -0.5934259698637785|
|-0.24583239210983088|
| -0.5621196693065063|
|  0.8677299690271632|
|-0.22487241054968266|
|-0.06040528816903645|
| -0.5470282170814862|
| -1.1381609054175232|
|-0.18129588879307157|
|  0.7198523547263793|
+--------------------+
only showing top 20 rows



In [None]:
# Root mean squared error of the predictions on the test split.
test_results.rootMeanSquaredError 

0.6822837126517787

In [None]:
# R² on the test split is very high (about 0.97), which indicates that the
# model fits the provided data well.
test_results.r2

0.968677249278543

Let's predict:

In [None]:
# Append a 'prediction' column to the test split, next to the true crew value.
lr_model.transform(test_data).show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[4.0,220.0,54.0,1...| 21.0| 20.96194853951302|
|[5.0,160.0,36.34,...| 13.6| 15.13698435451209|
|[6.0,30.276999999...| 3.55| 4.582478164305417|
|[6.0,30.276999999...| 3.55| 4.582478164305417|
|[7.0,89.6,25.5,9....| 9.87|11.289266773441499|
|[8.0,91.0,22.44,9...| 11.0| 10.21242750979701|
|[9.0,81.0,21.44,9...| 10.0| 9.654251617110184|
|[9.0,105.0,27.2,8...|10.68|11.368262827970332|
|[9.0,113.0,26.74,...|12.38|11.392615144628742|
|[10.0,68.0,10.8,7...| 6.36| 6.681910957866645|
|[10.0,86.0,21.14,...|  9.2| 9.793425969863778|
|[10.0,91.62700000...|  9.0|  9.24583239210983|
|[10.0,110.0,29.74...| 11.6|12.162119669306506|
|[11.0,90.0,22.4,9...| 11.0|10.132270030972837|
|[11.0,91.62700000...|  9.0| 9.224872410549683|
|[12.0,50.0,7.0,7....| 4.45| 4.510405288169037|
|[12.0,58.6,15.66,...|  7.0| 7.547028217081486|
|[12.0,88.5,21.24,...|  9.3|10.438160905

We can also predict without the crew column in this case.

In [None]:
# Keep only the feature vectors, discarding the crew column.
unlabeled_data = test_data.select('features')

In [None]:
# Predict from the feature vectors alone.
predictions = lr_model.transform(unlabeled_data)
predictions.show() # Same predictions as when the crew column was present

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[4.0,220.0,54.0,1...| 20.96194853951302|
|[5.0,160.0,36.34,...| 15.13698435451209|
|[6.0,30.276999999...| 4.582478164305417|
|[6.0,30.276999999...| 4.582478164305417|
|[7.0,89.6,25.5,9....|11.289266773441499|
|[8.0,91.0,22.44,9...| 10.21242750979701|
|[9.0,81.0,21.44,9...| 9.654251617110184|
|[9.0,105.0,27.2,8...|11.368262827970332|
|[9.0,113.0,26.74,...|11.392615144628742|
|[10.0,68.0,10.8,7...| 6.681910957866645|
|[10.0,86.0,21.14,...| 9.793425969863778|
|[10.0,91.62700000...|  9.24583239210983|
|[10.0,110.0,29.74...|12.162119669306506|
|[11.0,90.0,22.4,9...|10.132270030972837|
|[11.0,91.62700000...| 9.224872410549683|
|[12.0,50.0,7.0,7....| 4.510405288169037|
|[12.0,58.6,15.66,...| 7.547028217081486|
|[12.0,88.5,21.24,...|10.438160905417524|
|[12.0,90.09,25.01...| 8.861295888793071|
|[12.0,91.0,20.32,...| 9.270147645273621|
+--------------------+------------

In [None]:
# Show only the predicted crew values, as a pandas DataFrame.
# (A stray "." that followed this expression made the cell a syntax error.)
predictions.select('prediction').toPandas()

Unnamed: 0,prediction
0,20.961949
1,15.136984
2,4.582478
3,4.582478
4,11.289267
5,10.212428
6,9.654252
7,11.368263
8,11.392615
9,6.681911
