In [1]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.ml.regression import LinearRegression

## Set seed
seed = 42

In [2]:
## Create Spark Session
spark = SparkSession.builder.appName('lrConsProject').getOrCreate()

In [3]:
## Setup Schema
schema = StructType(fields=[StructField('ship_name', StringType(), True),
                            StructField('cruise_line', StringType(), True),
                            StructField('age', IntegerType(), True),
                            StructField('tonnage', DoubleType(), True),
                            StructField('passengers', DoubleType(), True),
                            StructField('length', DoubleType(), True),
                            StructField('cabins', DoubleType(), True),
                            StructField('passenger_density', DoubleType(), True),
                            StructField('crew', DoubleType(), True)])

In [4]:
## Load Data
df = spark.read.csv('gs://spark-training-data/datasets/cruise_ship_info.csv', inferSchema=False,
                    header=True, schema=schema)
df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  ship_name|cruise_line|age|           tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



                                                                                

In [5]:
## Confirm Proper Schema & Cols
df.printSchema()
df.columns

root
 |-- ship_name: string (nullable = true)
 |-- cruise_line: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



['ship_name',
 'cruise_line',
 'age',
 'tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [6]:
## Convert cruise_line to indexed value using StringIndexer
indexer = StringIndexer(inputCol='cruise_line', outputCol='cruise_line_index')
df_indexed = indexer.fit(df).transform(df)
df_indexed.show(5)

                                                                                

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|  ship_name|cruise_line|age|           tonnage|passengers|length|cabins|passenger_density|crew|cruise_line_index|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|              1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|              1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|              1.0|
+-----------+-----------+---+------------------+----------+------+------+-------

In [7]:
## Create Vector Assembler & transform data for modeling
assembler_all = VectorAssembler(inputCols=['age','tonnage','passengers','length','cabins',
                                           'passenger_density','cruise_line_index'], outputCol='features')
output_features_all = assembler_all.transform(df_indexed)
output_features_all.head(1) # Vector of numerical values we will be modeling on - Spark expects this

                                                                                

[Row(ship_name='Journey', cruise_line='Azamara', age=6, tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_line_index=16.0, features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))]

In [8]:
## Prep data - Grab only features and target column
final_data_all = output_features_all.select(['features', 'crew'])
final_data_all.show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
+--------------------+----+
only showing top 5 rows



In [9]:
## Split data into train / test
train_data_all, test_data_all = final_data_all.randomSplit([0.7, 0.3], seed=seed)
train_data_all.describe().show()
test_data_all.describe().show()

                                                                                

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               110|
|   mean| 7.538818181818191|
| stddev|3.7889277929052527|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                48|
|   mean| 8.379375000000001|
| stddev|2.6843584805550207|
|    min|              3.55|
|    max|              13.6|
+-------+------------------+



In [10]:
## Build linear regression model
lr = LinearRegression(labelCol='crew', featuresCol='features', predictionCol='prediction')
lr_model_all = lr.fit(train_data_all)

21/11/23 15:44:15 WARN org.apache.spark.ml.util.Instrumentation: [295be801] regParam is zero, which might cause numerical instability and overfitting.
21/11/23 15:44:16 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/11/23 15:44:16 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
21/11/23 15:44:16 WARN com.github.fommil.netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
21/11/23 15:44:16 WARN com.github.fommil.netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
                                                                                

In [11]:
## Evaluate Model Accuracy
train_results_all = lr_model_all.evaluate(train_data_all)
test_results_all = lr_model_all.evaluate(test_data_all)

In [12]:
## Show Model Values
print(f'Train R^2: {train_results_all.r2} & R^2 Adj.: {train_results_all.r2adj}')
print(f'Test R^2: {test_results_all.r2} & R^2 Adj.: {test_results_all.r2adj}')

Train R^2: 0.9253832482880808 & R^2 Adj.: 0.920262490817655
Test R^2: 0.9241500945990748 & R^2 Adj.: 0.9108763611539129


In [13]:
## Review p-values of columns
## Removing age, tonnage, passenger_density since not significant
for each in zip(lr_model_all.summary.pValues, [val <= 0.05 for val in lr_model_all.summary.pValues]):
    print(each)

(0.31699735845888855, False)
(0.7039341525637155, False)
(0.010249985527425176, True)
(0.0015243196734100461, True)
(5.691003224228552e-13, True)
(0.6413383982278091, False)
(0.042687684449079555, True)
(0.4240276781005119, False)


In [14]:
## Reduced Model
## Create Vector Assembler & transform data for modeling
assembler_reduced = VectorAssembler(inputCols=['passengers','length','cabins','cruise_line_index'], outputCol='features')
output_features_reduced = assembler_reduced.transform(df_indexed)
output_features_reduced.head(1) # Vector of numerical values we will be modeling on - Spark expects this

[Row(ship_name='Journey', cruise_line='Azamara', age=6, tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_line_index=16.0, features=DenseVector([6.94, 5.94, 3.55, 16.0]))]

In [15]:
## Prep data - Grab only features and target column
final_data_reduced = output_features_reduced.select(['features', 'crew'])
final_data_reduced.show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.94,5.94,3.55,1...|3.55|
|[6.94,5.94,3.55,1...|3.55|
|[14.86,7.22,7.43,...| 6.7|
|[29.74,9.53,14.88...|19.1|
|[26.42,8.92,13.21...|10.0|
+--------------------+----+
only showing top 5 rows



In [16]:
## Split data into train / test
train_data_reduced, test_data_reduced = final_data_reduced.randomSplit([0.7, 0.3], seed=seed)
train_data_reduced.describe().show()
test_data_reduced.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               110|
|   mean| 8.252818181818185|
| stddev|3.4895135519186895|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+

+-------+----------------+
|summary|            crew|
+-------+----------------+
|  count|              48|
|   mean|        6.743125|
| stddev|3.33791255717727|
|    min|             0.6|
|    max|            13.6|
+-------+----------------+



In [17]:
## Build linear regression model
lr_reduced = LinearRegression(labelCol='crew', featuresCol='features', predictionCol='prediction')
lr_model_reduced = lr_reduced.fit(train_data_reduced)

21/11/23 15:56:30 WARN org.apache.spark.ml.util.Instrumentation: [389a117c] regParam is zero, which might cause numerical instability and overfitting.


In [18]:
## Evaluate Model Accuracy
train_results_reduced = lr_model_reduced.evaluate(train_data_reduced)
test_results_reduced = lr_model_reduced.evaluate(test_data_reduced)

In [19]:
## Show Model Values
print(f'Train R^2: {train_results_reduced.r2} & R^2 Adj.: {train_results_reduced.r2adj}')
print(f'Test R^2: {test_results_reduced.r2} & R^2 Adj.: {test_results_reduced.r2adj}')

Train R^2: 0.9159627477499088 & R^2 Adj.: 0.912761328616572
Test R^2: 0.9401543493307445 & R^2 Adj.: 0.9345873120591859


In [20]:
## Review p-values of columns
## Removing age, tonnage, passenger_density since not significant
for each in zip(lr_model_reduced.summary.pValues, [val <= 0.05 for val in lr_model_reduced.summary.pValues]):
    print(each)

(0.0030732436723790446, True)
(5.921468932501561e-05, True)
(1.354472090042691e-14, True)
(0.034170285859477234, True)
(0.006256788700951699, True)
