In [1]:
import os

import findspark

# findspark must run before pyspark is imported so the package can be found.
# Prefer the SPARK_HOME environment variable so the notebook is not tied to
# one machine; fall back to the original local install path when it is unset.
SPARK_HOME = os.environ.get("SPARK_HOME", "/home/jean/spark-2.4.4-bin-hadoop2.7")
findspark.init(SPARK_HOME)

from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.functions import corr


# Obtain the session used by every cell below (an existing one is reused).
spark = SparkSession.builder.appName('linear regression').getOrCreate()

In [2]:
# Load the cruise-ship CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("cruise_ship_info.csv")
)

In [3]:
# Print the column names and inferred types to check that the CSV parsed as expected.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [4]:
# Correlation between ship length and crew size.
df.select(corr("length", "crew")).show()

+------------------+
|corr(length, crew)|
+------------------+
| 0.895856627101658|
+------------------+



In [5]:
# Correlation between tonnage and crew size.
df.select(corr("Tonnage", "crew")).show()

+-------------------+
|corr(Tonnage, crew)|
+-------------------+
| 0.9275688115449388|
+-------------------+



In [6]:
# Correlation between passenger count and crew size.
df.select(corr("passengers", "crew")).show()

+----------------------+
|corr(passengers, crew)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [7]:
# Correlation between cabin count and crew size.
df.select(corr("cabins", "crew")).show()

+------------------+
|corr(cabins, crew)|
+------------------+
|0.9508226063578497|
+------------------+



In [8]:
# Correlation between passenger density and crew size.
df.select(corr("passenger_density", "crew")).show()

+-----------------------------+
|corr(passenger_density, crew)|
+-----------------------------+
|         -0.15550928421699717|
+-----------------------------+



In [9]:
# Encode each cruise line as a numeric index in a new "cruise_cat" column.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")

# Make the cell safe to re-run: StringIndexer rejects an output column that
# already exists, and this cell rebinds `df`, so a second execution would
# otherwise fail. drop() is a no-op when "cruise_cat" is not present yet.
cruise_input = df.drop("cruise_cat")
df = indexer.fit(cruise_input).transform(cruise_input)

In [10]:
# Peek at one row to confirm the new "cruise_cat" column was appended.
df.show(1)

+---------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|  Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
only showing top 1 row



In [11]:
# Columns combined, in this order, into the single "features" vector.
# "cruise_cat" is the StringIndexer index of the cruise line and enters the
# vector as a plain number.
feature_columns = [
    "Tonnage",
    "passenger_density",
    "cabins",
    "passengers",
    "length",
    "cruise_cat",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [12]:
# Append the assembled "features" vector column to the indexed DataFrame.
output = assembler.transform(df)

In [13]:
# Inspect the assembled rows; the "features" column holds the combined inputs.
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[30.2769999999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[30.2769999999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|[47.262,31.8,7.43...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|[110.0,36.99,14.8...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0| 

In [14]:
# Keep only what the regressor needs: the feature vector and the "crew" label.
model_columns = ["features", "crew"]
data = output.select(model_columns)

In [15]:
# Seed the 70/30 train/test split so the split, and every metric derived
# from it, is repeatable across runs instead of changing on each execution.
SPLIT_SEED = 42
train_data, test_data = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [16]:
# Sanity-check the training split: each row is a feature vector plus its crew label.
train_data.show(2)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[2.329,24.78,0.45...| 0.6|
|[3.341,50.62,0.33...|0.59|
+--------------------+----+
only showing top 2 rows



In [17]:
# Linear regression estimator predicting "crew" from the assembled "features"
# column (named explicitly here; it is also the estimator's default).
lr = LinearRegression(featuresCol="features", labelCol="crew")

In [18]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [19]:
# Evaluate the fitted model on the held-out test split.
test_results = lr_model.evaluate(test_data)

In [20]:
# R² of the model on the test split.
test_results.r2

0.9664847178009763

In [21]:
# Summary statistics of the training split; gives the scale of "crew"
# (mean, stddev, range) against which the model's error can be judged.
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              123|
|   mean| 8.17268292682927|
| stddev|3.442671254397327|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [22]:
# R² on the test split.
# NOTE(review): this is the same expression as the earlier R² cell. An error
# metric such as test_results.rootMeanSquaredError may have been intended
# here, for comparison with the crew stddev shown above (confirm).
test_results.r2

0.9664847178009763