In [1]:
# Import the PySpark module
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
##Create SparkContext
sc = SparkContext.getOrCreate()

# Create SparkSession object
spark = SparkSession.builder.master('local[*]').appName('oneHot').getOrCreate()

In [3]:
## Is not the best choose for large data sets
flights = spark.read.csv('flights.csv', sep=',', header=True, inferSchema=True, nullValue='NA')

In [4]:
from pyspark.ml.feature import StringIndexer

# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the other categorical feature
flights = flights_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

### One-hot Encoding

In [5]:
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

+---+-------+-------------+
|org|org_idx|    org_dummy|
+---+-------+-------------+
|ORD|    0.0|(7,[0],[1.0])|
|SFO|    1.0|(7,[1],[1.0])|
|JFK|    2.0|(7,[2],[1.0])|
|LGA|    3.0|(7,[3],[1.0])|
|SJC|    4.0|(7,[4],[1.0])|
|SMF|    5.0|(7,[5],[1.0])|
|TUS|    6.0|(7,[6],[1.0])|
|OGG|    7.0|    (7,[],[])|
+---+-------+-------------+



In [6]:
flights_onehot.show()

+---+---+---+-------+------+---+----+------+--------+-----+-----------+-------+-------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|carrier_idx|org_idx|    org_dummy|
+---+---+---+-------+------+---+----+------+--------+-----+-----------+-------+-------------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|        6.0|    2.0|(7,[2],[1.0])|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|        0.0|    0.0|(7,[0],[1.0])|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|        0.0|    1.0|(7,[1],[1.0])|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|        1.0|    0.0|(7,[0],[1.0])|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|        1.0|    0.0|(7,[0],[1.0])|
|  5|  2|  1|     UA|   704|SFO| 550|  7.98|     102|    2|        0.0|    1.0|(7,[1],[1.0])|
|  7|  2|  6|     AA|   380|ORD| 733| 10.83|     135|   54|        1.0|    0.0|(7,[0],[1.0])|
|  1| 16|  6|     UA|  1477|ORD|1440|   8.0|     232|   -7| 

In [7]:
flights = flights_onehot

In [8]:
from pyspark.ml.feature import VectorAssembler

# Create an assembler object

assembler = VectorAssembler(inputCols=['mile','org_dummy'], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)
flights = flights_assembled

In [9]:
flights_assembled.show()

+---+---+---+-------+------+---+----+------+--------+-----+-----------+-------+-------------+--------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|carrier_idx|org_idx|    org_dummy|            features|
+---+---+---+-------+------+---+----+------+--------+-----+-----------+-------+-------------+--------------------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|        6.0|    2.0|(7,[2],[1.0])|(8,[0,3],[2153.0,...|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|        0.0|    0.0|(7,[0],[1.0])|(8,[0,1],[316.0,1...|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|        0.0|    1.0|(7,[1],[1.0])|(8,[0,2],[337.0,1...|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|        1.0|    0.0|(7,[0],[1.0])|(8,[0,1],[1236.0,...|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|        1.0|    0.0|(7,[0],[1.0])|(8,[0,1],[258.0,1...|
|  5|  2|  1|     UA|   704|SFO| 550|  7.98|     102|    2|        0.0|    1.0|(

In [10]:
# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)

### Linear regression

In [11]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
RegressionEvaluator(labelCol='duration').evaluate(predictions)

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|135     |148.00838636163186|
|120     |131.98696435629046|
|160     |150.5192062281406 |
|275     |265.0604172812531 |
|85      |92.53122359686756 |
+--------+------------------+
only showing top 5 rows



11.013344846790027

In [12]:
# Intercept (average minutes on ground)
inter = regression.intercept
print(inter)

# Coefficients
coefs = regression.coefficients 
print(coefs)

# Average minutes per km
minutes_per_km = regression.coefficients[0]
print(minutes_per_km)

# Average speed in km per hour

15.962765747410497
[0.11956285078612996,28.384628982646692,20.328302758040284,52.52303648902806,46.6402648624079,18.13267630098186,15.549814800702347,17.666886213401646]
0.11956285078612996


### Interpreting coefficients

In [13]:
# Average speed in km per hour
avg_speed_hour = 60 / regression.coefficients[0]
print(avg_speed_hour)

# Average minutes on ground at OGG
inter = regression.intercept
print(inter)

# Average minutes on ground at JFK
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

501.82811471538093
15.962765747410497
68.48580223643856
62.603030609818404


In [14]:
# Find the RMSE on testing data
from pyspark.ml.evaluation import RegressionEvaluator
RegressionEvaluator(labelCol='duration').evaluate(predictions)

# Average minutes on ground at OGG for flights departing between 21:00 and 24:00
avg_eve_ogg = regression.intercept
print(avg_eve_ogg)

# Average minutes on ground at OGG for flights departing between 00:00 and 03:00
avg_night_ogg = regression.intercept + regression.coefficients[7]
print(avg_night_ogg)

# Average minutes on ground at JFK for flights departing between 00:00 and 03:00
avg_night_jfk = regression.intercept + regression.coefficients[7] + regression.coefficients[3]
print(avg_night_jfk)

15.962765747410497
33.62965196081214
86.1526884498402


### Lasso regularization

In [15]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (α = 1) to training data
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(flights_train)

# Make predictions on testing data
#predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol='duration').evaluate(regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients
zero_coeff = sum([beta == 0 for beta in regression.coefficients])
print("Number of ceofficients equal to 0:", zero_coeff)

The test RMSE is 11.627558869357237
[0.11824617516771116,5.735841925911693,0.0,29.23464599249453,22.121212095427563,0.0,-2.162292165630389,0.0]
Number of ceofficients equal to 0: 3
