In [144]:
# Notebook bootstrap -- keep this as the first cell of every new notebook and
# remember to change the app name below.
# findspark.init() is called before pyspark is imported so that the Spark install
# at the given path is the one the imports below resolve against.
import findspark
findspark.init('/home/user/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession

# Open the session for this analysis (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('linear_regression_adv_lche329_lasso')
    .getOrCreate()
)

# If you're getting an error with numpy, please type 'sudo pip install numpy --user' into the EC2 console.
from pyspark.ml.regression import LinearRegression

In [145]:
# Load the sales CSV: the first line holds the column names and the column
# types are inferred from the data.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', True)
    .load('./Sales_data/sales_en.csv')
)

In [146]:
# Print the schema of the DataFrame to inspect the column names and inferred types.
# These columns are the candidate features plus the label to predict
# (Profits is used as the label later in this notebook).
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales Representative: string (nullable = true)
 |-- Hospital Name: string (nullable = true)
 |-- Hospital Attribute: string (nullable = true)
 |-- Hospital Code: string (nullable = true)
 |-- Purchasing Price: double (nullable = true)
 |-- Selling Price: double (nullable = true)
 |-- IMF: string (nullable = true)
 |-- Hospital Class: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Client Name: string (nullable = true)
 |-- Sales Volume: double (nullable = true)
 |-- Profits: double (nullable = true)
 |-- Satisfaction: string (nullable = true)



In [147]:
# Peek at the first record; head() with no argument returns a single Row.
df.head()

Row(ID=1, Year=2016, Product Name='Corbrin Capsule', Unit='Dept. 2', Region='Wuhan', Sales Representative='Xiongting', Hospital Name='Huazhongkejidaxuetongjiyixueyuanfushuxieheyiyuan', Hospital Attribute='Ministerial hospital', Hospital Code='ADXH', Purchasing Price=47.14, Selling Price=63.14, IMF='1571181790', Hospital Class='Third Class', Department='Shenneike', Client Name='Denganguo', Sales Volume=236.0, Profits=3776.0, Satisfaction='Y')

In [148]:
# Print each field of the first row on its own line, which is easier to read
# than the single-line Row(...) repr.
first_row = df.head()
for field_value in first_row:
    print(field_value)

1
2016
Corbrin Capsule
Dept. 2
Wuhan
Xiongting
Huazhongkejidaxuetongjiyixueyuanfushuxieheyiyuan
Ministerial hospital
ADXH
47.14
63.14
1571181790
Third Class
Shenneike
Denganguo
236.0
3776.0
Y


In [149]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [150]:
# First attempt at an assembler: inputCols are the feature column names and
# outputCol is the name of the new vector column.
# NOTE(review): per the schema printed above, "Sales Representative",
# "Hospital Attribute", "Hospital Class" and "Department" are string columns, which
# VectorAssembler presumably cannot assemble directly -- confirm. This object is never
# used for a transform: `assembler` is rebound further down to the StringIndexer
# output columns before assembler.transform() is called.
assembler = VectorAssembler(
    inputCols=["Sales Representative", "Hospital Attribute", 
               "Selling Price","Hospital Class","Department","Sales Volume"],
    outputCol="features")

In [151]:
from pyspark.ml.feature import StringIndexer

# String-typed columns that need a numeric index before they can be used as features.
categorical_columns = [
    "Sales Representative",
    "Hospital Attribute",
    "Hospital Class",
    "Department",
]

# Index one column at a time, each step building on the previous result.
# Output names follow the pattern "<name with underscores>_Index",
# e.g. "Hospital Class" -> "Hospital_Class_Index".
df_indexed = df
for source_column in categorical_columns:
    indexer = StringIndexer(
        inputCol=source_column,
        outputCol=source_column.replace(" ", "_") + "_Index",
    )
    df_indexed = indexer.fit(df_indexed).transform(df_indexed)



In [152]:
# Confirm that the four *_Index columns were appended by the StringIndexer steps.
df_indexed.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales Representative: string (nullable = true)
 |-- Hospital Name: string (nullable = true)
 |-- Hospital Attribute: string (nullable = true)
 |-- Hospital Code: string (nullable = true)
 |-- Purchasing Price: double (nullable = true)
 |-- Selling Price: double (nullable = true)
 |-- IMF: string (nullable = true)
 |-- Hospital Class: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Client Name: string (nullable = true)
 |-- Sales Volume: double (nullable = true)
 |-- Profits: double (nullable = true)
 |-- Satisfaction: string (nullable = true)
 |-- Sales_Representative_Index: double (nullable = true)
 |-- Hospital_Attribute_Index: double (nullable = true)
 |-- Hospital_Class_Index: double (nullable = true)
 |-- Department_Index: double (nullable = true)



In [153]:
# Columns that make up the model input. Categorical inputs use the *_Index
# columns produced by StringIndexer; the two numeric columns are used as-is.
feature_columns = [
    "Sales_Representative_Index",
    "Hospital_Attribute_Index",
    "Selling Price",
    "Hospital_Class_Index",
    "Department_Index",
    "Sales Volume",
]

# Combine them into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [154]:
# Now that the assembler is configured, apply it: the result is df_indexed with
# the "features" vector column appended.
output = assembler.transform(df_indexed)

In [155]:
# Using printSchema, you can see that the features output column has been added.
output.printSchema()

# The features column of the first row is a dense vector holding the six inputs
# in the order they were listed in inputCols.
output.head(1)

root
 |-- ID: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales Representative: string (nullable = true)
 |-- Hospital Name: string (nullable = true)
 |-- Hospital Attribute: string (nullable = true)
 |-- Hospital Code: string (nullable = true)
 |-- Purchasing Price: double (nullable = true)
 |-- Selling Price: double (nullable = true)
 |-- IMF: string (nullable = true)
 |-- Hospital Class: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Client Name: string (nullable = true)
 |-- Sales Volume: double (nullable = true)
 |-- Profits: double (nullable = true)
 |-- Satisfaction: string (nullable = true)
 |-- Sales_Representative_Index: double (nullable = true)
 |-- Hospital_Attribute_Index: double (nullable = true)
 |-- Hospital_Class_Index: double (nullable = true)
 |-- Department_Index: double (nullable = true)
 |-- features: vector (nullable = true)

[Row(ID=1, Year=2016, Product Name='Corbrin Capsule', Unit='Dept. 2', Region='Wuhan', Sales Representative='Xiongting', Hospital Name='Huazhongkejidaxuetongjiyixueyuanfushuxieheyiyuan', Hospital Attribute='Ministerial hospital', Hospital Code='ADXH', Purchasing Price=47.14, Selling Price=63.14, IMF='1571181790', Hospital Class='Third Class', Department='Shenneike', Client Name='Denganguo', Sales Volume=236.0, Profits=3776.0, Satisfaction='Y', Sales_Representative_Index=139.0, Hospital_Attribute_Index=3.0, Hospital_Class_Index=0.0, Department_Index=3.0, features=DenseVector([139.0, 3.0, 63.14, 0.0, 3.0, 236.0]))]

In [156]:
# Keep only what the estimator needs: the assembled feature vector and the
# label column (Profits). This is the format Spark ML expects.
model_columns = ["features", "Profits"]
final_data = output.select(model_columns)
final_data.show()

+--------------------+-------+
|            features|Profits|
+--------------------+-------+
|[139.0,3.0,63.14,...| 3776.0|
|[139.0,3.0,63.14,...| 5072.0|
|[139.0,3.0,63.14,...| 4592.0|
|[139.0,3.0,63.14,...| 2560.0|
|[139.0,3.0,63.14,...| 1920.0|
|[139.0,3.0,63.14,...| 1104.0|
|[139.0,3.0,63.14,...| 2064.0|
|[139.0,3.0,63.14,...| 2752.0|
|[139.0,3.0,63.14,...| 3392.0|
|[139.0,3.0,63.14,...| 4960.0|
|[139.0,3.0,63.14,...| 2448.0|
|[139.0,3.0,63.14,...|  288.0|
|[139.0,3.0,63.14,...|    0.0|
|[139.0,3.0,63.14,...|    0.0|
|[139.0,3.0,63.14,...| 3536.0|
|[139.0,3.0,63.14,...|    0.0|
|[139.0,3.0,63.14,...| 9488.0|
|[139.0,3.0,63.14,...|17760.0|
|[139.0,3.0,63.14,...|  336.0|
|[139.0,3.0,63.14,...|  336.0|
+--------------------+-------+
only showing top 20 rows



In [157]:
# Randomised 70/30 train/test split.
# A fixed seed keeps the split -- and therefore every metric computed below --
# reproducible across re-runs; without it each execution trains and evaluates
# on a different set of rows.
# Other split ratios can be used depending on how easy/difficult the model is to train.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [158]:
# Summary statistics for each split: training data first, then testing data.
for split_df in (train_data, test_data):
    split_df.describe().show()

+-------+------------------+
|summary|           Profits|
+-------+------------------+
|  count|             28247|
|   mean|234.57779241024986|
| stddev| 910.0578547428056|
|    min|           -130.02|
|    max|           38656.0|
+-------+------------------+

+-------+------------------+
|summary|           Profits|
+-------+------------------+
|  count|             12038|
|   mean|234.97949338757667|
| stddev| 924.5838629470064|
|    min|           -130.02|
|    max|           35632.0|
+-------+------------------+



In [159]:
# Lasso regression: with the elastic net parameter set to 1.0 the penalty is
# pure L1, so this linear regression is equivalent to a Lasso model.
# 'Profits' is the label column.
lr = LinearRegression(
    labelCol='Profits',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=1.0,
)

In [160]:
# Fit the model on the training split only; the test split is kept aside for evaluation.
lrModel = lr.fit(train_data)

In [161]:
# Report the fitted parameters: the coefficient vector and the intercept.
fitted_coefficients = lrModel.coefficients
fitted_intercept = lrModel.intercept
print("Coefficients: {} Intercept: {}".format(fitted_coefficients, fitted_intercept))

Coefficients: [0.6325920066517491,4.04634790230776,0.24774777920769298,34.150934868328235,2.8186258355007245,11.816557221989177] Intercept: -77.9570112665469


In [162]:
# Evaluate the fitted model against the held-out test split. The returned summary
# object exposes the residuals and the metrics (RMSE, R2) read in the cells below.
test_results = lrModel.evaluate(test_data)

In [163]:
# Residuals on the test split: the difference between the actual Profits value
# and the model's prediction, one row per test record.
test_results.residuals.show()

# Root mean squared error on the test split (as discussed in the previous
# linear regression notebook). The printed label is "RMSE"; it was previously
# misspelled "RSME".
print("RMSE_Lasso: {}".format(test_results.rootMeanSquaredError))

+------------------+
|         residuals|
+------------------+
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
| 61.04903247406967|
|60.807741802444475|
|60.807741802444475|
|60.807741802444475|
| 50.92756036764168|
| 50.64017294376076|
| 50.64017294376076|
| 50.64017294376076|
| 44.38930025578043|
| 35.11276020134795|
| 35.11276020134795|
+------------------+
only showing top 20 rows

RSME_Lasso: 574.6522599571334


In [164]:
# R2 on the test split, read from the same evaluation summary as the error metric above.
print("R2_Lasso: {}".format(test_results.r2))

R2_Lasso: 0.6136742130826649


In [165]:
# Summary statistics of Profits over the full dataset (train + test combined).
# Presumably shown so the label's mean/stddev can be compared with the test error above.
final_data.describe().show()

+-------+------------------+
|summary|           Profits|
+-------+------------------+
|  count|             40285|
|   mean|234.69782905826548|
| stddev| 914.4112274804912|
|    min|           -130.02|
|    max|           38656.0|
+-------+------------------+

