In [0]:
# Load the life-expectancy CSV; the first row supplies the column names and
# Spark infers each column's type from the data.
life_exp = spark.read.csv(
    "/FileStore/datasets/life_expectancy.csv",
    header=True,
    inferSchema=True,
)

display(life_exp)

Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.27962362,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.52358168,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.21924272,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.1842153,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097108703,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
Afghanistan,2010,Developing,58.8,279.0,74,0.01,79.67936736,66.0,1989,16.7,102,66.0,9.2,66.0,0.1,553.32894,2883167.0,18.4,18.4,0.448,9.2
Afghanistan,2009,Developing,58.6,281.0,77,0.01,56.76221682,63.0,2861,16.2,106,63.0,9.42,63.0,0.1,445.8932979,284331.0,18.6,18.7,0.434,8.9
Afghanistan,2008,Developing,58.1,287.0,80,0.03,25.87392536,64.0,1599,15.7,110,64.0,8.33,64.0,0.1,373.3611163,2729431.0,18.8,18.9,0.433,8.7
Afghanistan,2007,Developing,57.5,295.0,82,0.02,10.91015598,63.0,1141,15.2,113,63.0,6.73,63.0,0.1,369.835796,26616792.0,19.0,19.1,0.415,8.4
Afghanistan,2006,Developing,57.3,295.0,84,0.03,17.17151751,64.0,1990,14.7,116,58.0,7.43,58.0,0.1,272.56377,2589345.0,19.2,19.3,0.405,8.1


In [0]:
# Number of rows in the frame as loaded, before any rows are dropped.
life_exp.count()

In [0]:
# Drop rows with missing values (dropna with its default arguments).
# NOTE: this rebinds `life_exp`, so the unfiltered frame is no longer
# reachable under this name; re-run the load cell to get it back.
life_exp = life_exp.dropna()

# Row count after the drop, for comparison with the count taken before it.
life_exp.count()

In [0]:
# Rows per development status, sorted by the status label.
status_counts = life_exp.groupBy('Status').count().orderBy('Status')
display(status_counts)

Status,count
Developed,242
Developing,1407


Output can only be rendered in Databricks

In [0]:
# Show the target column on its own (the column name carries a trailing space).
life_exp.select('Life expectancy ').display()

Life expectancy
65.0
59.9
59.9
59.5
59.2
58.8
58.6
58.1
57.5
57.3


Output can only be rendered in Databricks

In [0]:
# Show the target column next to the development status.
life_exp.select('Life expectancy ', 'Status').display()

Life expectancy,Status
65.0,Developing
59.9,Developing
59.9,Developing
59.5,Developing
59.2,Developing
58.8,Developing
58.6,Developing
58.1,Developing
57.5,Developing
57.3,Developing


Output can only be rendered in Databricks

In [0]:
# Candidate predictors to check against the target.
# Names are copied verbatim from the CSV header, including trailing spaces.
independent_variables = [
    'Adult Mortality',
    'Schooling',
    'Total expenditure',
    'Diphtheria ',
    'GDP',
    'Population',
]

# Target column, kept as a one-element list so it can be concatenated
# with `independent_variables`.
dependent_variable = ['Life expectancy ']

In [0]:
# Calculate the correlation coefficient between each candidate predictor and
# the target column.
# Only the predictors are iterated: correlating the target with itself is
# trivially 1.0 and would just cost an extra Spark job.
target_column = dependent_variable[0]
life_exp_corr = life_exp.select(independent_variables + dependent_variable)

for feature_name in independent_variables:
    print("Correlation to life expectancy for", feature_name, "is: ",
          life_exp_corr.stat.corr(target_column, feature_name))

In [0]:
# Data transformation: encode string-valued categorical columns as numeric
# indices so they can be used as model features.
from pyspark.ml.feature import StringIndexer

categoricalCols = ['Status']

# Each indexed column is named after its source column plus an "Index" suffix.
indexedCols = [f"{name}Index" for name in categoricalCols]

stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexedCols)

In [0]:
# Learn the label-to-index mapping from the cleaned frame, then apply it.
stringIndexerModel = stringIndexer.fit(life_exp)
life_exp_statusindex = stringIndexerModel.transform(life_exp)

# Spot-check the encoding on two countries.
is_sample_country = life_exp_statusindex['Country'].isin(['Afghanistan', 'Germany'])
status_preview = (
    life_exp_statusindex
    .filter(is_sample_country)
    .select('Country', 'Status', 'StatusIndex')
)
display(status_preview)

Country,Status,StatusIndex
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0
Afghanistan,Developing,0.0


In [0]:
# Columns assembled into the model's feature vector.
# The names reproduce the CSV header exactly, including its stray leading,
# trailing and doubled spaces, so do not "tidy" them.
feature_columns = [
    'Year',
    'Adult Mortality',
    'infant deaths',
    'Alcohol',
    'percentage expenditure',
    'Hepatitis B',
    'Measles ',
    ' BMI ',
    'under-five deaths ',
    'Polio',
    'Total expenditure',
    'Diphtheria ',
    ' HIV/AIDS',
    'GDP',
    'Population',
    ' thinness  1-19 years',
    ' thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
    'StatusIndex',
]

# Regression target.
label_column = 'Life expectancy '

In [0]:

# Pack the feature columns into a single vector column named 'features'.
from pyspark.ml.feature import VectorAssembler

vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the assembled vector and the label for modelling.
assembled = vector_assembler.transform(life_exp_statusindex)
life_exp_features_label = assembled.select(['features', label_column])

display(life_exp_features_label)

features,Life expectancy
"Map(vectorType -> dense, length -> 20, values -> List(2015.0, 263.0, 62.0, 0.01, 71.27962362, 65.0, 1154.0, 19.1, 83.0, 6.0, 8.16, 65.0, 0.1, 584.25921, 3.3736494E7, 17.2, 17.3, 0.479, 10.1, 0.0))",65.0
"Map(vectorType -> dense, length -> 20, values -> List(2014.0, 271.0, 64.0, 0.01, 73.52358168, 62.0, 492.0, 18.6, 86.0, 58.0, 8.18, 62.0, 0.1, 612.696514, 327582.0, 17.5, 17.5, 0.476, 10.0, 0.0))",59.9
"Map(vectorType -> dense, length -> 20, values -> List(2013.0, 268.0, 66.0, 0.01, 73.21924272, 64.0, 430.0, 18.1, 89.0, 62.0, 8.13, 64.0, 0.1, 631.744976, 3.1731688E7, 17.7, 17.7, 0.47, 9.9, 0.0))",59.9
"Map(vectorType -> dense, length -> 20, values -> List(2012.0, 272.0, 69.0, 0.01, 78.1842153, 67.0, 2787.0, 17.6, 93.0, 67.0, 8.52, 67.0, 0.1, 669.959, 3696958.0, 17.9, 18.0, 0.463, 9.8, 0.0))",59.5
"Map(vectorType -> dense, length -> 20, values -> List(2011.0, 275.0, 71.0, 0.01, 7.097108703, 68.0, 3013.0, 17.2, 97.0, 68.0, 7.87, 68.0, 0.1, 63.537231, 2978599.0, 18.2, 18.2, 0.454, 9.5, 0.0))",59.2
"Map(vectorType -> dense, length -> 20, values -> List(2010.0, 279.0, 74.0, 0.01, 79.67936736, 66.0, 1989.0, 16.7, 102.0, 66.0, 9.2, 66.0, 0.1, 553.32894, 2883167.0, 18.4, 18.4, 0.448, 9.2, 0.0))",58.8
"Map(vectorType -> dense, length -> 20, values -> List(2009.0, 281.0, 77.0, 0.01, 56.76221682, 63.0, 2861.0, 16.2, 106.0, 63.0, 9.42, 63.0, 0.1, 445.8932979, 284331.0, 18.6, 18.7, 0.434, 8.9, 0.0))",58.6
"Map(vectorType -> dense, length -> 20, values -> List(2008.0, 287.0, 80.0, 0.03, 25.87392536, 64.0, 1599.0, 15.7, 110.0, 64.0, 8.33, 64.0, 0.1, 373.3611163, 2729431.0, 18.8, 18.9, 0.433, 8.7, 0.0))",58.1
"Map(vectorType -> dense, length -> 20, values -> List(2007.0, 295.0, 82.0, 0.02, 10.91015598, 63.0, 1141.0, 15.2, 113.0, 63.0, 6.73, 63.0, 0.1, 369.835796, 2.6616792E7, 19.0, 19.1, 0.415, 8.4, 0.0))",57.5
"Map(vectorType -> dense, length -> 20, values -> List(2006.0, 295.0, 84.0, 0.03, 17.17151751, 64.0, 1990.0, 14.7, 116.0, 58.0, 7.43, 58.0, 0.1, 272.56377, 2589345.0, 19.2, 19.3, 0.405, 8.1, 0.0))",57.3


In [0]:
# Split with 0.75 / 0.25 weights for train / test, using a fixed seed.
split_weights = [0.75, 0.25]
train_df, test_df = life_exp_features_label.randomSplit(split_weights, seed=123)

# Row counts of the two splits.
(train_df.count(), test_df.count())

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Linear regression estimator reading the assembled 'features' vector and
# predicting the column named by `label_column`; all other settings are
# left at the library defaults.
linear_regression = LinearRegression(featuresCol='features', labelCol=label_column)

# Fit on the training split.
linear_regression_model = linear_regression.fit(train_df)

In [0]:
# One coefficient per entry of the feature vector.
print('Model Coefficients: \n{}'.format(linear_regression_model.coefficients))

In [0]:
# Fitted intercept term of the regression.
print('Intercept: \n{}'.format(linear_regression_model.intercept))

In [0]:
# Fit statistics computed on the training data.
training_summary = linear_regression_model.summary

print('RMSE: {:f}'.format(training_summary.rootMeanSquaredError))

print('R-SQUARED: {:f}'.format(training_summary.r2))

In [0]:
print('TRAINING DATASET RESIDUALS: ')

# Per-row residuals from the training summary.
display(training_summary.residuals)

residuals
-6.01352544816794
0.7341113587891357
-4.897008544887541
-1.114932990933454
-1.8452106620266164
3.65320321706804
-2.846751045142298
0.2252549056734381
1.4277636360986037
2.0375967110674797


Output can only be rendered in Databricks

In [0]:
# Score the test split with the fitted model; the result is later selected
# on its 'prediction' column alongside the features and label.
test_predictions = linear_regression_model.transform(test_df)

In [0]:
print('TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ')

# NOTE(review): the label is spelled in lower case here, unlike `label_column`;
# presumably it resolves through Spark's default case-insensitive column
# matching. Confirm before enabling spark.sql.caseSensitive.
predictions_vs_label = test_predictions.select('features', 'prediction', 'life expectancy ')
display(predictions_vs_label)

features,prediction,life expectancy
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 11.0, 1.0, 3.66, 91.71154052, 96.0, 662.0, 45.0, 1.0, 97.0, 6.26, 97.0, 0.1, 1175.788981, 38927.0, 2.1, 2.2, 0.656, 10.7, 0.0))",73.26255196509055,72.6
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 16.0, 27.0, 0.45, 63.42140024, 43.0, 7368.0, 44.8, 32.0, 95.0, 4.18, 95.0, 0.1, 1332.382358, 2.8849621E7, 6.7, 6.6, 0.519, 8.0, 0.0))",69.51087692175491,68.6
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 86.0, 2.0, 11.12, 1934.398154, 77.0, 152.0, 57.6, 2.0, 95.0, 7.21, 95.0, 0.1, 14676.769, 4567864.0, 0.6, 0.5, 0.82, 15.7, 1.0))",79.499090445798,79.1
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 95.0, 3.0, 12.91, 4238.540035, 84.0, 0.0, 55.1, 4.0, 94.0, 1.1, 9.0, 0.1, 23718.7467, 8221158.0, 1.2, 1.2, 0.855, 16.2, 1.0))",79.16866441324072,78.0
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 112.0, 4.0, 1.21, 264.7842196, 94.0, 47.0, 48.1, 5.0, 97.0, 5.4, 97.0, 0.1, 2213.91488, 9699197.0, 6.6, 6.5, 0.646, 12.8, 0.0))",73.45407699879013,72.9
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 133.0, 4.0, 0.49, 227.296617, 93.0, 32.0, 54.0, 4.0, 94.0, 9.65, 91.0, 0.1, 1657.889256, 51313.0, 4.0, 4.0, 0.7, 12.6, 0.0))",74.28471353642061,71.7
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 153.0, 3.0, 8.4, 412.4323974, 99.0, 77.0, 53.1, 4.0, 98.0, 5.5, 98.0, 0.1, 4492.72764, 3.8258629E7, 2.5, 2.8, 0.777, 14.6, 1.0))",76.13934377714219,73.7
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 158.0, 0.0, 1.24, 40.49128897, 93.0, 0.0, 65.5, 0.0, 91.0, 4.75, 91.0, 0.1, 263.27236, 9882.0, 0.1, 0.1, 0.676, 13.7, 0.0))",74.42074773866841,71.6
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 167.0, 18.0, 4.66, 477.1341814, 78.0, 1.0, 46.7, 21.0, 82.0, 5.91, 79.0, 0.1, 2472.197831, 443958.0, 2.6, 2.3, 0.65, 11.3, 0.0))",71.10892224767395,71.4
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 174.0, 6.0, 2.61, 28.80831098, 93.0, 0.0, 38.8, 8.0, 88.0, 6.63, 94.0, 1.7, 188.783165, 6524283.0, 2.8, 2.7, 0.551, 9.8, 0.0))",67.82541106274778,71.0


Output can only be rendered in Databricks

In [0]:
# Evaluate the untuned model on the test split.
test_summary = linear_regression_model.evaluate(test_df)

print('RMSE on Test Data = {:g}'.format(test_summary.rootMeanSquaredError))

print('R-SQUARED on Test Data = {:g}'.format(test_summary.r2))

##### Hyperparameter Tuning

In [0]:
# Print the estimator's description of every parameter it exposes,
# as a reference for choosing what to tune.
param_help = linear_regression.explainParams()
print(param_help)

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Search grid: 3 regParam values x 2 fitIntercept values x 3 elasticNetParam
# values = 18 candidate parameter maps.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(linear_regression.regParam, [0.1, 0.05, 0.01])
grid_builder = grid_builder.addGrid(linear_regression.fitIntercept, [False, True])
grid_builder = grid_builder.addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0])

paramGrid = grid_builder.build()

In [0]:
# Evaluator that scores predictions against `label_column`. No metric name is
# given, so the library default applies (rmse per the PySpark docs - confirm
# for the runtime in use).
evaluator = RegressionEvaluator(labelCol=label_column)

In [0]:
# Tune `linear_regression` over `paramGrid` with a single train/validation
# split: 80% of the data passed to fit() trains each candidate and the other
# 20% scores it with `evaluator`.
# The split is random, so a fixed seed is supplied to make the selected
# hyperparameters reproducible across runs.
tvs = TrainValidationSplit(estimator=linear_regression,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8,
                           seed=123)

In [0]:
# Run the tuning on the training split only; test_df is not passed in here.
model = tvs.fit(train_df)

In [0]:
# Score the test split with the model returned by the tuning run.
tuned_prediction = model.transform(test_df)

display(tuned_prediction.select('features', 'Life expectancy ', 'prediction'))

features,Life expectancy,prediction
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 11.0, 1.0, 3.66, 91.71154052, 96.0, 662.0, 45.0, 1.0, 97.0, 6.26, 97.0, 0.1, 1175.788981, 38927.0, 2.1, 2.2, 0.656, 10.7, 0.0))",72.6,73.31607049176262
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 16.0, 27.0, 0.45, 63.42140024, 43.0, 7368.0, 44.8, 32.0, 95.0, 4.18, 95.0, 0.1, 1332.382358, 2.8849621E7, 6.7, 6.6, 0.519, 8.0, 0.0))",68.6,69.57748531001522
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 86.0, 2.0, 11.12, 1934.398154, 77.0, 152.0, 57.6, 2.0, 95.0, 7.21, 95.0, 0.1, 14676.769, 4567864.0, 0.6, 0.5, 0.82, 15.7, 1.0))",79.1,79.49657900420249
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 95.0, 3.0, 12.91, 4238.540035, 84.0, 0.0, 55.1, 4.0, 94.0, 1.1, 9.0, 0.1, 23718.7467, 8221158.0, 1.2, 1.2, 0.855, 16.2, 1.0))",78.0,79.05740569596628
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 112.0, 4.0, 1.21, 264.7842196, 94.0, 47.0, 48.1, 5.0, 97.0, 5.4, 97.0, 0.1, 2213.91488, 9699197.0, 6.6, 6.5, 0.646, 12.8, 0.0))",72.9,73.52869654285507
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 133.0, 4.0, 0.49, 227.296617, 93.0, 32.0, 54.0, 4.0, 94.0, 9.65, 91.0, 0.1, 1657.889256, 51313.0, 4.0, 4.0, 0.7, 12.6, 0.0))",71.7,74.33504708880594
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 153.0, 3.0, 8.4, 412.4323974, 99.0, 77.0, 53.1, 4.0, 98.0, 5.5, 98.0, 0.1, 4492.72764, 3.8258629E7, 2.5, 2.8, 0.777, 14.6, 1.0))",73.7,76.18047399832824
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 158.0, 0.0, 1.24, 40.49128897, 93.0, 0.0, 65.5, 0.0, 91.0, 4.75, 91.0, 0.1, 263.27236, 9882.0, 0.1, 0.1, 0.676, 13.7, 0.0))",71.6,74.47058118238442
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 167.0, 18.0, 4.66, 477.1341814, 78.0, 1.0, 46.7, 21.0, 82.0, 5.91, 79.0, 0.1, 2472.197831, 443958.0, 2.6, 2.3, 0.65, 11.3, 0.0))",71.4,71.0604255151211
"Map(vectorType -> dense, length -> 20, values -> List(2000.0, 174.0, 6.0, 2.61, 28.80831098, 93.0, 0.0, 38.8, 8.0, 88.0, 6.63, 94.0, 1.7, 188.783165, 6524283.0, 2.8, 2.7, 0.551, 9.8, 0.0))",71.0,67.84612058410238


In [0]:
# R-squared of the tuned model on the test split.
# The metric is overridden for this one call through the params argument
# rather than with setMetricName(): `evaluator` is the object handed to the
# TrainValidationSplit, and mutating it would silently change the selection
# metric if the tuning cell were re-run afterwards.
r2_score = evaluator.evaluate(tuned_prediction, {evaluator.metricName: 'r2'})

print('R-SQUARED on Test Data = %g' % r2_score)

In [0]:
# Report the hyperparameters of the winning model.
# Uses the model's public getters (available in the Spark 3.x Python API)
# instead of the private `_java_obj` handle, and also reports fitIntercept,
# which is part of the search grid.
best_model = model.bestModel

print('Best regParam: ' + str(best_model.getRegParam()) + "\n" +
      'Best ElasticNetParam:' + str(best_model.getElasticNetParam()) + "\n" +
      'Best fitIntercept: ' + str(best_model.getFitIntercept()))