## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/weight_height.csv"
file_type = "csv"

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.csv(file_location, header=True,inferSchema = True)

In [0]:
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)



In [0]:
df.show()

+------+----------------+----------------+
|Gender|          Height|          Weight|
+------+----------------+----------------+
|  Male| 73.847017017515|241.893563180437|
|  Male|68.7819040458903|  162.3104725213|
|  Male|74.1101053917849|  212.7408555565|
|  Male|71.7309784033377|220.042470303077|
|  Male|69.8817958611153|206.349800623871|
|  Male|67.2530156878065|152.212155757083|
|  Male|68.7850812516616|183.927888604031|
|  Male|68.3485155115879|167.971110489509|
|  Male| 67.018949662883| 175.92944039571|
|  Male|63.4564939783664|156.399676387112|
|  Male|71.1953822829745|186.604925560358|
|  Male|71.6408051192206|213.741169489411|
|  Male|64.7663291334055|167.127461073476|
|  Male|69.2830700967204|189.446181386738|
|  Male|69.2437322298112|186.434168021239|
|  Male|67.6456197004212|172.186930058117|
|  Male|72.4183166259878|196.028506330482|
|  Male| 63.974325721061| 172.88347020878|
|  Male|69.6400598997523| 185.98395757313|
|  Male|67.9360048540095|182.426648013226|
+------+---

+------+----------------+----------------+------------------+
|Gender|          Height|          Weight|               BMI|
+------+----------------+----------------+------------------+
|  Male| 73.847017017515|241.893563180437| 2302.749410656825|
|  Male|68.7819040458903|  162.3104725213|1658.9285185583867|
|  Male|74.1101053917849|  212.7408555565|2018.0354712165592|
|  Male|71.7309784033377|220.042470303077| 2156.527905603826|
|  Male|69.8817958611153|206.349800623871|2075.8469076393617|
|  Male|67.2530156878065|152.212155757083|1591.0832310324224|
|  Male|68.7850812516616|183.927888604031| 1879.787060446488|
|  Male|68.3485155115879|167.971110489509|  1727.67015918736|
|  Male| 67.018949662883| 175.92944039571| 1845.424275079034|
|  Male|63.4564939783664|156.399676387112|1732.6669913031053|
|  Male|71.1953822829745|186.604925560358|1842.5810559950112|
|  Male|71.6408051192206|213.741169489411|2097.4086193057383|
|  Male|64.7663291334055|167.127461073476| 1814.069234843414|
|  Male|

In [0]:
 df.select('Height').show()

+----------------+
|          Height|
+----------------+
| 73.847017017515|
|68.7819040458903|
|74.1101053917849|
|71.7309784033377|
|69.8817958611153|
|67.2530156878065|
|68.7850812516616|
|68.3485155115879|
| 67.018949662883|
|63.4564939783664|
|71.1953822829745|
|71.6408051192206|
|64.7663291334055|
|69.2830700967204|
|69.2437322298112|
|67.6456197004212|
|72.4183166259878|
| 63.974325721061|
|69.6400598997523|
|67.9360048540095|
+----------------+
only showing top 20 rows



In [0]:
df.columns

Out[7]: ['Gender', 'Height', 'Weight']

In [0]:
## Handling categorical features

from pyspark.ml.feature import StringIndexer

indexer=StringIndexer(inputCol="Gender",outputCol="Gender_indexed")
df_r=indexer.fit(df).transform(df)
df_r.show()
`
## for multiple column
# indexer=StringIndexer(inputCols=["smoker","day","time"],outputCols=["smoker_indexed","day_indexed","time_index"])                                                                  
# indexer.fit(df_r).transform(df_r).show()


Exception ignored in: <function JavaWrapper.__del__ at 0x7f9f09c96550>
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'StringIndexer' object has no attribute '_java_obj'
+------+----------------+----------------+--------------+
|Gender|          Height|          Weight|Gender_indexed|
+------+----------------+----------------+--------------+
|  Male| 73.847017017515|241.893563180437|           1.0|
|  Male|68.7819040458903|  162.3104725213|           1.0|
|  Male|74.1101053917849|  212.7408555565|           1.0|
|  Male|71.7309784033377|220.042470303077|           1.0|
|  Male|69.8817958611153|206.349800623871|           1.0|
|  Male|67.2530156878065|152.212155757083|           1.0|
|  Male|68.7850812516616|183.927888604031|           1.0|
|  Male|68.3485155115879|167.971110489509|           1.0|
|  Male| 67.018949662883| 175.9294403

In [0]:
df_r = df_r.withColumn('BMI',(703 * (df['Weight']/df['Height'])))

In [0]:
from pyspark.ml.feature import VectorAssembler

featureassembler=VectorAssembler(inputCols=['Gender_indexed','Height','Weight'],outputCol="Independent Features")
output=featureassembler.transform(df_r)


In [0]:
output.show()

+------+----------------+----------------+--------------+------------------+--------------------+
|Gender|          Height|          Weight|Gender_indexed|               BMI|Independent Features|
+------+----------------+----------------+--------------+------------------+--------------------+
|  Male| 73.847017017515|241.893563180437|           1.0| 2302.749410656825|[1.0,73.847017017...|
|  Male|68.7819040458903|  162.3104725213|           1.0|1658.9285185583867|[1.0,68.781904045...|
|  Male|74.1101053917849|  212.7408555565|           1.0|2018.0354712165592|[1.0,74.110105391...|
|  Male|71.7309784033377|220.042470303077|           1.0| 2156.527905603826|[1.0,71.730978403...|
|  Male|69.8817958611153|206.349800623871|           1.0|2075.8469076393617|[1.0,69.881795861...|
|  Male|67.2530156878065|152.212155757083|           1.0|1591.0832310324224|[1.0,67.253015687...|
|  Male|68.7850812516616|183.927888604031|           1.0| 1879.787060446488|[1.0,68.785081251...|
|  Male|68.348515511

In [0]:
final_df = output.select('Independent Features','BMI')

In [0]:
from pyspark.ml.regression import LinearRegression
##train test split
train_data,test_data=final_df.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='BMI')
regressor=regressor.fit(train_data)

In [0]:
regressor.coefficients

Out[27]: DenseVector([-1.2522, -25.8782, 10.6613])

In [0]:
regressor.intercept

Out[29]: 1694.5868110806764

In [0]:
results = regressor.evaluate(test_data)

In [0]:
results.predictions.show()

+--------------------+------------------+------------------+
|Independent Features|               BMI|        prediction|
+--------------------+------------------+------------------+
|[0.0,55.336492408...|1122.6173696287412|1204.6785421702311|
|[0.0,56.066636350...| 1123.102098344527|1198.6265418243142|
|[0.0,56.078699732...|1184.5019102628657|1250.7377832248212|
|[0.0,56.108902096...|1008.9927485609927|1101.1552032122977|
|[0.0,56.167299186...| 974.9923525169057|1071.5759819162363|
|[0.0,56.534165808...|1215.4412866218772|1273.6592267647682|
|[0.0,56.548843079...|1129.3927843077522|1199.7558012793797|
|[0.0,56.630411976...|1110.7949851761948|  1183.06978061455|
|[0.0,56.678140490...|1206.4754768728878|1264.8807073994542|
|[0.0,56.764456446...| 980.5358792298916|1069.7244675863394|
|[0.0,56.822239838...|1261.6870864817054|1311.3659359715914|
|[0.0,56.856082129...|1203.8743648963987| 1261.288838874371|
|[0.0,56.975133231...|1100.2414732924942|1170.8372785173226|
|[0.0,57.148198080...|11

In [0]:
## Performance Metrics
results.r2,results.meanAbsoluteError,results.meanSquaredError


Out[35]: (0.9959142262866425, 12.001699241312608, 271.28109075492165)