<a href="https://colab.research.google.com/github/kazhar6821/spark-this-way1/blob/main/Tutorial_5_PySpark_Machine_Learning_Teaching_Data_to_Think_%F0%9F%A4%96.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Missing').getOrCreate()

In [2]:
## Read The dataset
training = spark.read.csv('test1.csv',header=True,inferSchema=True)

In [3]:
training.show()


+-------------+---+----------+------+
|         Name|Age|Experience|Salary|
+-------------+---+----------+------+
|     John Doe| 28|         5| 60000|
|   Jane Smith| 32|         8| 75000|
| Mike Johnson| 24|         2| 45000|
|    Sarah Lee| 29|         4| 55000|
|  David Brown| 35|        10| 90000|
|  Emily Davis| 27|         3| 50000|
|Robert Taylor| 40|        15|110000|
+-------------+---+----------+------+



In [4]:
training.printSchema()


root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
training.columns


['Name', 'Age', 'Experience', 'Salary']

In [13]:
from pyspark.ml.feature import VectorAssembler

featureassembler = VectorAssembler(
    inputCols=["Age", "Experience", "Salary"],
    outputCol="features"
)


In [14]:
output = featureassembler.transform(training)
output.show()

+-------------+---+----------+------+--------------------+
|         Name|Age|Experience|Salary|            features|
+-------------+---+----------+------+--------------------+
|     John Doe| 28|         5| 60000|  [28.0,5.0,60000.0]|
|   Jane Smith| 32|         8| 75000|  [32.0,8.0,75000.0]|
| Mike Johnson| 24|         2| 45000|  [24.0,2.0,45000.0]|
|    Sarah Lee| 29|         4| 55000|  [29.0,4.0,55000.0]|
|  David Brown| 35|        10| 90000| [35.0,10.0,90000.0]|
|  Emily Davis| 27|         3| 50000|  [27.0,3.0,50000.0]|
|Robert Taylor| 40|        15|110000|[40.0,15.0,110000.0]|
+-------------+---+----------+------+--------------------+



In [15]:
output.columns


['Name', 'Age', 'Experience', 'Salary', 'features']

In [18]:
finalized_data = output.select("features", "Salary")
finalized_data.show()



+--------------------+------+
|            features|Salary|
+--------------------+------+
|  [28.0,5.0,60000.0]| 60000|
|  [32.0,8.0,75000.0]| 75000|
|  [24.0,2.0,45000.0]| 45000|
|  [29.0,4.0,55000.0]| 55000|
| [35.0,10.0,90000.0]| 90000|
|  [27.0,3.0,50000.0]| 50000|
|[40.0,15.0,110000.0]|110000|
+--------------------+------+



In [33]:
from pyspark.ml.regression import LinearRegression

train_data, test_data = finalized_data.randomSplit([0.75, 0.25])
regressor = LinearRegression(featuresCol="features", labelCol="Salary")
regressor = regressor.fit(train_data)





In [35]:
### Coefficients
regressor.coefficients

DenseVector([674.7405, 2698.9619, 0.3927])

In [34]:
### Intercepts
regressor.intercept


4048.4429069988855

In [36]:
### Prediction
pred_results=regressor.evaluate(test_data)

In [37]:
pred_results.predictions.show()


+--------------------+------+-----------------+
|            features|Salary|       prediction|
+--------------------+------+-----------------+
|  [24.0,2.0,45000.0]| 45000|43313.14878905028|
|  [29.0,4.0,55000.0]| 55000|56012.11072668954|
|  [32.0,8.0,75000.0]| 75000| 76686.8512110242|
|[40.0,15.0,110000.0]|110000|114723.1833907664|
+--------------------+------+-----------------+



In [38]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError


(2277.249134857464, 7255940.8704769835)