In [27]:
# !pip install pyspark
# !pip install numpy
# !pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [28]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression

# Create a SparkSession
spark = SparkSession.builder.appName("EmploymentPrediction").getOrCreate()

# Load the CSV file into a Spark DataFrame
data = spark.read.csv("employment.csv", header=True, inferSchema=True)

# Select the relevant columns
selected_data = data.select("Period", "Data_value")

# Drop rows with missing values
selected_data = selected_data.na.drop()

# Convert Period column to numeric
selected_data = selected_data.withColumn("Period", selected_data["Period"].cast("double"))

# Create a numerical label column
label_indexer = StringIndexer(inputCol="Data_value", outputCol="label").fit(selected_data)
indexed_data = label_indexer.transform(selected_data)

# Split the data into training and testing sets
(train_data, test_data) = indexed_data.randomSplit([0.7, 0.3])

# Prepare the feature vector
assembler = VectorAssembler(inputCols=["Period"], outputCol="features")
assembled_train_data = assembler.transform(train_data)
assembled_test_data = assembler.transform(test_data)

# Train a linear regression model
lr = LinearRegression(labelCol="label", featuresCol="features")
lr_model = lr.fit(assembled_train_data)

# Make predictions on the test data
predictions = lr_model.transform(assembled_test_data)
predictions.select("Period", "Data_value", "prediction").show()


+-------+----------+-----------------+
| Period|Data_value|       prediction|
+-------+----------+-----------------+
|2011.06|  3.149822|6462.875732307035|
|2011.06| 11.496498|6462.875732307035|
|2011.06| 48.249229|6462.875732307035|
|2011.06| 53.712856|6462.875732307035|
|2011.06| 56.435279|6462.875732307035|
|2011.06| 68.230651|6462.875732307035|
|2011.06| 70.378835|6462.875732307035|
|2011.06| 71.510597|6462.875732307035|
|2011.06| 72.387248|6462.875732307035|
|2011.06| 78.478014|6462.875732307035|
|2011.06| 85.297018|6462.875732307035|
|2011.06| 95.364781|6462.875732307035|
|2011.06|113.933198|6462.875732307035|
|2011.06|119.968233|6462.875732307035|
|2011.06|120.957226|6462.875732307035|
|2011.06| 125.06741|6462.875732307035|
|2011.06|139.115025|6462.875732307035|
|2011.06|170.108119|6462.875732307035|
|2011.06|197.836571|6462.875732307035|
|2011.06|199.490649|6462.875732307035|
+-------+----------+-----------------+
only showing top 20 rows



In [33]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegressionModel

# Create a SparkSession
spark = SparkSession.builder.appName("EmploymentPrediction").getOrCreate()

# Load the CSV file into a Spark DataFrame
data = spark.read.csv("employment.csv", header=True, inferSchema=True)

# Select the relevant columns
selected_data = data.select("Period", "Data_value")

# Drop rows with missing values
selected_data = selected_data.na.drop()

# Convert Period column to numeric
selected_data = selected_data.withColumn("Period", selected_data["Period"].cast("double"))

# Create a numerical label column
label_indexer = StringIndexer(inputCol="Data_value", outputCol="label").fit(selected_data)
indexed_data = label_indexer.transform(selected_data)

# Prepare the feature vector
assembler = VectorAssembler(inputCols=["Period"], outputCol="features")
assembled_data = assembler.transform(indexed_data)

# Split the data into training and testing sets
(train_data, test_data) = assembled_data.randomSplit([0.7, 0.3])

# Load the trained model
lr_model = LinearRegressionModel.load("linear_regression_model")

# Make predictions on the test data
predictions = lr_model.transform(test_data)
predictions.select("Period", "Data_value", "prediction").show()


Py4JJavaError: An error occurred while calling o1841.load.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/C:/Users/Admin/Desktop/Jupyter-Notebook/Big Data/spark/linear_regression_model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:208)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:292)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:292)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:288)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1449)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1443)
	at org.apache.spark.rdd.RDD.$anonfun$first$1(RDD.scala:1484)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.first(RDD.scala:1484)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:587)
	at org.apache.spark.ml.regression.LinearRegressionModel$LinearRegressionModelReader.load(LinearRegression.scala:825)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
