In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
spark = SparkSession.builder.appName("LabEndsemPrac").getOrCreate()

In [4]:
df = spark.read.csv("synthetic_crop_yield.csv", 
                    sep=",", 
                    quote='"',
                    inferSchema=True,
                    header=True)
df.show(5)
df.printSchema()

+-------+-------+------+------+------------------+------------------+------------------+
|user_id|crop_id|rating| state|      soil_quality|          rainfall|             yield|
+-------+-------+------+------+------------------+------------------+------------------+
|     39|      1|     5|State2|1.4473952495864468|  97.8329737705031| 1333.412487228435|
|     29|      6|     5|State4|0.8373401410170235| 85.79430013239207|1024.3014593681485|
|     15|      6|     1|State4|1.2249811699240525|102.21311367797993|1101.4908676573082|
|     43|     16|     2|State1| 1.038813009039619| 101.1548262958734|1167.1508027715001|
|      8|      4|     4|State4|0.5606289260664682| 115.2785995477906|1026.8081772466048|
+-------+-------+------+------+------------------+------------------+------------------+
only showing top 5 rows

root
 |-- user_id: integer (nullable = true)
 |-- crop_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- soil_qua

In [5]:
filtered_df = df.filter(df["yield"]>1000).select("state","crop_id","yield")

filtered_df.show()

+------+-------+------------------+
| state|crop_id|             yield|
+------+-------+------------------+
|State2|      1| 1333.412487228435|
|State4|      6|1024.3014593681485|
|State4|      6|1101.4908676573082|
|State1|     16|1167.1508027715001|
|State4|      4|1026.8081772466048|
|State1|     11|1048.1265488050308|
|State4|      8| 1121.091683364107|
|State2|      5| 1014.178130564413|
|State2|      6|1116.0197683270414|
|State3|      2|1108.6912921908213|
|State3|      4|1109.4283350120954|
|State4|      6|  1113.24535297919|
|State1|     17|1060.2047208327124|
|State4|     19|1043.9593469996205|
|State1|      2| 1073.129717331181|
|State4|     19|1163.3677419264504|
|State4|     20|1118.9529829553862|
|State4|      1|1167.2442157750004|
|State4|     17|1028.3101558158687|
|State3|     15|1085.7840796361977|
+------+-------+------------------+
only showing top 20 rows



In [7]:
df = df.na.drop()
df = df.na.fill({"yield": 0})

df.show()

+-------+-------+------+------+------------------+------------------+------------------+
|user_id|crop_id|rating| state|      soil_quality|          rainfall|             yield|
+-------+-------+------+------+------------------+------------------+------------------+
|     39|      1|     5|State2|1.4473952495864468|  97.8329737705031| 1333.412487228435|
|     29|      6|     5|State4|0.8373401410170235| 85.79430013239207|1024.3014593681485|
|     15|      6|     1|State4|1.2249811699240525|102.21311367797993|1101.4908676573082|
|     43|     16|     2|State1| 1.038813009039619| 101.1548262958734|1167.1508027715001|
|      8|      4|     4|State4|0.5606289260664682| 115.2785995477906|1026.8081772466048|
|     21|     11|     2|State1|1.1899997373089435| 104.3671646235406|1048.1265488050308|
|     39|      9|     2|State3|1.1900965582378435| 93.75772216207828| 782.6749646092585|
|     19|      3|     1|State3| 1.430606804846537|106.16273806649025|  820.776994312683|
|     23|     19|    

In [11]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="state", outputCol="massive")
df = indexer.fit(df).transform(df)

df.show()

+-------+-------+------+------+------------------+------------------+------------------+-------------+-------+
|user_id|crop_id|rating| state|      soil_quality|          rainfall|             yield|categoryIndex|massive|
+-------+-------+------+------+------------------+------------------+------------------+-------------+-------+
|     39|      1|     5|State2|1.4473952495864468|  97.8329737705031| 1333.412487228435|          3.0|    3.0|
|     29|      6|     5|State4|0.8373401410170235| 85.79430013239207|1024.3014593681485|          2.0|    2.0|
|     15|      6|     1|State4|1.2249811699240525|102.21311367797993|1101.4908676573082|          2.0|    2.0|
|     43|     16|     2|State1| 1.038813009039619| 101.1548262958734|1167.1508027715001|          0.0|    0.0|
|      8|      4|     4|State4|0.5606289260664682| 115.2785995477906|1026.8081772466048|          2.0|    2.0|
|     21|     11|     2|State1|1.1899997373089435| 104.3671646235406|1048.1265488050308|          0.0|    0.0|
|

In [19]:
train, test = df.randomSplit([0.7, 0.3], seed=1234)


In [20]:
from pyspark.ml.recommendation import ALS

als = ALS(userCol="user_id", itemCol="crop_id", ratingCol="rating", rank=10)
model = als.fit(train)

# Predicting on test data
predictions = model.transform(test)


In [21]:
predictions.show()

+-------+-------+------+------+------------------+------------------+------------------+-------------+-------+----------+
|user_id|crop_id|rating| state|      soil_quality|          rainfall|             yield|categoryIndex|massive|prediction|
+-------+-------+------+------+------------------+------------------+------------------+-------------+-------+----------+
|      1|      1|     1|State4|0.8526026059288582| 77.97495157698857| 1028.288102584671|          2.0|    2.0| 2.9484446|
|      1|      1|     4|State1|1.0298812610885262| 96.13016042239097|1123.2451557587103|          0.0|    0.0| 2.9484446|
|      1|      5|     1|State4|1.2894321941951912| 93.86325079517123| 1085.826572304459|          2.0|    2.0| 2.7901173|
|      1|     17|     2|State3| 1.214641274908724| 95.10319166423974|1196.3561893911606|          1.0|    1.0| 3.7873878|
|      2|      1|     2|State4|1.1011026956243284|  118.865413333656| 946.8480362574217|          2.0|    2.0| 2.7820675|
|      2|      1|     5|