# 1. Prepare feature matrix

Feature matrix are saved via previous program. So in this part, they are loaded from local files.

# 2. Run in Spark

In [77]:
spark = SparkSession.builder.appName("HomeDepot").getOrCreate()
df = spark.read.csv("/Users/jiemin/Documents/cmu/tasks/task14/train_s_formatted.csv",header=True,inferSchema=True)

In [78]:
df.take(1)

[Row(id=2, product_uid=100001, relevance=3.0, cosine_1=0.276609, cosine_2=0.206489, cosine_3=0.0, common_word=1.0, fuzzy_train=56.0, jaccard_train=0.2, query_length=2.0)]

In [79]:
trainSet, testSet = df.randomSplit([0.8, 0.2])

In [80]:
trainSet.take(1)

[Row(id=2, product_uid=100001, relevance=3.0, cosine_1=0.276609, cosine_2=0.206489, cosine_3=0.0, common_word=1.0, fuzzy_train=56.0, jaccard_train=0.2, query_length=2.0)]

## (1) Linear regression

In [81]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.linalg import SparseVector
from pyspark.sql.functions import udf
vs = VectorAssembler(inputCols=["cosine_1","cosine_2","cosine_3","common_word","fuzzy_train","jaccard_train","query_length"],outputCol='features')
# vs = VectorAssembler(inputCols=["query_length"],outputCol='features')
train_lr = vs.transform(trainSet)

In [82]:
train_lr.take(1)

[Row(id=2, product_uid=100001, relevance=3.0, cosine_1=0.276609, cosine_2=0.206489, cosine_3=0.0, common_word=1.0, fuzzy_train=56.0, jaccard_train=0.2, query_length=2.0, features=DenseVector([0.2766, 0.2065, 0.0, 1.0, 56.0, 0.2, 2.0]))]

In [83]:
lr = LinearRegression(featuresCol='features', regParam=0.3, elasticNetParam=0.8,labelCol='relevance',maxIter=1000)
model = lr.fit(train_lr)

In [84]:
lr_summary = model.summary
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))
print("numIterations: %d" % lr_summary.totalIterations)
print("objectiveHistory: %s" % str(lr_summary.objectiveHistory))
print("RMSE: %f" % lr_summary.rootMeanSquaredError)
print("r2: %f" % lr_summary.r2)

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 2.3830579675733823
numIterations: 1
objectiveHistory: [0.4999999999999982]
RMSE: 0.533692
r2: -0.000000


In [85]:
test_lr = vs.transform(testSet)
predict_lr = model.transform(test_lr)

In [86]:
predict_lr.show(3)

+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+------------------+
| id|product_uid|relevance|cosine_1|cosine_2|cosine_3|common_word|fuzzy_train|jaccard_train|query_length|            features|        prediction|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+------------------+
| 16|     100005|     2.33|0.145886|0.383327|  0.0451|        1.0|       55.0|     0.066667|         3.0|[0.145886,0.38332...|2.3830579675733823|
| 23|     100007|     2.67|0.422888|0.304101| 0.14809|        1.0|      100.0|          0.1|         2.0|[0.422888,0.30410...|2.3830579675733823|
| 81|     100017|     2.33|     0.0|0.604409|     0.0|        0.0|       21.0|          0.0|         2.0|(7,[1,4,6],[0.604...|2.3830579675733823|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+-------------------

In [87]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="relevance", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predict_lr)
print("Root Mean Squared Error (RMSE) on validation data = %g" % rmse)

Root Mean Squared Error (RMSE) on validation data = 0.535112


## (2) Random forest

In [88]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer 
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.ml import Pipeline 

rf = RandomForestRegressor(featuresCol="features")
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(train_lr)
labelIndexer = StringIndexer(inputCol="relevance", outputCol="label").fit(train_lr)
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
model = pipeline.fit(train_lr)
predict_rf = model.transform(test_lr)

In [89]:
train_lr.show(3)

+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+
| id|product_uid|relevance|cosine_1|cosine_2|cosine_3|common_word|fuzzy_train|jaccard_train|query_length|            features|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+
|  2|     100001|      3.0|0.276609|0.206489|     0.0|        1.0|       56.0|          0.2|         2.0|[0.276609,0.20648...|
|  3|     100001|      2.5|     0.0|0.206489|     0.0|        0.0|       19.0|          0.0|         2.0|(7,[1,4,6],[0.206...|
|  9|     100002|      3.0|     0.0|0.240663|0.056452|        0.0|       19.0|          0.0|         2.0|[0.0,0.240663,0.0...|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+
only showing top 3 rows



In [90]:
predict_rf.show(5)

+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+-----+--------------------+------------------+
| id|product_uid|relevance|cosine_1|cosine_2|cosine_3|common_word|fuzzy_train|jaccard_train|query_length|            features|label|     indexedFeatures|        prediction|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+-----+--------------------+------------------+
| 16|     100005|     2.33|0.145886|0.383327|  0.0451|        1.0|       55.0|     0.066667|         3.0|[0.145886,0.38332...|  1.0|[0.145886,0.38332...|2.3154171800580485|
| 23|     100007|     2.67|0.422888|0.304101| 0.14809|        1.0|      100.0|          0.1|         2.0|[0.422888,0.30410...|  2.0|[0.422888,0.30410...|1.2797180313774803|
| 81|     100017|     2.33|     0.0|0.604409|     0.0|        0.0|       21.0|          0.0|         2.0|(7,[1,4,6],[0.604...|  1.0|(7,

In [91]:
# evluator for regression 
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predict_rf)
print("Root Mean Squared Error = %g" % rmse)

Root Mean Squared Error = 1.52284


## (3) Output result

In [92]:
df_test = spark.read.csv("/Users/jiemin/Documents/cmu/tasks/task14/test_s_formatted.csv",header=True,inferSchema=True)

In [93]:
df.show(5)

+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+
| id|product_uid|relevance|cosine_1|cosine_2|cosine_3|common_word|fuzzy_train|jaccard_train|query_length|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+
|  2|     100001|      3.0|0.276609|0.206489|     0.0|        1.0|       56.0|          0.2|         2.0|
|  3|     100001|      2.5|     0.0|0.206489|     0.0|        0.0|       19.0|          0.0|         2.0|
|  9|     100002|      3.0|     0.0|0.240663|0.056452|        0.0|       19.0|          0.0|         2.0|
| 16|     100005|     2.33|0.145886|0.383327|  0.0451|        1.0|       55.0|     0.066667|         3.0|
| 17|     100005|     2.67|0.428689|0.383327|0.132527|        3.0|      100.0|     0.230769|         3.0|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+
only showing top 5 rows



In [94]:
df_train = vs.transform(df)
df_train.show(5)

+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+
| id|product_uid|relevance|cosine_1|cosine_2|cosine_3|common_word|fuzzy_train|jaccard_train|query_length|            features|
+---+-----------+---------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+
|  2|     100001|      3.0|0.276609|0.206489|     0.0|        1.0|       56.0|          0.2|         2.0|[0.276609,0.20648...|
|  3|     100001|      2.5|     0.0|0.206489|     0.0|        0.0|       19.0|          0.0|         2.0|(7,[1,4,6],[0.206...|
|  9|     100002|      3.0|     0.0|0.240663|0.056452|        0.0|       19.0|          0.0|         2.0|[0.0,0.240663,0.0...|
| 16|     100005|     2.33|0.145886|0.383327|  0.0451|        1.0|       55.0|     0.066667|         3.0|[0.145886,0.38332...|
| 17|     100005|     2.67|0.428689|0.383327|0.132527|        3.0|      100.0|     0.230769|         3.0|[0.428

### 1) Linear regression

In [95]:
lr = LinearRegression(featuresCol='features', regParam=0.3, elasticNetParam=0.8,labelCol='relevance',maxIter=1000)
model_lr = lr.fit(df_train)
test = vs.transform(df_test)
predict_linear = model_lr.transform(test)

In [96]:
p_lr = predict_linear
p_lr = p_lr.drop("product_uid","cosine_1","cosine_2","cosine_3","common_word","fuzzy_train","jaccard_train","query_length","features")
p_lr.toPandas().to_csv("predict_result_lr.csv",index=False)

### 2) Random forest

In [97]:
model_rf = pipeline.fit(df_train)
predict_random = model.transform(test)

In [98]:
predict_random.show(5)

+---+-----------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+--------------------+------------------+
| id|product_uid|cosine_1|cosine_2|cosine_3|common_word|fuzzy_train|jaccard_train|query_length|            features|     indexedFeatures|        prediction|
+---+-----------+--------+--------+--------+-----------+-----------+-------------+------------+--------------------+--------------------+------------------+
|  1|     100001|     0.0|0.206489|0.043806|        0.0|       28.0|          0.0|         3.0|[0.0,0.206489,0.0...|[0.0,0.206489,0.0...|2.4903236903787382|
|  4|     100001|     0.0|0.206489|     0.0|        0.0|       24.0|          0.0|         3.0|(7,[1,4,6],[0.206...|(7,[1,4,6],[0.206...| 2.549244832902799|
|  5|     100001|0.300237|0.206489|0.064453|        1.0|       61.0|     0.166667|         3.0|[0.300237,0.20648...|[0.300237,0.20648...|2.0859801912656737|
|  6|     100001|0.485935|0.206489|0.136106|        1.0|  

In [None]:
p_rf = predict_random
p_rf = p_rf.drop("product_uid","cosine_1","cosine_2","cosine_3","common_word","fuzzy_train","jaccard_train","query_length","features")
p_rf.toPandas().to_csv("predict_result_rf",index=False)