In [17]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [18]:
from math import log
from sklearn.metrics import log_loss
from random import seed
import pandas as pd
import matplotlib.pyplot as plt
#%matplotlib inline

In [19]:
# Create the Spark session for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("Quora1_model")
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
spark

In [20]:
# Locations of the feature files and of the predictions output.
# Nothing is read in this cell; it only defines the paths.

# small training data
trainFeaturesSmallPath = "data/train_features_1000.csv"
# full training data
trainFeaturesPath = "data/train_features2.csv"
# full test data (for cluster job only)
testFeaturesPath = "data/test_features2.csv"
# output path for the predictions CSV
outPath = "data/predictions.csv"

In [21]:
# Load the full training features from CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
train_df = spark.read.csv(
    trainFeaturesPath,
    header=True,
    inferSchema=True,
)
# Loading of the test features is currently disabled:
#test_df = spark.read.csv(testFeaturesPath, header=True, inferSchema=True)

In [22]:
# Print the column names and inferred types of the training data
train_df.printSchema()
# The same check for the test data is currently disabled:
#test_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- lWCount1: integer (nullable = true)
 |-- lWCount2: integer (nullable = true)
 |-- qWCount1: integer (nullable = true)
 |-- qWCount2: integer (nullable = true)
 |-- lLen1: integer (nullable = true)
 |-- lLen2: integer (nullable = true)
 |-- qLen1: integer (nullable = true)
 |-- qLen2: integer (nullable = true)
 |-- lWCount_ratio: double (nullable = true)
 |-- qWCount_ratio: double (nullable = true)
 |-- lLen_ratio: double (nullable = true)
 |-- qLen_ratio: double (nullable = true)
 |-- qNgrams_1: integer (nullable = true)
 |-- qNgrams_2: integer (nullable = true)
 |-- qNgrams_3: integer (nullable = true)
 |-- lNgrams_1: integer (nullable = true)
 |-- lNgrams_2: integer (nullable = true)
 |-- lNgrams_3: integer (nullable = true)
 |-- qUnigram_ratio: double (nullable = true)
 |-- lUnigram_ratio: double (nullable = true)
 |-- tfidfDistance: double (nullable = true)
 |-- is_duplicate: integer (nullable = true)



In [23]:
# Get features' names from the training data.
# Every column except the row id and the label is a feature. They are selected
# by name rather than by position (columns[1:-1]) so that a change in column
# order can never pull 'id' or 'is_duplicate' into the feature vector.
featuresNames = [
    colName for colName in train_df.columns
    if colName not in ('id', 'is_duplicate')
]

In [24]:
# Pack all feature columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=featuresNames,
    outputCol='features',
)
# NOTE(review): train_df is rebound to the transformed frame, so this cell
# presumably cannot be re-run on its own without re-running the load cell - confirm.
train_df = assembler.transform(train_df)
# The same transform for the test data is currently disabled:
#test_df=assembler.transform(test_df)

In [25]:
# Keep only the row id, the assembled feature vector and the label.
train_df = train_df.select(
    'id',
    'features',
    'is_duplicate',
)
# Preview the first three rows.
train_df.show(3)

+---+--------------------+------------+
| id|            features|is_duplicate|
+---+--------------------+------------+
|148|[6.0,7.0,12.0,14....|           0|
|463|[4.0,2.0,14.0,8.0...|           0|
|471|[2.0,1.0,11.0,5.0...|           0|
+---+--------------------+------------+
only showing top 3 rows



In [26]:
# Split the labelled data 70/30 into a training part and a held-out test part.
# seed(0) only seeds Python's own `random` module; Spark's randomSplit does not
# read it. The seed is therefore also passed to randomSplit explicitly so the
# split is reproducible from run to run.
seed(0)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3], seed=0)
# Report the size of each part (each count() triggers a Spark job).
print('trainingData has %d rows' % trainingData.count())
print('testData has %d rows' % testData.count())

trainingData has 282965 rows
testData has 121323 rows


In [27]:
# Random Forest model (disabled: this whole cell is commented out)
# Train a Random Forest model: RF_model
#for tree in [400,200]:# decreases variance
#    for depth in [30,1]: #variance
#        rf = RandomForestClassifier(labelCol="is_duplicate", featuresCol="features", \
#                                    maxMemoryInMB=16000, minInstancesPerNode=1, maxDepth=depth, numTrees=tree)
#        RF_model = rf.fit(trainingData)
#        # Make predictions: RF_predictions
#        RF_predictions = RF_model.transform(testData)
#        #RF_predictions.show(5)
#        # Select (prediction, true label) and compute AUC of the test
#        evaluator = BinaryClassificationEvaluator(labelCol="is_duplicate", metricName='areaUnderROC')
#        RF_AUC = evaluator.evaluate(RF_predictions)
#        print("Random Forest AUC = %g" % RF_AUC, tree, depth)

In [28]:
# Gradient-boosted trees: fit one model per boosting-iteration count and report
# the AUC (area under the ROC curve) measured on the held-out testData.
#
# NOTE(review): the saved output of this cell shows the maxIter=250 fit aborting
# with "SparkContext was shut down"; the cause is not visible from the notebook.

# The evaluator does not depend on maxIter, so it is built once outside the loop.
evaluator = BinaryClassificationEvaluator(
    labelCol="is_duplicate", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

for maxIter in [5,250]:
    # Create a GBT model : gbtModel (fixed seed so repeated runs are comparable)
    gbt = GBTClassifier(labelCol="is_duplicate", featuresCol="features", maxIter=maxIter, seed=0)
    gbtModel=gbt.fit(trainingData)
    # make predictions: gbtPredictions
    gbtPredictions=gbtModel.transform(testData)
    # Calculate AUC
    gbtAUC = evaluator.evaluate(gbtPredictions)
    print("AUC = %g" %  gbtAUC, "iter=",maxIter)
# Results recorded from earlier runs:
#AUC = 0.82515 iter= 100
#AUC = 0.8312 iter= 200


AUC = 0.789846 iter= 5


Py4JJavaError: An error occurred while calling o275.fit.
: org.apache.spark.SparkException: Job 3660 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:820)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:818)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:818)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1732)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:83)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1651)
	at org.apache.spark.SparkContext$$anonfun$stop$8.apply$mcV$sp(SparkContext.scala:1921)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1317)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1920)
	at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:581)
	at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1954)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
	at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:746)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:745)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:745)
	at org.apache.spark.ml.tree.impl.RandomForest$.findSplitsBySorting(RandomForest.scala:928)
	at org.apache.spark.ml.tree.impl.RandomForest$.findSplits(RandomForest.scala:906)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:118)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:125)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:323)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:53)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:175)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:59)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# UDF that returns element 1 of a vector column as a float
# (presumably the class-1 probability from a model's 'probability' column - confirm).
prob_of_one_udf = func.udf(
    lambda vec: float(vec[1]),
    FloatType(),
)

In [None]:
#outdf = predictions.withColumn('predict', func.round(prob_of_one_udf('probability'),6)).select('id','predict')
#outdf.cache()
#outdf.show(6)

In [None]:
# write csv
#outdf.orderBy('id').coalesce(1).write.csv(outPath,header=True,mode='overwrite',quote="")