In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, regexp_replace, lower, col, explode, regexp_extract
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

spark = SparkSession.builder.getOrCreate()

In [2]:
df = spark.read.orc('data/dataframe.orc')
df.printSchema()

root
 |-- AcceptedAnswerId: long (nullable = true)
 |-- AnswerCount: long (nullable = true)
 |-- Body: string (nullable = true)
 |-- CommentCount: long (nullable = true)
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- FavoriteCount: long (nullable = true)
 |-- Id: long (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- OwnerUserId: long (nullable = true)
 |-- PostTypeId: long (nullable = true)
 |-- Score: long (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ViewCount: long (nullable = true)



In [3]:
df.select('Title', 'Body', 'Score', 'Tags').show()

+--------------------+--------------------+-----+--------------------+
|               Title|                Body|Score|                Tags|
+--------------------+--------------------+-----+--------------------+
|Coding the regres...|<p>Say I have 5 b...|    0|<r><regression><l...|
|                null|<p>I recommend yo...|    2|                null|
|Why dummy variabl...|<p>I was speaking...|    3|  <categorical-data>|
|Binomial random v...|<p>I've been read...|    4|<chi-squared><bin...|
| Analyze proportions|<p>I have a datas...|   13|    <r><multinomial>|
|                null|<p>The terminolog...|    6|                null|
|                null|<p>Perhaps he is ...|    3|                null|
|                null|<p>Like Michelle ...|    3|                null|
|Simulating financ...|<p>I would like t...|    1|<r><distributions...|
|Multiple regressi...|<p>I am performin...|    6|<r><time-series><...|
|Winning percentag...|<p>Suppose I want...|    4|<regression><gene...|
|Chi-s

In [4]:
df.withColumn('category', regexp_extract(col('tags'), '^<(.+?)>', 1)).select('category').show()

+--------------------+
|            category|
+--------------------+
|                   r|
|                null|
|    categorical-data|
|         chi-squared|
|                   r|
|                null|
|                null|
|                null|
|                   r|
|                   r|
|          regression|
|       distributions|
|                null|
|       distributions|
|                null|
|                null|
|    categorical-data|
|statistical-signi...|
|                null|
|                null|
+--------------------+
only showing top 20 rows



In [5]:
df.count()

340838

In [8]:
data = df.withColumn('category', regexp_extract(col('tags'), '^<(.+?)>', 1)).filter(col('category').isin('r', 'regression', 'machine-learning', 'probability'))

In [9]:
data.groupby('category').count().sort('count', ascending=False).limit(10).show()

+----------------+-----+
|        category|count|
+----------------+-----+
|               r|24119|
|      regression|18987|
|machine-learning|13303|
|     probability| 8446|
+----------------+-----+



In [10]:
tokenizer = Tokenizer(inputCol="Body", outputCol="words")
words = tokenizer.transform(data)
words.select('Body', 'words').show()

+--------------------+--------------------+
|                Body|               words|
+--------------------+--------------------+
|<p>Say I have 5 b...|[<p>say, i, have,...|
|<p>I have a datas...|[<p>i, have, a, d...|
|<p>I would like t...|[<p>i, would, lik...|
|<p>I am performin...|[<p>i, am, perfor...|
|<p>Suppose I want...|[<p>suppose, i, w...|
|<p>I want to do m...|[<p>i, want, to, ...|
|<p>Suppose you ha...|[<p>suppose, you,...|
|<p>I am doing som...|[<p>i, am, doing,...|
|<p>Complex survey...|[<p>complex, surv...|
|<p>Most of situat...|[<p>most, of, sit...|
|<p>I would like t...|[<p>i, would, lik...|
|<p>I'm implementi...|[<p>i'm, implemen...|
|<p>Given multiple...|[<p>given, multip...|
|<p>Is it reasonab...|[<p>is, it, reaso...|
|<p>I am working o...|[<p>i, am, workin...|
|<p>If p is the pr...|[<p>if, p, is, th...|
|<p>I've got a non...|[<p>i've, got, a,...|
|<p>In R, if I set...|[<p>in, r,, if, i...|
|<p>I have a datas...|[<p>i, have, a, d...|
|<p>This is someth...|[<p>this, 

In [11]:
cv = CountVectorizer(inputCol="words", outputCol="features", minDF=2.0, vocabSize=50000)

model = cv.fit(words)

result = model.transform(words)

In [12]:
result.select('Title', 'Body', 'words', 'features', 'category').show()

+--------------------+--------------------+--------------------+--------------------+----------------+
|               Title|                Body|               words|            features|        category|
+--------------------+--------------------+--------------------+--------------------+----------------+
|Coding the regres...|<p>Say I have 5 b...|[<p>say, i, have,...|(50000,[0,1,2,3,4...|               r|
| Analyze proportions|<p>I have a datas...|[<p>i, have, a, d...|(50000,[0,1,2,3,4...|               r|
|Simulating financ...|<p>I would like t...|[<p>i, would, lik...|(50000,[0,1,2,3,4...|               r|
|Multiple regressi...|<p>I am performin...|[<p>i, am, perfor...|(50000,[0,1,2,3,4...|               r|
|Winning percentag...|<p>Suppose I want...|[<p>suppose, i, w...|(50000,[0,1,2,3,4...|      regression|
|How to choose the...|<p>I want to do m...|[<p>i, want, to, ...|(50000,[0,1,2,4,5...|      regression|
|   Interaction terms|<p>Suppose you ha...|[<p>suppose, you,...|(50000,[2

In [13]:
result.groupby('category').count().show()

+----------------+-----+
|        category|count|
+----------------+-----+
|machine-learning|13303|
|      regression|18987|
|     probability| 8446|
|               r|24119|
+----------------+-----+



In [14]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(result).transform(result)
indexed.select('category', 'categoryIndex').show()

+----------------+-------------+
|        category|categoryIndex|
+----------------+-------------+
|               r|          0.0|
|               r|          0.0|
|               r|          0.0|
|               r|          0.0|
|      regression|          1.0|
|      regression|          1.0|
|      regression|          1.0|
|               r|          0.0|
|               r|          0.0|
|               r|          0.0|
|               r|          0.0|
|machine-learning|          2.0|
|      regression|          1.0|
|      regression|          1.0|
|machine-learning|          2.0|
|               r|          0.0|
|               r|          0.0|
|               r|          0.0|
|machine-learning|          2.0|
|      regression|          1.0|
+----------------+-------------+
only showing top 20 rows



In [16]:
train, test = indexed.randomSplit([0.8, 0.2], seed=12345)

In [25]:
lr = LogisticRegression(featuresCol='features', labelCol='categoryIndex', maxIter=400, regParam=0.3, elasticNetParam=0.8)

In [26]:
# Fit the model
lrModel = lr.fit(train)

In [24]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.37177091560555564
FPR: 0.37177091560555564
TPR: 0.37177091560555564
F-measure: 0.2015112175332571
Precision: 0.1382136136901932
Recall: 0.37177091560555564
