In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, regexp_replace, lower, col, explode, regexp_extract

# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Load the ORC file into a DataFrame and print its column names and types.
orc_reader = spark.read
df = orc_reader.orc('data/dataframe.orc')
df.printSchema()

root
 |-- AcceptedAnswerId: long (nullable = true)
 |-- AnswerCount: long (nullable = true)
 |-- Body: string (nullable = true)
 |-- CommentCount: long (nullable = true)
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- FavoriteCount: long (nullable = true)
 |-- Id: long (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- OwnerUserId: long (nullable = true)
 |-- PostTypeId: long (nullable = true)
 |-- Score: long (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ViewCount: long (nullable = true)



In [3]:
# Preview the text, score and tag columns of the first rows.
preview_columns = ['Title', 'Body', 'Score', 'Tags']
df.select(*preview_columns).show()

+--------------------+--------------------+-----+--------------------+
|               Title|                Body|Score|                Tags|
+--------------------+--------------------+-----+--------------------+
|Coding the regres...|<p>Say I have 5 b...|    0|<r><regression><l...|
|                null|<p>I recommend yo...|    2|                null|
|Why dummy variabl...|<p>I was speaking...|    3|  <categorical-data>|
|Binomial random v...|<p>I've been read...|    4|<chi-squared><bin...|
| Analyze proportions|<p>I have a datas...|   13|    <r><multinomial>|
|                null|<p>The terminolog...|    6|                null|
|                null|<p>Perhaps he is ...|    3|                null|
|                null|<p>Like Michelle ...|    3|                null|
|Simulating financ...|<p>I would like t...|    1|<r><distributions...|
|Multiple regressi...|<p>I am performin...|    6|<r><time-series><...|
|Winning percentag...|<p>Suppose I want...|    4|<regression><gene...|
|Chi-s

In [4]:
# Preview the candidate class label: capture group 1 of the pattern, i.e. the
# text between the leading '<' of `Tags` and the first '>' that follows it.
first_tag = regexp_extract(col('Tags'), '^<(.+?)>', 1)
df.withColumn('category', first_tag).select('category').show()

+--------------------+
|            category|
+--------------------+
|                   r|
|                null|
|    categorical-data|
|         chi-squared|
|                   r|
|                null|
|                null|
|                null|
|                   r|
|                   r|
|          regression|
|       distributions|
|                null|
|       distributions|
|                null|
|                null|
|    categorical-data|
|statistical-signi...|
|                null|
|                null|
+--------------------+
only showing top 20 rows



In [5]:
# Derive `category` from the first tag in `Tags`, then keep only the rows whose
# category is one of the five listed values.
first_tag_column = regexp_extract(col('Tags'), '^<(.+?)>', 1)
is_target_category = col('category').isin(
    'r', 'regression', 'machine-learning', 'probability', 'time-series'
)
data = df.withColumn('category', first_tag_column).filter(is_target_category)

In [6]:
# Rows per category, largest count first (at most 10 rows are shown).
category_counts = data.groupBy('category').count()
category_counts.orderBy('count', ascending=False).limit(10).show()

+----------------+-----+
|        category|count|
+----------------+-----+
|               r|24119|
|      regression|18987|
|machine-learning|13303|
|     probability| 8446|
|     time-series| 7891|
+----------------+-----+



In [7]:
# Turn each `Body` string into a `words` array of lower-cased tokens.
tokenizer = Tokenizer(
    inputCol="Body",
    outputCol="words",
)
words = tokenizer.transform(data)
# NOTE(review): `Body` is raw HTML, so markup stays attached to tokens
# (e.g. "<p>say" in the output below); consider stripping tags before tokenizing.
words.select('Body', 'words').show()

+--------------------+--------------------+
|                Body|               words|
+--------------------+--------------------+
|<p>Say I have 5 b...|[<p>say, i, have,...|
|<p>I have a datas...|[<p>i, have, a, d...|
|<p>I would like t...|[<p>i, would, lik...|
|<p>I am performin...|[<p>i, am, perfor...|
|<p>Suppose I want...|[<p>suppose, i, w...|
|<p>I want to do m...|[<p>i, want, to, ...|
|<p>Suppose you ha...|[<p>suppose, you,...|
|<p>Suppose produc...|[<p>suppose, prod...|
|<p>I am doing som...|[<p>i, am, doing,...|
|<p>Complex survey...|[<p>complex, surv...|
|<p>Most of situat...|[<p>most, of, sit...|
|<p>I would like t...|[<p>i, would, lik...|
|<p>I'm implementi...|[<p>i'm, implemen...|
|<p>Given multiple...|[<p>given, multip...|
|<p>Is it reasonab...|[<p>is, it, reaso...|
|<p>I am working o...|[<p>i, am, workin...|
|<p>I've made repl...|[<p>i've, made, r...|
|<p>If p is the pr...|[<p>if, p, is, th...|
|<p>I've got a non...|[<p>i've, got, a,...|
|<p>In R, if I set...|[<p>in, r,

In [8]:
# Bag-of-words features: learn a vocabulary from `words`, then encode every row
# as a sparse term-count vector in the `features` column.
# NOTE(review): the vocabulary is fit on all rows, before the train/test split below.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    minDF=2.0,        # values >= 1 are a document count: keep terms seen in at least 2 rows
    vocabSize=50000,  # upper bound on the vocabulary size
)

model = cv.fit(words)
result = model.transform(words)

In [9]:
# Inspect the vectorized rows next to their source text and category.
feature_preview_columns = ['Title', 'Body', 'words', 'features', 'category']
result.select(*feature_preview_columns).show()

+--------------------+--------------------+--------------------+--------------------+----------------+
|               Title|                Body|               words|            features|        category|
+--------------------+--------------------+--------------------+--------------------+----------------+
|Coding the regres...|<p>Say I have 5 b...|[<p>say, i, have,...|(50000,[0,1,2,3,4...|               r|
| Analyze proportions|<p>I have a datas...|[<p>i, have, a, d...|(50000,[0,1,2,3,4...|               r|
|Simulating financ...|<p>I would like t...|[<p>i, would, lik...|(50000,[0,1,2,3,4...|               r|
|Multiple regressi...|<p>I am performin...|[<p>i, am, perfor...|(50000,[0,1,2,3,4...|               r|
|Winning percentag...|<p>Suppose I want...|[<p>suppose, i, w...|(50000,[0,1,2,3,4...|      regression|
|How to choose the...|<p>I want to do m...|[<p>i, want, to, ...|(50000,[0,1,2,4,5...|      regression|
|   Interaction terms|<p>Suppose you ha...|[<p>suppose, you,...|(50000,[2

In [10]:
# Encode the string `category` column as a numeric `label` column, as required
# by the classifier. With StringIndexer's default ordering the most frequent
# category receives index 0.0 (here: r -> 0.0, regression -> 1.0, ...).
indexer = StringIndexer(inputCol="category", outputCol="label")
indexed = indexer.fit(result).transform(result)
indexed.select('features', 'category', 'label').show()

+--------------------+----------------+-----+
|            features|        category|label|
+--------------------+----------------+-----+
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[0,1,2,3,4...|      regression|  1.0|
|(50000,[0,1,2,4,5...|      regression|  1.0|
|(50000,[2,8,13,15...|      regression|  1.0|
|(50000,[0,1,2,3,4...|     time-series|  4.0|
|(50000,[1,2,3,5,6...|               r|  0.0|
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[2,4,7,10,...|machine-learning|  2.0|
|(50000,[1,3,4,6,7...|      regression|  1.0|
|(50000,[0,1,2,3,4...|      regression|  1.0|
|(50000,[0,1,2,3,4...|machine-learning|  2.0|
|(50000,[0,1,2,3,4...|     time-series|  4.0|
|(50000,[0,1,2,3,4...|               r|  0.0|
|(50000,[0,1,2,3,4...|            

In [11]:
# Seeded 80/20 split into train and test, then a second 80/20 split of the
# training part into train and validate (roughly 64% / 16% / 20% overall).
SPLIT_WEIGHTS = [0.8, 0.2]
SPLIT_SEED = 12345

train, test = indexed.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)
train, validate = train.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

In [12]:
# Number of rows in the training split.
train.count()

46683

In [13]:
# Number of rows in the validation split.
validate.count()

11565

In [14]:
# Number of rows in the held-out test split.
test.count()

14498

In [15]:
# Logistic regression estimator with every parameter left at its default;
# by default it reads the `features` and `label` columns built above.
# NOTE(review): no regularization is configured here — worth tuning, given the
# gap between training and test accuracy reported further down.
lr = LogisticRegression()

In [16]:
# Fit the logistic regression model on the training split only.
lrModel = lr.fit(train)

In [18]:
# Score the held-out test split; the result keeps the input columns and adds
# `rawPrediction`, `probability` and `prediction` (see the schema printed below).
predictions = lrModel.transform(test)

In [19]:
# Show the columns of the scored DataFrame, including those added by the model.
predictions.printSchema()

root
 |-- AcceptedAnswerId: long (nullable = true)
 |-- AnswerCount: long (nullable = true)
 |-- Body: string (nullable = true)
 |-- CommentCount: long (nullable = true)
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- FavoriteCount: long (nullable = true)
 |-- Id: long (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- OwnerUserId: long (nullable = true)
 |-- PostTypeId: long (nullable = true)
 |-- Score: long (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ViewCount: long (nullable = true)
 |-- category: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [21]:
# Compare the predicted class and class probabilities with the true label.
prediction_columns = ['prediction', 'probability', 'label']
predictions.select(*prediction_columns).show()

+----------+--------------------+-----+
|prediction|         probability|label|
+----------+--------------------+-----+
|       0.0|[0.99999999814299...|  1.0|
|       0.0|[0.99999999990438...|  1.0|
|       2.0|[8.12714517432618...|  3.0|
|       1.0|[6.65844708753902...|  1.0|
|       3.0|[4.65509876440643...|  3.0|
|       1.0|[2.52655339123261...|  1.0|
|       1.0|[1.47861563657327...|  1.0|
|       0.0|[1.0,2.1332443640...|  0.0|
|       0.0|[1.0,6.7448164940...|  1.0|
|       1.0|[6.781924499707E-...|  1.0|
|       1.0|[0.0,1.0,0.0,0.0,...|  1.0|
|       0.0|[1.0,0.0,0.0,0.0,...|  0.0|
|       4.0|[0.0,0.0,0.0,0.0,...|  3.0|
|       0.0|[1.0,8.4689471332...|  4.0|
|       0.0|[1.0,9.5386296882...|  1.0|
|       1.0|[2.74766799830940...|  1.0|
|       0.0|[1.0,9.2569942373...|  0.0|
|       0.0|[1.0,0.0,0.0,0.0,...|  0.0|
|       0.0|[1.0,4.9235111209...|  0.0|
|       1.0|[4.26015254109860...|  1.0|
+----------+--------------------+-----+
only showing top 20 rows



In [25]:
# Test-set accuracy: the fraction of rows whose `prediction` equals `label`.
# Spark's built-in evaluator replaces the hand-rolled ratio of two count()
# calls, each of which launched its own job over `predictions`.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy_evaluator.evaluate(predictions)

0.6659539246792661

In [27]:
# 5 classes: the matrix has one row of coefficients per class and one column
# per feature (5 x 50000 in the output below).
# `coefficientMatrix` is a property of the fitted model, so it must be read
# without parentheses; calling it would try to call the returned matrix.
lrModel.coefficientMatrix

DenseMatrix(5, 50000, [0.0108, 0.0193, 0.5031, 0.0048, 0.043, 1.1338, -0.3093, 0.2789, ..., -35.5791, -23.4632, -6.6764, 63.5506, 7.0663, -2.575, -11.0971, 63.9501], 1)

In [22]:
# Overall (label-weighted) metrics taken from the fitted model's summary.
# NOTE(review): the summary comes from the fitted model rather than from
# `predictions`, so these numbers describe the training data and are not
# comparable to the test-set accuracy computed above.
trainingSummary = lrModel.summary

accuracy = trainingSummary.accuracy
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
fMeasure = trainingSummary.weightedFMeasure()
truePositiveRate = trainingSummary.weightedTruePositiveRate
falsePositiveRate = trainingSummary.weightedFalsePositiveRate

report_template = "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
report_values = (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)
print(report_template % report_values)

Accuracy: 1.0
FPR: 0.0
TPR: 1.0
F-measure: 1.0
Precision: 1.0
Recall: 1.0


In [91]:
# Per-label metrics from `trainingSummary`: for each metric, print a heading
# followed by one line per label index.
per_label_reports = [
    ("False positive rate by label:", lambda summary: summary.falsePositiveRateByLabel),
    ("True positive rate by label:", lambda summary: summary.truePositiveRateByLabel),
    ("Precision by label:", lambda summary: summary.precisionByLabel),
    ("Recall by label:", lambda summary: summary.recallByLabel),
    # fMeasureByLabel is a method rather than a property, hence the call.
    ("F-measure by label:", lambda summary: summary.fMeasureByLabel()),
]

for heading, fetch_values in per_label_reports:
    print(heading)
    for label_index, value in enumerate(fetch_values(trainingSummary)):
        print("label %d: %s" % (label_index, value))

False positive rate by label:
label 0: 0.0
label 1: 0.0
label 2: 0.0
label 3: 0.0
label 4: 0.0
True positive rate by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0
Precision by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0
Recall by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0
F-measure by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0
