# Ex3: NLP - Tags 

### Requirement: Build a tag filter. Use NLP tools and a classifier to predict the tag for a question. In the future, questions could be auto-tagged by such a classifier, or tags could be recommended to users before they post.
- Dataset: stack-overflow-data.csv. It contains Stack Overflow questions and associated tags.
- Link tham khảo: http://benalexkeen.com/multiclass-text-classification-with-pyspark/

In [None]:
import findspark
findspark.init()

In [None]:
import pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf

In [None]:
# Set the executor memory property (6g) before the context is created, then
# connect to the Spark master at the given URL under the app name 'Stack_Overflow'.
SparkContext.setSystemProperty('spark.executor.memory', '6g')
sc = SparkContext(master='spark://172.25.53.2:7077', appName='Stack_Overflow')

In [None]:
# Display the SparkContext as the cell output.
sc

In [None]:
# Wrap the existing SparkContext in a SparkSession to use the DataFrame API.
spark = SparkSession(sc)

In [None]:
# Dataset location on HDFS; the commented-out line is a local-file alternative.
file_name = "hdfs://172.24.40.251:19000/stack_overflow_data.csv"
# file_name = "stack-overflow-data.csv"

In [None]:
# Load the CSV, treating the first row as a header and letting Spark infer column types.
data = spark.read.csv(file_name, inferSchema=True,header=True)

In [None]:
# Preview the first 5 rows.
data.show(5)

In [None]:
# Number of rows per tag (up to 30 groups displayed).
data.groupby('tags').count().show(30)

In [None]:
# Rows whose 'tags' value is null.
tags_null_data = data.filter(data.tags.isNull())

In [None]:
# How many rows have no tag.
tags_null_data.count()

In [None]:
# Keep only the rows that have a tag; untagged rows cannot be used as labelled examples.
data = data.filter(data.tags.isNotNull())

In [None]:
# Row count after removing untagged rows.
data.count()

In [None]:
from pyspark.sql.functions import *

## Clean and Prepare the Data

**Create a new length feature:**

In [None]:
from pyspark.sql.functions import length

In [None]:
# Add a 'length' column holding the character length of each post.
data = data.withColumn('length',length(data['post']))

In [None]:
# Preview the data with the new 'length' column.
data.show()

In [None]:
# Per-tag mean of the numeric columns; the original author notes a pretty
# clear difference between tags.
data.groupby('tags').mean().show()

## Feature Transformations

In [None]:
from bs4 import BeautifulSoup

from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [None]:
class BsTextExtractor(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that strips HTML markup from a string column.

    The text of each value in ``inputCol`` is extracted with BeautifulSoup
    and written to ``outputCol`` as a string column.

    Parameters
    ----------
    inputCol : str, optional
        Name of the column holding the raw (HTML) text.
    outputCol : str, optional
        Name of the column that receives the extracted plain text.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(BsTextExtractor, self).__init__()
        # keyword_only stores the keyword arguments of this call in _input_kwargs.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set the input/output column params and return ``self._set(...)``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with ``outputCol`` added, holding the HTML-free text."""

        def strip_html(raw):
            # A null value would make BeautifulSoup raise inside the UDF and
            # fail the whole job; emit an empty string so later string stages
            # still receive a value.
            if raw is None:
                return ""
            # Name the parser explicitly: without it BeautifulSoup picks
            # whichever parser is installed on each worker (and warns), so
            # the extracted text could differ between machines.
            return BeautifulSoup(raw, "html.parser").text

        extract_text = udf(strip_html, StringType())
        source_col = dataset[self.getInputCol()]
        return dataset.withColumn(self.getOutputCol(), extract_text(source_col))

In [None]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer
# Label stage: map each tag string to a numeric class index.
class_to_num = StringIndexer(inputCol="tags", outputCol="label")

# Text stages, in the order the data flows through them:
# raw post -> plain text -> tokens -> tokens without stop words -> counts -> TF-IDF.
text_extractor = BsTextExtractor(
    inputCol="post",
    outputCol="cleaned_post",
)
tokenizer = Tokenizer(
    inputCol="cleaned_post",
    outputCol="token_text",
)
stopremove = StopWordsRemover(
    inputCol="token_text",
    outputCol="stop_tokens",
)
count_vec = CountVectorizer(
    inputCol="stop_tokens",
    outputCol="c_vec",
)
idf = IDF(
    inputCol="c_vec",
    outputCol="tf_idf",
)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [None]:
# Combine the TF-IDF vector and the post length into a single 'features' vector column.
clean_up = VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

### The Model

We'll use Naive Bayes, but feel free to play around with this choice!

In [None]:
from pyspark.ml.classification import NaiveBayes

In [None]:
# Naive Bayes estimator with all default parameters.
nb = NaiveBayes()

### Pipeline

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Preprocessing pipeline: label indexing first, then the text stages in
# order, ending with the assembler that builds the 'features' column.
data_prep_pipe = Pipeline(
    stages=[
        class_to_num,
        text_extractor,
        tokenizer,
        stopremove,
        count_vec,
        idf,
        clean_up,
    ]
)

In [None]:
# Fit the preprocessing pipeline on the full dataset.
# NOTE(review): this is fitted before the train/test split below, so the
# fitted stages have also seen the rows that later end up in the test split.
cleaner = data_prep_pipe.fit(data)

In [None]:
# Apply the fitted preprocessing stages, adding the label and feature columns.
clean_data = cleaner.transform(data)

### Training and Evaluation!

In [None]:
# Keep only the two columns the classifiers are trained on.
clean_data = clean_data.select(['label','features'])

In [None]:
# Preview the prepared label/features rows.
clean_data.show() 

In [None]:
# Random 70/30 train/test split; the fixed seed keeps the split repeatable.
(training,testing) = clean_data.randomSplit([0.7,0.3], seed=142)

In [None]:
#training.cache()

In [None]:
#testing.cache()

In [None]:
#training.groupBy("label").count().show()

In [None]:
#testing.groupBy("label").count().show()

In [None]:
# Train the Naive Bayes model on the training split.
predictor = nb.fit(training)

In [None]:
# Score the held-out split with the trained Naive Bayes model.
test_results = predictor.transform(testing)

In [None]:
# Preview the scored test rows.
test_results.show()

In [None]:
# Confusion matrix in long form: one row per (label, prediction) pair with its count.
test_results.groupBy('label', 'prediction').count().show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# MulticlassClassificationEvaluator reports F1 by default; request accuracy
# explicitly so the number matches the message printed below.
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting: {}".format(acc))

In [None]:
# Save to the local machine (per the original author's note).
# Persist the fitted model (`predictor`), not the untrained `nb` estimator;
# saving `nb` would store only its parameters and none of what was learned.
# NOTE(review): save() raises if the target path already exists.
predictor.save("NB_TagFilters_model")

In [None]:
# Save to HDFS.
# Persist the fitted model (`predictor`), not the untrained `nb` estimator;
# saving `nb` would store only its parameters and none of what was learned.
# NOTE(review): save() raises if the target path already exists.
predictor.save("hdfs://172.24.40.251:19000/NB_TagFilters_model")

- Not very good result! (~72%)
- Solution: Try switching out the classification models! Or even try to come up with other engineered features!...

### Use LogisticRegression/Random Forest

### Logistic Regression

In [None]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

In [None]:
# Logistic regression: at most 20 iterations, regularization strength 0.3,
# and elasticNetParam=0 (pure L2 penalty).
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [None]:
# Train the logistic regression model on the same training split.
predictor_1 = lg.fit(training)

In [None]:
# Score the held-out split with the logistic regression model.
test_results_1 = predictor_1.transform(testing)

In [None]:
# Confusion matrix in long form: one row per (label, prediction) pair with its count.
test_results_1.groupBy('label', 'prediction').count().show()

In [None]:
# MulticlassClassificationEvaluator reports F1 by default; request accuracy
# explicitly so the number matches the message printed below.
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
acc_1 = acc_eval.evaluate(test_results_1)
print("Accuracy of model at predicting: {}".format(acc_1))

In [None]:
## This is not a better result.

In [None]:
# Save to the local machine (save the fitted model, not the `lg` estimator)
# predictor_1.save("LG_TagFilters_model")

In [None]:
# Save to HDFS (save the fitted model, not the `lg` estimator)
# predictor_1.save("hdfs://172.24.40.251:19000/LG_TagFilters_model")

### Random forest

In [None]:
# Random forest: 500 shallow trees (depth <= 5) with up to 64 bins per feature.
# Line continuations are implicit inside the parentheses.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=500,
    maxDepth=5,
    maxBins=64,
)

In [None]:
# Train the random forest model on the same training split.
predictor_2 = rf.fit(training)

In [None]:
# Score the held-out split with the random forest model.
test_results_2 = predictor_2.transform(testing)

In [None]:
# Confusion matrix in long form: one row per (label, prediction) pair with its count.
test_results_2.groupBy('label', 'prediction').count().show()

In [None]:
# How many test rows were assigned to each predicted class.
test_results_2.groupBy('prediction').count().show()

In [None]:
# MulticlassClassificationEvaluator reports F1 by default; request accuracy
# explicitly so the number matches the message printed below.
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
acc_2 = acc_eval.evaluate(test_results_2)
print("Accuracy of model at predicting: {}".format(acc_2))

In [None]:
## It has higher accuracy but is not a better result!!!

In [None]:
# Save to the local machine (save the fitted model, not the `rf` estimator)
# predictor_2.save("RF_TagFilters_model")

In [None]:
# Save to HDFS (save the fitted model, not the `rf` estimator)
# predictor_2.save("hdfs://172.24.40.251:19000/RF_TagFilters_model")

In [None]:
# Shut down the SparkContext.
sc.stop()