In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import col, regexp_replace, lower, trim
from pyspark.ml import PipelineModel
# Decoding the indexer
from pyspark.ml.feature import StringIndexerModel, IndexToString
import datetime

In [0]:
# Saved pipeline model and the StringIndexer model used to decode its predictions.
ml_model = "/mnt/deBDProject/modelRF"
stringindexer = "/mnt/deBDProject/stringindexerRF"

# Today's date as an ISO string (YYYY-MM-DD); it names the Posts folder to read.
today_date = datetime.date.today().isoformat()

# Read every parquet file found under today's Posts folder.
posts_file_location = (
    "/mnt/deBDProject/ml_training_project/Posts/" + today_date + "/*"
)
posts = spark.read.parquet(posts_file_location)

In [0]:
# Helper that scores a DataFrame of posts with the saved model (a plain Python function, not a Spark UDF)
def predictions_udf(df, ml_model, stringindexer):
    """Score posts with a saved pipeline model and decode the predicted label.

    Despite its name this is an ordinary Python function that works on whole
    DataFrames; it is not registered as a Spark UDF.

    Args:
        df: DataFrame of posts. The ``Body``, ``Tags`` and ``ViewCount``
            columns are read.
        ml_model: Path handed to ``PipelineModel.load``.
        stringindexer: Path handed to ``StringIndexerModel.load``; the labels
            of that model are used to turn the numeric prediction back into
            a string.

    Returns:
        DataFrame with the columns ``text``, ``tags``, ``ViewCount``,
        ``prediction`` and ``decoded``.
    """
    # Drop rows whose Body is NULL (empty strings are kept).
    df = df.filter("Body is not null")

    # Strip HTML tags from Body. For Tags, '<' becomes a space and '>' is
    # removed, so "<php><mysql>" ends up as the array [php, mysql].
    df = df.withColumn("Body", regexp_replace(df.Body, "<.*?>", "")) \
        .withColumn("Tags", split(trim(translate(col("Tags"), "<>", " ")), " "))

    # Rename the columns to the names used when calling the model.
    df = df.select(
        col("Body").alias("text"),
        col("Tags"),
        col("ViewCount").alias("viewcount"),
    )

    # Text clean-up: remove URLs, keep letters only, collapse runs of
    # whitespace, lower-case and trim.
    cleaned = df.withColumn('text', regexp_replace('text', r"http\S+", "")) \
        .withColumn('text', regexp_replace('text', r"[^a-zA-Z]", " ")) \
        .withColumn('text', regexp_replace('text', r"\s{2,}", " ")) \
        .withColumn('text', lower('text')) \
        .withColumn('text', trim('text'))

    # Load the saved pipeline model and run it over the cleaned text.
    model = PipelineModel.load(ml_model)
    prediction = model.transform(cleaned)

    # Refer to the columns by the exact names created above ("Tags",
    # "viewcount") and alias them to the names this function returns
    # ("tags", "ViewCount"). Selecting 'tags' / 'ViewCount' directly only
    # resolves while Spark's column matching is case-insensitive.
    predicted = prediction.select(
        col("text"),
        col("Tags").alias("tags"),
        col("viewcount").alias("ViewCount"),
        col("prediction"),
    )

    # Load the saved StringIndexer model to obtain the label list.
    indexer = StringIndexerModel.load(stringindexer)

    # Map each numeric prediction back to its label in a new 'decoded' column.
    i2s = IndexToString(inputCol='prediction', outputCol='decoded', labels=indexer.labels)
    converted = i2s.transform(predicted)

    return converted

In [0]:
# Score today's posts using the saved model and label indexer paths.
result = predictions_udf(
    df=posts,
    ml_model=ml_model,
    stringindexer=stringindexer,
)

In [0]:
# result.limit(15).toPandas()

Unnamed: 0,text,tags,ViewCount,prediction,decoded
0,take a look at camera programming topics for ios,[NULL],0,0.0,c#
1,doc was developed using the interviews ui tool...,[NULL],0,0.0,c#
2,for debugging purposes how do i echo the query...,"[php, mysql, pdo]",472,4.0,php
3,there is also another project which may be wor...,[NULL],0,0.0,c#
4,i ve built a function which will prepare sql s...,"[php, mysql, pdo]",1146,4.0,php
5,the problem is that the databinding has not ye...,[NULL],0,0.0,c#
6,i have changed the deployment provider in the ...,"[clickonce, manifest]",314,0.0,c#
7,for example here are two ways to set an intege...,"[c++, binary, decimal, hex]",1318,0.0,c#
8,while learning iphone programming every xcode ...,"[objective-c, cocoa-touch, cocoa]",2267,0.0,c#
9,you may want to check nerddinner which is done...,[NULL],0,0.0,c#


In [0]:
# Keep the decoded label (exposed as 'topic') together with the numeric prediction.
topics = result.select(col('decoded').alias('topic'), 'prediction')

In [0]:
# topics.show(15,truncate=False)

+-----+----------+
|topic|prediction|
+-----+----------+
|c#   |0.0       |
|c#   |0.0       |
|php  |4.0       |
|c#   |0.0       |
|php  |4.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
|c#   |0.0       |
+-----+----------+
only showing top 15 rows



In [0]:
# Count the rows per topic and list the most frequent topics first.
topic_final = (
    topics
    .groupBy("topic")
    .agg(count("topic").alias("qty"))
    .orderBy(col("qty").desc())
)


+----------+----+
|     topic| qty|
+----------+----+
|        c#|1953|
|      java| 103|
|       php|  42|
|javascript|  33|
|       sql|  26|
+----------+----+
only showing top 5 rows



In [0]:
# Add a constant 'trainingDate' column holding today's date string.
topic_final = topic_final.withColumn(
    colName="trainingDate",
    col=lit(today_date),
)

+----------+----+------------+
|     topic| qty|trainingDate|
+----------+----+------------+
|        c#|1953|  2023-12-06|
|      java| 103|  2023-12-06|
|       php|  42|  2023-12-06|
|javascript|  33|  2023-12-06|
|       sql|  26|  2023-12-06|
+----------+----+------------+
only showing top 5 rows



In [0]:
# define this function

def crt_sgl_file(result_path, df=None):
    """Write a DataFrame to ``result_path`` as a single CSV file with a header.

    Spark writes CSV output as a folder of part files. The data is therefore
    coalesced to one partition, written to a temporary folder, the single
    part file is copied to ``result_path``, and the temporary folder is
    removed again.

    The temporary folder is ``result_path`` without its file extension (or
    ``result_path + "_tmp"`` when the file name has no extension). Anything
    already stored at that folder path is overwritten and then deleted.

    Args:
        result_path: Destination of the final CSV file,
            e.g. "/mnt/deBDProject/BI/ml_result.csv".
        df: DataFrame to export. When omitted, the notebook-level
            ``topic_final`` is used.

    Raises:
        FileNotFoundError: if the write produced no CSV part file.
    """
    if df is None:
        df = topic_final

    # Strip only the extension of the file name, so dots elsewhere in the
    # path (for example in a directory name) do not truncate the folder path.
    parent, slash, file_name = result_path.rpartition("/")
    stem = file_name.rsplit(".", 1)[0]
    if stem and stem != file_name:
        tmp_dir = parent + slash + stem
    else:
        tmp_dir = result_path + "_tmp"

    # One partition -> one part file, so no rows are lost when copying a
    # single file out of the folder.
    df.coalesce(1).write.option("delimiter", ",").option("header", "true").mode("overwrite").csv(tmp_dir)

    try:
        # The folder also holds marker files; pick out the CSV part file.
        csv_parts = [
            entry.name
            for entry in dbutils.fs.ls(tmp_dir)
            if entry.name.endswith(".csv")
        ]
        if not csv_parts:
            raise FileNotFoundError("no CSV part file was written to " + tmp_dir)

        # Copy the part file to the requested destination.
        dbutils.fs.cp(tmp_dir + "/" + csv_parts[0], result_path)
    finally:
        # Always remove the temporary folder, even when the copy failed.
        dbutils.fs.rm(tmp_dir, True)

In [0]:
# Export the aggregated topic counts to this CSV file.
result_path = "/mnt/deBDProject/BI_RF/ml_result.csv"
crt_sgl_file(result_path=result_path)