### Possibly interesting features based on https://news.ycombinator.com/item?id=36590226
- Time of day [Done]
- how many posts 1 hr before post made (indicative of whether it's peak hour or not)
- Analysis on title: TF-IDF? Remove stopwords? Any other importance measures, and onehot encode the impt words?

In [93]:
import os

from pyspark.sql import SQLContext, Window
from pyspark.sql.functions import col,sum,desc,when,udf, percent_rank

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression, LinearSVC, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder

from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
sc

In [3]:
# Change file path
os.chdir("/Users/hydraze/Library/CloudStorage/GoogleDrive-tohziyu2@gmail.com/My Drive/Studies/KU Leuven/Courses/Classes/Y1S2/Advanced Analytics in Business/Project/3/AdvancedAnalytics_Streaming-Text-Analytics")

In [4]:
# Load all folder names, then read into one textfile
file_path = "/Users/hydraze/Library/CloudStorage/GoogleDrive-tohziyu2@gmail.com/My Drive/Studies/KU Leuven/Courses/Classes/Y1S2/Advanced Analytics in Business/Project/3/AdvancedAnalytics_Streaming-Text-Analytics/"
folders_names = [file_path + 'data/' + folder for folder in os.listdir(file_path + 'data/') if "saved_stories" in folder]
compiled_stories = sc.textFile(','.join(folders_names[0:100]))

In [38]:
df = spark.read.json(compiled_stories)

                                                                                

In [None]:
# Convert frontpage to numeric
df = df.withColumn('frontpage', when(df.frontpage==True, 1).otherwise(0))

In [39]:
# Compile cleaning steps which cannot be fit into a pipeline. These steps will not cause data leakage
# Will have to be implemented on the script for doing streaming predictions

# Extracting type of post: Show HN
df = df.withColumn('isShowHN', when(df.title.contains("Show HN"), 1).otherwise(0))

# Extracting time of day
extract_time_of_day_udf = udf(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%H'))


df = df.withColumn('time_of_day', extract_time_of_day_udf(df.posted_at))

# Extracting day of week
weekDay =  udf(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%w'))

df = df.withColumn('day_of_week', weekDay(df.posted_at))

### Modelling time

In [40]:
# Train test split based on time: Have to prevent data leakage. Sort dataframe by posted_at, and give a percentile rank allowing us to split
df = df.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("posted_at")))

train = df.where("rank <= .8").drop("rank")
test = df.where("rank > .8").drop("rank")

In [None]:
from pyspark.sql.functions import col

# Helper function to drop column if it exists
def drop_column_if_exists(df, col_name):
    return df.drop(col_name) if col_name in df.columns else df

# Fill null values
df = df.na.fill({"title": "", "source_title": "", "source_text": ""})

# If you would like to overwrite it
df = drop_column_if_exists(df, "title_tokens")
df = drop_column_if_exists(df, "title_filtered")
df = drop_column_if_exists(df, "title_cv")
df = drop_column_if_exists(df, "source_title_tokens")
df = drop_column_if_exists(df, "source_title_filtered")
df = drop_column_if_exists(df, "source_cv")
df = drop_column_if_exists(df, "title_tfidf")
df = drop_column_if_exists(df, "source_title_tfidf")

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline

# 1. Tokenization
tokenizer = Tokenizer(inputCol="title", outputCol="title_tokens")
tokenizer_source = Tokenizer(inputCol="source_title", outputCol="source_title_tokens")

# 2. Stop Word Removal
remover = StopWordsRemover(inputCol="title_tokens", outputCol="title_filtered")
remover_source = StopWordsRemover(inputCol="source_title_tokens", outputCol="source_title_filtered")

# 3. Vectorization
vectorizer = CountVectorizer(inputCol="title_filtered", outputCol="title_cv")
vectorizer_source = CountVectorizer(inputCol="source_title_tokens", outputCol="source_title_cv")

# 4. TF-IDF
idf = IDF(inputCol="title_cv", outputCol="title_tfidf")
idf_source = IDF(inputCol="source_title_cv", outputCol="source_title_tfidf")

# Create a Pipeline to process the data
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf])

# Fit the pipeline to the DataFrame
model = pipeline.fit(df)

# Transform the DataFrame
df_transformed = model.transform(df)

# Show the DataFrame with the new 'title_features' column
df_transformed.select("title", "title_tokens", "title_filtered", "title_cv", "title_tfidf").show(truncate=False)

In [52]:
# Define numerical, categorical and target columns. To adjust the column lists here which will feed into the processing later
NUM_COL = ['votes', 'comments']
CAT_COL = ['isShowHN', 'time_of_day', 'day_of_week']

In [58]:
# Other cleaning stages which can be fit into a pipeline. Will automatically apply all steps to test set

# Dealing with numerical variables
numerical_vector_assembler = VectorAssembler(inputCols=NUM_COL, outputCol='num_col_vector')
std_scaler = StandardScaler(inputCol='num_col_vector',
                            outputCol='scaled_num_col_vector',
                            withStd=True, withMean=True)

# Dealing with categorical variables
isShowHN_indexer = StringIndexer(inputCol='isShowHN',
                            outputCol='isShowHN_index')
isShowHN_ohe = OneHotEncoder(inputCol='isShowHN_index',
                                outputCol='isShowHN_OHE')

# To put all numerical vectors and onehotencoded categorical variables into the same final_feature_vector vector
overall_assembler = VectorAssembler(inputCols=['scaled_num_col_vector',
                                               'isShowHN_OHE'],
                                    outputCol='final_feature_vector')


In [79]:
# Initiate model(s) and params
lr = LogisticRegression(maxIter=100, family="binomial",
                        featuresCol='final_feature_vector', labelCol='frontpage',
                        weightCol="frontpage")

param_grid_lr = (ParamGridBuilder()
                .addGrid(lr.regParam, [0.0, 0.1,0.5]) 
                .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) 
                .build())

rfc = RandomForestClassifier(maxDepth=30, seed=42, weightCol="frontpage")

param_grid_rfc = (ParamGridBuilder()
                .addGrid(rfc.numTrees, [10, 100]) 
                .addGrid(rfc.impurity, ['gini', 'entroopy']) 
                .build())

In [87]:
# Compile final pipelines
cleaning_stages = [numerical_vector_assembler,
                   std_scaler, 
                   isShowHN_indexer, 
                   isShowHN_ohe,
                   overall_assembler]
pipeline_lr = Pipeline(stages=cleaning_stages + [lr])
pipeline_rfc = Pipeline(stages=cleaning_stages + [rfc])

In [96]:
# Defining evaluator and crossvalidation object
evaluator = BinaryClassificationEvaluator().setLabelCol("frontpage")

# Join everything together using a CrossValidator object.
crossval_lr = CrossValidator(
    estimator=pipeline_lr, 
    estimatorParamMaps=param_grid_lr, 
    evaluator=evaluator, 
    numFolds=3,
    parallelism=4, 
    seed=42
)

In [97]:
# Training
cvModel = crossval_lr.fit(train)


24/05/09 19:03:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:03:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:03:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:03:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:03:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:03:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 1

KeyboardInterrupt: 

24/05/09 19:03:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:03:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:04:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:04:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:04:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 19:04:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 1

In [69]:
# Model evaluation
best_model = cvModel.bestModel
best_score = cvModel.avgMetrics[0]

print("Best model: ", best_model)
print("Best score: ", best_score)

Best model:  PipelineModel_eb506f5816f2
Best score:  0.9418476184285297


In [70]:
# Get best model
best_lr_params = best_model.stages[-1].extractParamMap()

print("Best score (RMSE):", best_score, end="\n\n")

for parameter, value in best_lr_params.items():
    print(f"{str(parameter):50s}, {value}")

Best score (RMSE): 0.9418476184285297

LogisticRegression_a34220c6c6c6__aggregationDepth , 2
LogisticRegression_a34220c6c6c6__elasticNetParam  , 1.0
LogisticRegression_a34220c6c6c6__family           , binomial
LogisticRegression_a34220c6c6c6__featuresCol      , final_feature_vector
LogisticRegression_a34220c6c6c6__fitIntercept     , True
LogisticRegression_a34220c6c6c6__labelCol         , frontpage
LogisticRegression_a34220c6c6c6__maxBlockSizeInMB , 0.0
LogisticRegression_a34220c6c6c6__maxIter          , 100
LogisticRegression_a34220c6c6c6__predictionCol    , prediction
LogisticRegression_a34220c6c6c6__probabilityCol   , probability
LogisticRegression_a34220c6c6c6__rawPredictionCol , rawPrediction
LogisticRegression_a34220c6c6c6__regParam         , 0.1
LogisticRegression_a34220c6c6c6__standardization  , True
LogisticRegression_a34220c6c6c6__threshold        , 0.5
LogisticRegression_a34220c6c6c6__tol              , 1e-06


In [72]:
# Test scores
test_pred = best_model.transform(test)

# show scores
print(evaluator.evaluate(test_pred))

24/05/09 17:27:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 17:27:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 17:27:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

0.9759615384615384


24/05/09 17:28:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/09 17:28:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [73]:
# save best model from cv grid search
mPath =  file_path+"models/best_model"
best_model.write().overwrite().save(mPath)

                                                                                