## INFOSYS 722 Assignment Iteration 4 - Sub-iteration 2

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import math

In [None]:
# Create (or reuse, if one already exists) the Spark session that the
# remaining cells refer to through the global name `spark`.
spark = SparkSession.builder.appName('infosys722-i4-i2').getOrCreate()

In [None]:
# Load the two prepared Parquet datasets (purchase details and sales) from
# the local "Ready Datasets" directory.
integrated_purchase_detail = (
    spark.read
    .format('parquet')
    .options(compression='gzip')
    .load('./Ready Datasets/purchase_detail_v001')
)
integrated_sales = (
    spark.read
    .format('parquet')
    .options(compression='gzip')
    .load('./Ready Datasets/sales_v001')
)

In [None]:
def remove_extreme_by_z_scores(dataframe, column, threshold = 1):
    """Drop rows whose z-score in `column` is above `threshold`.

    The filter is one-sided: only values far *above* the mean are removed;
    values far below the mean are kept. Rows where `column` is null are
    dropped as well, because their z-score is null and fails the comparison.

    Args:
        dataframe: Spark DataFrame to filter.
        column: name of the numeric column to standardise.
        threshold: maximum z-score (inclusive) a row may have to be kept.

    Returns:
        The filtered DataFrame. If the z-score is undefined (no non-null
        values, or a zero/NaN/null standard deviation, e.g. a constant column
        or a single row) the input is returned unchanged instead of silently
        losing every row to a null comparison.
    """
    mean, std = dataframe.agg(F.mean(column), F.stddev(column)).collect()[0]
    # With no spread there is no such thing as an extreme value.
    if mean is None or not std or math.isnan(std):
        return dataframe
    # Filter on the expression directly so no temporary column is added that
    # could overwrite (and then drop) an existing column of the same name.
    z_score = (F.col(column) - mean) / std
    return dataframe.where(z_score <= threshold)

# Filter SalesQuantity with the default z-score threshold (1), then show
# summary statistics of the values that remain.
integrated_sales = remove_extreme_by_z_scores(integrated_sales, 'SalesQuantity')
integrated_sales.describe('SalesQuantity').show()

## Modeling

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, LinearRegression, RandomForestRegressor
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, NaiveBayes, GBTClassifier
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

In [None]:
def evaluate_regression_model(result, labelCol, predictionCol='prediction', metrics=('rmse', 'mse', 'mae', 'r2')):
    """Compute, print and return regression metrics for a prediction DataFrame.

    Args:
        result: DataFrame containing both the label and prediction columns.
        labelCol: name of the column holding the true values.
        predictionCol: name of the column holding the model output.
        metrics: iterable of metric names passed to RegressionEvaluator as
            `metricName`. The default is an immutable tuple so it cannot be
            modified between calls.

    Returns:
        dict mapping each metric name to its computed value.
    """
    scores = []
    for metric in metrics:
        evaluator = RegressionEvaluator(
            labelCol=labelCol, metricName=metric, predictionCol=predictionCol)
        scores.append((metric, evaluator.evaluate(result)))
    # Print only after every metric has been computed so the report is not
    # interleaved with Spark job output.
    for metric, value in scores:
        print(f'{metric}: {value}')
    return dict(scores)

def evaluate_classification_model(result, labelCol, predictionCol='prediction', metrics=('f1', 'accuracy')):
    """Compute, print and return classification metrics for a prediction DataFrame.

    Args:
        result: DataFrame containing both the label and prediction columns.
        labelCol: name of the column holding the true class.
        predictionCol: name of the column holding the predicted class.
        metrics: iterable of metric names passed to
            MulticlassClassificationEvaluator as `metricName`. The default is
            an immutable tuple so it cannot be modified between calls.

    Returns:
        dict mapping each metric name to its computed value.
    """
    scores = []
    for metric in metrics:
        evaluator = MulticlassClassificationEvaluator(
            labelCol=labelCol, metricName=metric, predictionCol=predictionCol)
        scores.append((metric, evaluator.evaluate(result)))
    # Print only after every metric has been computed so the report is not
    # interleaved with Spark job output.
    for metric, value in scores:
        print(f'{metric}: {value}')
    return dict(scores)
    
def get_string_indexers(dataframe, suffix='_index'):
    """Build one StringIndexer per string-typed column of `dataframe`.

    Each indexer reads column `<name>` and writes `<name><suffix>`. The
    indexers are returned unfitted, in the column order of `dataframe.dtypes`.
    """
    string_columns = [name for name, kind in dataframe.dtypes if kind == 'string']
    return [
        StringIndexer(inputCol=name, outputCol=name + suffix)
        for name in string_columns
    ]

def generate_baseline(dataframe, target):
    """Add a constant `baseline_prediction` column holding the mean of `target`.

    The mean is computed over `dataframe` itself and attached to every row as
    a literal.
    """
    mean_row = dataframe.agg(F.mean(target)).collect()[0]
    constant_prediction = F.lit(mean_row[0])
    return dataframe.withColumn('baseline_prediction', constant_prediction)

def balance(dataframe, target, size=None):
    """Resample `dataframe` so every class of `target` has at most `size` rows.

    Classes with more than `size` rows are randomly down-sampled; classes with
    fewer are over-sampled by duplicating their rows and then randomly
    trimming to `size`. Rows whose `target` is null are excluded: they cannot
    be matched with `==` and previously caused a division by zero because
    they were counted as a class with zero members.

    Args:
        dataframe: Spark DataFrame to balance.
        target: name of the class column.
        size: rows to keep per class; defaults to the mean class size
            (truncated to an int).

    Returns:
        A DataFrame with the same schema as `dataframe`. It is empty when the
        input has no non-null `target` values.
    """
    labelled = dataframe.where(F.col(target).isNotNull())
    # One Spark job: collect the per-class counts and derive the mean locally.
    class_counts = [(row[0], row[1]) for row in labelled.groupBy(target).count().collect()]
    if not class_counts:
        return dataframe.limit(0)
    if size is None:
        size = int(sum(count for _, count in class_counts) / len(class_counts))

    balanced_df = None
    for category, count in class_counts:
        subset = labelled.where(F.col(target) == category)
        if count < size:
            # Duplicate each row `ratio` times so at least `size` rows exist;
            # the helper column only drives the explode and is dropped again.
            ratio = math.ceil(size / count)
            subset = subset.withColumn(
                '__balance_copy__',
                F.explode(F.array([F.lit(i) for i in range(ratio)]))
            ).drop('__balance_copy__')
        sample = subset.orderBy(F.rand(13)).limit(size)
        balanced_df = sample if balanced_df is None else balanced_df.union(sample)
    return balanced_df

def print_evaluation(model, train, test, target, transformer, evaluate_function=evaluate_regression_model):
    """Print evaluation metrics of `model` on the training and testing data.

    Each dataset is passed through `transformer` and then `model`, and the
    resulting predictions are handed to `evaluate_function` together with the
    name of the label column (`target`). Nothing is returned.
    """
    datasets = (
        ('Training data evaluation metrics:', train),
        ('Testing data evaluation metrics:', test),
    )
    for heading, dataset in datasets:
        print(heading)
        predictions = model.transform(transformer.transform(dataset))
        evaluate_function(predictions, target)

def print_feature_importance(model, train, rounding=6):
    """Show the model's feature importances, most important first.

    Feature names and vector indices are read from the ML attribute metadata
    of the `features` column of `train`; importances come from
    `model.featureImportances` and are rounded to `rounding` decimals.
    """
    attribute_groups = train.schema["features"].metadata['ml_attr']['attrs']
    rows = []
    for group in attribute_groups.values():
        for attribute in group:
            importance = model.featureImportances[attribute['idx']]
            rows.append((attribute['name'], float(round(importance, rounding))))
    table = spark.createDataFrame(rows, ['feature', 'importance'])
    table.orderBy('importance', ascending=False).show()

### Compare Algorithms - Sales Model

In [None]:
# Random 70/30 train/test split with a fixed seed.
train, test = integrated_sales.randomSplit([0.7, 0.3], seed=13)

In [None]:
# Compare the SalesQuantity distribution of the training data before and
# after balancing. Only the plotted column is selected before toPandas(), so
# the other columns are not collected to the driver just to draw a histogram.
plt.hist(train.select('SalesQuantity').toPandas()['SalesQuantity'], bins=[i+0.5 for i in range(0, 9)], rwidth=0.5, alpha=0.5, label='train')
balanced_train = balance(train, 'SalesQuantity')
plt.hist(balanced_train.select('SalesQuantity').toPandas()['SalesQuantity'], bins=[i+0.5 for i in range(0, 9)], rwidth=0.5, alpha=0.5, label='balanced')
plt.legend(loc='upper right')
plt.show()

In [None]:
# From here on the models are trained on the class-balanced training set.
train = balanced_train

In [None]:
# Feature pipeline: index every string-typed column of integrated_sales
# (output suffix `_index`), then assemble the five listed columns into a
# single `features` vector.
# NOTE: the pipeline is fitted on the full dataset, not only on `train`.
transformation_pipeline = Pipeline(stages=get_string_indexers(integrated_sales) + [
    VectorAssembler(inputCols=[
        'SalesPrice', 'DayOfMonth', 'PurchasePrice',
        'VendorNumber_index', 'DayOfWeek_index'], 
                    outputCol='features')
])
transformer = transformation_pipeline.fit(integrated_sales)

In [None]:
# Decision Tree
# SalesQuantity is used as the class label. Despite its name, `dtr` holds a
# DecisionTreeClassifier, not a regressor.
dtr = DecisionTreeClassifier(maxDepth=10, labelCol='SalesQuantity', featuresCol='features', maxBins=256)
model = dtr.fit(transformer.transform(train))
print_evaluation(model, train, test, 'SalesQuantity', transformer, evaluate_function=evaluate_classification_model)

In [None]:
# Random Forest
# Same label, features and hyper-parameters as the decision tree cell.
# Despite its name, `rfr` holds a RandomForestClassifier.
rfr = RandomForestClassifier(maxDepth=10, labelCol='SalesQuantity', featuresCol='features', maxBins=256)
model = rfr.fit(transformer.transform(train))
print_evaluation(model, train, test, 'SalesQuantity', transformer, evaluate_function=evaluate_classification_model)

In [None]:
# Naive Bayes
# Trained with default hyper-parameters on the same label and feature vector.
nb = NaiveBayes(labelCol='SalesQuantity', featuresCol='features')
model = nb.fit(transformer.transform(train))
print_evaluation(model, train, test, 'SalesQuantity', transformer, evaluate_function=evaluate_classification_model)

In [None]:
# Same feature pipeline as before, but using the `_log10` variants of the two
# price columns. This rebinds `transformation_pipeline` and `transformer`.
# NOTE(review): presumably SalesPrice_log10 / PurchasePrice_log10 already
# exist in the loaded dataset -- they are not created in this notebook; verify.
transformation_pipeline = Pipeline(stages=get_string_indexers(integrated_sales) + [
    VectorAssembler(inputCols=[
        'SalesPrice_log10', 'DayOfMonth', 'PurchasePrice_log10',
        'VendorNumber_index', 'DayOfWeek_index'], 
                    outputCol='features')
])
transformer = transformation_pipeline.fit(integrated_sales)

In [None]:
# Decision Tree
# Re-run with the log10-price feature vector produced by the current
# `transformer`. Despite its name, `dtr` holds a DecisionTreeClassifier.
dtr = DecisionTreeClassifier(maxDepth=10, labelCol='SalesQuantity', featuresCol='features', maxBins=256)
model = dtr.fit(transformer.transform(train))
print_evaluation(model, train, test, 'SalesQuantity', transformer, evaluate_function=evaluate_classification_model)

In [None]:
# Random Forest
# Re-run with the log10-price feature vector produced by the current
# `transformer`. Despite its name, `rfr` holds a RandomForestClassifier.
rfr = RandomForestClassifier(maxDepth=10, labelCol='SalesQuantity', featuresCol='features', maxBins=256)
model = rfr.fit(transformer.transform(train))
print_evaluation(model, train, test, 'SalesQuantity', transformer, evaluate_function=evaluate_classification_model)

### Data Mining - Sales Model

In [None]:
# Random Forest
# Set-up for the final sales model: rebuild the feature pipeline with the raw
# (non-log) price columns and fit it on the full dataset.
transformation_pipeline = Pipeline(stages=get_string_indexers(integrated_sales) + [
    VectorAssembler(inputCols=[
        'SalesPrice', 'DayOfMonth', 'PurchasePrice',
        'VendorNumber_index', 'DayOfWeek_index'], 
                    outputCol='features')
])
transformer = transformation_pipeline.fit(integrated_sales)

# Test fraction is 1 / (sqrt(5) + 1), roughly 0.309 of the data.
test_data_size = 1 / (math.sqrt(5) + 1)
print(f'Test data size is {test_data_size}')
train, test = integrated_sales.randomSplit([1-test_data_size, test_data_size], seed=13)
# Only the training split is balanced; the test split keeps its original
# class distribution.
train = balance(train, 'SalesQuantity')

In [None]:
# Plot the SalesQuantity distribution of the balanced training set. Only the
# plotted column is selected before toPandas(), so the other columns are not
# collected to the driver just to draw a histogram.
plt.hist(train.select('SalesQuantity').toPandas()['SalesQuantity'], bins=[i+0.5 for i in range(0, 9)], rwidth=0.5, alpha=0.5, label='balanced')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Metrics already computed by try_depth, keyed by maxDepth.
cache = {}
def try_depth(max_depth):
    """Train a random forest with `max_depth` and return its test metrics.

    The metrics dict produced by evaluate_classification_model is stored in
    the module-level `cache`, so each depth is trained at most once; repeated
    calls return the stored dict.
    """
    cached_metrics = cache.get(max_depth)
    if cached_metrics is not None:
        return cached_metrics

    print('===============================================')
    print(f'Trying maxDepth: {max_depth}')
    classifier = RandomForestClassifier(
        maxDepth=max_depth, labelCol='SalesQuantity', featuresCol='features', maxBins=256)
    fitted = classifier.fit(transformer.transform(train))
    predictions = fitted.transform(transformer.transform(test))
    metrics = evaluate_classification_model(predictions, 'SalesQuantity')
    cache[max_depth] = metrics
    return metrics

In [None]:
def find_best_max_depth(left, right):
    """Narrow the interval [left, right] down to a single maxDepth.

    At every step the test accuracy at both ends of the interval is obtained
    from try_depth and the half next to the worse end is discarded. On a tie
    the lower half is kept, so the smaller depth wins. This is a heuristic:
    interior depths are only evaluated once they become an end of the
    interval.

    Args:
        left: smallest maxDepth to consider.
        right: largest maxDepth to consider; must not be less than `left`.

    Returns:
        The depth the interval collapses to.

    Raises:
        ValueError: if `left` is greater than `right`. Such an interval never
            converges and previously recursed until the interpreter's
            recursion limit was hit.
    """
    if left > right:
        raise ValueError(f'left ({left}) must not be greater than right ({right})')
    while left != right:
        left_metric, right_metric = try_depth(left), try_depth(right)
        if left_metric['accuracy'] < right_metric['accuracy']:
            # Right end is better: keep the upper half (ceil guarantees progress).
            left = math.ceil((left + right) / 2)
        else:
            # Left end is at least as good: keep the lower half.
            right = math.floor((left + right) / 2)
    return left

# Search maxDepth values between 10 and 20 (inclusive); as the last expression
# of the cell, the selected depth is displayed as the cell output.
find_best_max_depth(10, 20)