In [None]:
# Stop a SparkSession left over from a previous run of this notebook, if there is one.
# On a fresh kernel `spark` is not defined yet (it is created in a later cell), so the
# NameError is tolerated to keep "Restart Kernel -> Run All" working.
try:
    spark.stop()
except NameError:
    pass

### Workflow:
---

A. Convert from json to csv using spark

B. Process the CSV data for training
    1. load the data from the CSV-like files
    2. create a new negative-records table (each user paired with a false item record and its category)
    3. join the category by item_id to build the positive-records table
    4. union the negative table and the positive table into one, adding a `positive` label column to tell them apart
    5. aggregate each user's history items into two new columns: items history and category history
    6. save the last two records (one positive / one negative) of each user to local_test; the remaining records are saved to local_train
    7. randomly split local_test 1:9 into local_test_splitByUser and local_train_splitByUser
    8. build 3 dictionaries (mid_voc, uid_voc and cat_voc) from local_train_splitByUser using an aggregated count and sort
    9. create a new dictionary map for mid and cat, and create a mid list according to reviews_info
    10. save all files: mid_voc, uid_voc, cat_voc, local_train_splitByUser and local_test_splitByUser

### Convert Json to CSV
#### This may take about 2 minutes; skip this step if you only want to check the data processing
#### Note: the original JSON file seems to contain some errors; when it is converted with PySpark, some rows come back with null fields

In [None]:
# Convert Json to CSV

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
from time import time
from timeit import default_timer as timer
import random
import os.path
import pickle
import pandas

# Directory that holds the raw JSON inputs and receives every generated output.
# It can be overridden with the DIEN_DATA_DIR environment variable instead of editing
# this cell; the default is the original machine-specific location.
# Joining with "" guarantees a trailing separator, which the `fpath + name`
# concatenations in this cell rely on.
dir_path = os.path.join(
    os.environ.get("DIEN_DATA_DIR")
    or "/mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/pyspark_output/",
    "")
# The same directory written as a `file://` URI; this is the prefix handed to the Spark
# readers and writers below.
local_prefix = "file://" + dir_path

def load_json():
    """Read the Books metadata and reviews JSON files and keep only the needed columns.

    Uses the module-level `spark` session and `local_prefix`.

    Returns:
        (reviews, items): `reviews` has the columns reviewerID, asin, overall and
        unixReviewTime; `items` has asin plus `categories`, which is the last element
        of the first entry of the original `categories` array.
    """
    raw_items = spark.read.json(local_prefix + 'meta_Books.json')
    raw_reviews = spark.read.json(local_prefix + 'reviews_Books.json')

    # Last element of the first category path, exposed under the name `categories`.
    leaf_category = expr("categories[0][size(categories[0]) - 1] as categories")

    items = raw_items.select('asin', leaf_category)
    reviews = raw_reviews.select('reviewerID', 'asin', 'overall', 'unixReviewTime')
    return reviews, items

def list_dir(path):
    """Map every entry directly under `path` to the file that holds its data.

    Args:
        path: directory to scan (one level deep).

    Returns:
        dict of entry name -> path. For a sub-directory the value is the path of a
        contained file whose name ends in ``parquet`` or ``csv`` (the alphabetically
        last one if there are several); a sub-directory without such a file is left
        out. An entry that cannot be listed as a directory (e.g. a plain file) maps
        to its own path.

    Raises:
        OSError: if `path` itself cannot be listed.
    """
    source_path_dict = {}
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        try:
            # Sorted so the chosen file does not depend on filesystem listing order.
            children = sorted(os.listdir(entry_path))
        except OSError:
            # Not a directory (or not readable): the entry stands for itself.
            # Only OSError is caught so unrelated errors and Ctrl-C are not swallowed.
            source_path_dict[entry] = entry_path
            continue
        for child in children:
            if child.endswith(('parquet', 'csv')):
                source_path_dict[entry] = os.path.join(entry_path, child)
    return source_path_dict

def result_rename_or_convert():
    """Move the data file of each `*-spark` output folder to a flat file in `dir_path`.

    `list_dir` locates the csv/parquet file inside `reviews-info-spark` and
    `item-info-spark`; each is renamed to `dir_path + 'reviews-info'` and
    `dir_path + 'item-info'` respectively.

    The operation stays best-effort (a missing folder or a failed rename does not
    raise), but the failure is reported instead of being silently discarded, and
    each rename is attempted independently of the other.
    """
    fpath = dir_path
    source_path_dict = list_dir(fpath)
    fix = "-spark"
    for name in ("reviews-info", "item-info"):
        try:
            os.rename(source_path_dict[name + fix], fpath + name)
        except (KeyError, OSError) as err:
            # KeyError: no csv/parquet file was found for this output folder.
            # OSError: the rename itself failed.
            print("WARNING: could not move %s%s to %s%s: %r" % (name, fix, fpath, name, err))


# ---- 1. start (or attach to) the Spark session --------------------------------------
t0 = timer()
spark = (
    SparkSession.builder
    .master('yarn')
    .appName("DIEN_DATA_PROCESS")
    .getOrCreate()
)

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# ---- 2. load the JSON files ------------------------------------------------------------
t1 = timer()
reviews_info_df, item_info_df = load_json()

# ---- 3. write each table as a single tab-separated CSV part file --------------------
t2 = timer()
(reviews_info_df
    .repartition(1)
    .write
    .option("sep", '\t')
    .format('csv')
    .mode('overwrite')
    .save(local_prefix + 'reviews-info-spark'))
(item_info_df
    .repartition(1)
    .write
    .option("sep", '\t')
    .format('csv')
    .mode('overwrite')
    .save(local_prefix + 'item-info-spark'))

# ---- 4. move the part files to their final names ------------------------------------
t3 = timer()
result_rename_or_convert()
t4 = timer()

# Row counts are taken after t4, so they are not included in the timings below.
# should be 2370585
print("Total length of item-info is ", item_info_df.count())
# should be 22507155
print("Total length of reviews-info is ", reviews_info_df.count())

# ---- 5. timing report ------------------------------------------------------------------
print("\n==================== Convert Time =======================\n")
print("Total process took %.3f secs" % (t4 - t0))
print("Details:")
print("start spark took %.3f secs" % (t1 - t0))
print("load from json took %.3f secs" % (t2 - t1))
print("write as csv took %.3f secs" % (t3 - t2))
print("rename csv files took %.3f secs" % (t4 - t3))
print("\n==========================================================")

In [None]:
# Sanity check on the raw metadata: print how many records have a null `asin`.
# Note that this rebinds `item_info_df` to the unprojected JSON frame.
item_info_df = spark.read.json(local_prefix + 'meta_Books.json')
print(
    item_info_df
    .filter(col('asin').isNull())
    .count()
)

### DIEN data process

In [None]:
# Data Process

import findspark
findspark.init()
 
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
from time import time
import random
import os.path
import pickle
import pandas

# Directory that holds the converted CSV inputs and receives every generated output.
# It can be overridden with the DIEN_DATA_DIR environment variable instead of editing
# this cell; the default is the original machine-specific location. Joining with ""
# normalises the value so that it always ends with a path separator.
dir_path = os.path.join(
    os.environ.get("DIEN_DATA_DIR")
    or "/mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/pyspark_output/",
    "")
# `file://` URI prefixes handed to Spark: `local_prefix_src` is used when reading the
# input tables, `local_prefix` when writing (and re-reading) intermediate and final data.
local_prefix_src = "file://" + dir_path
local_prefix = "file://" + dir_path

def load_csv():
    """Load the tab-separated `reviews-info` and `item-info` tables with explicit schemas.

    Uses the module-level `spark` session and `local_prefix_src`.

    Returns:
        (reviews_info_df, item_info_df) where
        reviews_info_df columns: review_id (string), movie_id (string),
        overall (float), unix_review_time (int);
        item_info_df columns: movie_id (string), category (string).
    """
    # `movie_id` is shared by both schemas.
    movie_id_field = StructField('movie_id', StringType())

    reviews_info_schema = StructType([
        StructField('review_id', StringType()),
        movie_id_field,
        StructField('overall', FloatType()),
        StructField('unix_review_time', IntegerType()),
    ])
    item_info_schema = StructType([
        movie_id_field,
        StructField('category', StringType()),
    ])

    reviews_info_df = (
        spark.read
        .schema(reviews_info_schema)
        .option('sep', '\t')
        .csv(local_prefix_src + "/reviews-info")
    )
    item_info_df = (
        spark.read
        .schema(item_info_schema)
        .option('sep', '\t')
        .csv(local_prefix_src + "/item-info")
    )
    return reviews_info_df, item_info_df

def list_dir(path):
    """Map every entry directly under `path` to the file that holds its data.

    Args:
        path: directory to scan (one level deep).

    Returns:
        dict of entry name -> path. For a sub-directory the value is the path of a
        contained file whose name ends in ``parquet`` or ``csv`` (the alphabetically
        last one if there are several); a sub-directory without such a file is left
        out. An entry that cannot be listed as a directory (e.g. a plain file) maps
        to its own path.

    Raises:
        OSError: if `path` itself cannot be listed.
    """
    source_path_dict = {}
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        try:
            # Sorted so the chosen file does not depend on filesystem listing order.
            children = sorted(os.listdir(entry_path))
        except OSError:
            # Not a directory (or not readable): the entry stands for itself.
            # Only OSError is caught so unrelated errors and Ctrl-C are not swallowed.
            source_path_dict[entry] = entry_path
            continue
        for child in children:
            if child.endswith(('parquet', 'csv')):
                source_path_dict[entry] = os.path.join(entry_path, child)
    return source_path_dict

def to_pydict(df):
    """Turn a two-column pandas DataFrame into a plain dict.

    Args:
        df: pandas DataFrame.

    Returns:
        dict mapping each value of the first column to the value of the second
        column in the same row; an empty dict when `df` does not have exactly
        two columns.
    """
    if df.shape[1] != 2:
        return {}
    key_column, value_column = df.keys()
    return dict(zip(df[key_column].to_list(), df[value_column].to_list()))
        
def result_rename_or_convert():
    """Flatten the Spark output folders in `dir_path` into the final artefacts.

    * the data files found inside `local_test_splitByUser_spark` and
      `local_train_splitByUser_spark` are renamed to `local_test_splitByUser` and
      `local_train_splitByUser`;
    * the `uid_voc_spark`, `mid_voc_spark` and `cat_voc_spark` parquet files are
      read with pandas, converted to dicts via `to_pydict` and pickled
      (protocol 0) as `uid_voc.pkl`, `mid_voc.pkl` and `cat_voc.pkl`.

    Raises:
        KeyError: if `list_dir` found no data file for one of the expected folders.
        OSError: if a rename or a file write fails.
    """
    fpath = dir_path
    source_path_dict = list_dir(fpath)
    fix = "_spark"
    os.rename(source_path_dict["local_test_splitByUser" + fix], fpath + '/local_test_splitByUser')
    os.rename(source_path_dict["local_train_splitByUser" + fix], fpath + '/local_train_splitByUser')

    # Read all three vocabularies first, so nothing is written if one of them is missing.
    vocabularies = [
        (voc_name, to_pydict(pandas.read_parquet(source_path_dict[voc_name + fix])))
        for voc_name in ("uid_voc", "mid_voc", "cat_voc")
    ]

    for voc_name, voc in vocabularies:
        # `with` closes (and flushes) each pickle file instead of leaking the handle.
        with open(fpath + '/' + voc_name + '.pkl', "wb") as pkl_file:
            pickle.dump(voc, pkl_file, protocol=0)

class DataProcessor:
    """Builds the DIEN train/test samples and the uid/mid/cat vocabularies with Spark.

    All DataFrames are constructed in `__init__` (which already writes and re-reads
    several intermediate parquet datasets under the module-level `local_prefix`);
    `process` then writes the final vocabularies and sample files.

    Column names follow the schema of the input tables: `review_id` identifies the
    reviewer (user) and `movie_id` identifies the item.

    Attributes set by `__init__`:
        reviews_info_df, item_info_df: the inputs, unchanged.
        meta_map_df: one (movie_id, category) row per item.
        test_df, train_df: labelled samples with history columns and `row_number`.
        uid_dict_df, mid_dict_df, cat_dict_df: vocabularies (value -> index).
    """

    def rand_ordinal_n(self, df, n, name = 'ordinal'):
        """Return `df` with an extra integer column `name` drawn as int(rand() * n)."""
        return df.withColumn(name, (rand() * n).cast("int"))

    def _flatten_history(self, df):
        """Project `df` to the output column order, joining both history arrays into
        strings separated by the 0x02 control character."""
        return df.select('positive',
                         'review_id',
                         'movie_id',
                         'category',
                         expr("concat_ws('\x02', concated_movie_id)"),
                         expr("concat_ws('\x02', concated_category)"),
                         'row_number')

    def process(self):
        """Write the three vocabularies (parquet) and the test/train samples (tab-separated
        CSV) as single-part `*_spark` folders under `local_prefix`.

        Side effect: `self.test_df` and `self.train_df` are replaced by their flattened
        (string history) projections.
        """
        self.uid_dict_df.repartition(1).write.format('parquet').mode('overwrite').save(local_prefix + '/uid_voc_spark')
        self.mid_dict_df.repartition(1).write.format('parquet').mode('overwrite').save(local_prefix + '/mid_voc_spark')
        self.cat_dict_df.repartition(1).write.format('parquet').mode('overwrite').save(local_prefix + '/cat_voc_spark')

        # we need to concat two history array columns to string
        self.test_df = self._flatten_history(self.test_df)
        (self.test_df.repartition(1)
             .write.option("sep", '\t').format('csv').mode('overwrite')
             .save(local_prefix + '/local_test_splitByUser_spark'))

        self.train_df = self._flatten_history(self.train_df)
        (self.train_df.repartition(1)
             .write.option("sep", '\t').format('csv').mode('overwrite')
             .save(local_prefix + '/local_train_splitByUser_spark'))

    def shuffle_data_by_user(self, df):
        """Add a `row_number` column numbering each user's rows, ordered by `positive`
        (so the negative sample, positive == 0, gets the lower number)."""
        window_spec = Window.partitionBy(df.review_id).orderBy(df.positive)
        return df.withColumn("row_number", row_number().over(window_spec))

    def __init__(self, spark, reviews_info_df, item_info_df):
        """Build every DataFrame that `process` later writes.

        Args:
            spark: active SparkSession.
            reviews_info_df: columns review_id, movie_id, overall, unix_review_time.
            item_info_df: columns movie_id, category.

        Side effects: writes (overwrite mode) and re-reads the parquet datasets
        `meta_map`, `aggregated_records` and `local_test` under `local_prefix`, and
        collects the distinct reviewed item ids to the driver.
        """
        self.reviews_info_df = reviews_info_df
        self.item_info_df = item_info_df

        # same as meta_map in process_data.py: one category per item.
        # Written to parquet and read back, so later joins start from the stored result.
        self.meta_map_df = (self.item_info_df
                            .groupby("movie_id")
                            .agg(first("category").alias("category")))
        self.meta_map_df.write.format('parquet').mode('overwrite').save(local_prefix + '/meta_map')
        self.meta_map_df = spark.read.parquet(local_prefix + '/meta_map')

        # same as item_list in process_data.py: the distinct reviewed item ids
        item_list_df = (self.reviews_info_df
                        .groupby("movie_id")
                        .count()
                        .drop('count'))

        # same as user_map in process_data.py
        user_map_df = self.reviews_info_df

        # The id list is collected to the driver and broadcast so the UDF can sample from it.
        asin_list = [row['movie_id'] for row in item_list_df.select('movie_id').collect()]
        asin_len = len(asin_list)
        broadcast_movie_id_list = spark.sparkContext.broadcast(asin_list)

        def get_random_id(asin_total_len, asin):
            """Return a uniformly sampled, non-null item id that differs from `asin`."""
            item_list = broadcast_movie_id_list.value
            while True:
                asin_neg = item_list[random.randint(0, asin_total_len - 1)]
                if asin_neg is not None and asin_neg != asin:
                    return asin_neg

        # The UDF draws random numbers, so it must be declared non-deterministic: Spark is
        # otherwise free to evaluate a UDF more (or fewer) times than it appears in the query.
        get_random_id_udf = udf(get_random_id, StringType()).asNondeterministic()

        ## manual join ##
        # same as line 66-75 in process_data.py
        # The sampled id itself (`rand_false_mid`) is kept as `movie_id`. Selecting the
        # join-side `meta_map_df.movie_id` instead would yield NULL for every sampled item
        # that has no metadata row, although those rows are deliberately kept and given
        # the category "default_cat".
        negative_df = (user_map_df
                       .withColumn('positive', lit(0))
                       .withColumn('rand_false_mid', get_random_id_udf(lit(asin_len), "movie_id"))
                       .join(self.meta_map_df, col("rand_false_mid") == self.meta_map_df.movie_id, 'left_outer')
                       .withColumn('category', when(col('category').isNotNull(), col('category')).otherwise("default_cat"))
                       .select("positive", "review_id", col("rand_false_mid").alias("movie_id"),
                               "overall", "unix_review_time", "category"))

        positive_df = (user_map_df
                       .withColumn('positive', lit(1))
                       .join(self.meta_map_df, 'movie_id', 'left_outer')
                       .withColumn('category', when(col('category').isNotNull(), col('category')).otherwise("default_cat"))
                       .select("positive", "review_id", "movie_id", "overall", "unix_review_time", "category"))

        ## split_test ##
        # last two records of one user(one positive one negative) set tag 20190119
        # other previous records of one user set tag 20180118

        ## local_aggregator
        # all tag with 20190119 will be wrote to local_test
        # Per user: the last item/category becomes the target, everything before it the
        # history; users left with an empty history are dropped.
        # NOTE(review): last()/collect_list() after groupby follow the incoming row order;
        # this relies on the preceding window keeping rows sorted by unix_review_time
        # within each user - confirm that this ordering is guaranteed.
        user_window = Window.partitionBy('review_id').orderBy('unix_review_time')
        last_positive_with_concat_df = (positive_df
                                        .withColumn('uid', row_number().over(user_window))
                                        .groupby('review_id')
                                        .agg(last('movie_id').alias('movie_id'),
                                             last('category').alias('category'),
                                             collect_list('movie_id').alias("concated_movie_id"),
                                             collect_list('category').alias("concated_category"),
                                             count("*").alias("numItemsByUser"),
                                             last('uid').alias('uid'))
                                        .withColumn("concated_movie_id", expr("slice(concated_movie_id, 1, numItemsByUser - 1)"))
                                        .withColumn("concated_category", expr("slice(concated_category, 1, numItemsByUser - 1)"))
                                        .withColumn("numItemsByUser", expr("numItemsByUser - 1"))
                                        .filter(col("numItemsByUser") > 0))

        ## save data ############################################################################################
        last_positive_with_concat_df.write.format('parquet').mode('overwrite').save(local_prefix + '/aggregated_records')
        reload_last_positive_with_concat_df = spark.read.parquet(local_prefix + '/aggregated_records')

        # by saving to local, we can ensure negative record and positive record will have same history sequence
        #########################################################################################################

        # One negative sample per user, attached to that user's (positive) history.
        last_negative_record_of_user_df = (negative_df
                                           .groupby('review_id')
                                           .agg(last('positive').alias('positive'),
                                                last('movie_id').alias('movie_id'),
                                                last('category').alias('category')))
        last_negative_record_of_user_df = (reload_last_positive_with_concat_df
                                           .join(last_negative_record_of_user_df, 'review_id', 'inner')
                                           .select('review_id',
                                                   'positive',
                                                   last_negative_record_of_user_df.movie_id.alias('movie_id'),
                                                   last_negative_record_of_user_df.category.alias('category'),
                                                   'concated_movie_id',
                                                   'concated_category',
                                                   'numItemsByUser'))
        # Same column order as the negative frame, because union() matches by position.
        last_positive_record_of_user_df = (reload_last_positive_with_concat_df
                                           .select('review_id',
                                                   lit(1).alias('positive'),
                                                   'movie_id',
                                                   'category',
                                                   'concated_movie_id',
                                                   'concated_category',
                                                   'numItemsByUser'))
        union_records_df = last_negative_record_of_user_df.union(last_positive_record_of_user_df)

        ## all local_test will be split with random 1:9 to local_train_splitByUser and local_test_splitByUser
        # One random ordinal in 0..9 per user, so both samples of a user land in the same split.
        reviews_groupby_user_df = self.rand_ordinal_n(reload_last_positive_with_concat_df, 10).select('review_id', 'ordinal')
        union_concated_df = (reviews_groupby_user_df
                             .join(union_records_df, 'review_id', 'inner')
                             .select('positive',
                                     'review_id',
                                     'movie_id',
                                     'category',
                                     'concated_movie_id',
                                     'concated_category',
                                     'numItemsByUser',
                                     'ordinal'))
        ## save data ############################################################################################
        # Persisted so the random ordinals are frozen before the split below.
        union_concated_df.write.format('parquet').mode('overwrite').save(local_prefix + '/local_test')
        #########################################################################################################

        ## end if local_test exists ################################################################################
        reload_union_concated_df = spark.read.parquet(local_prefix + '/local_test')

        ## split aggregated_labled_df by 1:9
        # ordinal == 2 (one of the ten possible values) goes to test, the other nine to train.
        self.test_df = reload_union_concated_df.filter(col("ordinal") == 2).drop("ordinal")
        self.train_df = reload_union_concated_df.filter(col("ordinal") != 2).drop("ordinal")

        ## use window function to make sure same user records stay together
        self.test_df = self.shuffle_data_by_user(self.test_df)
        self.train_df = self.shuffle_data_by_user(self.train_df)

        ## build uid_dict, mid_dict and cat_dict
        # Each vocabulary is a fixed index-0 row followed by the values seen in train_df,
        # numbered from 1 by descending occurrence count.
        columns = ['review_id', 'uid']
        zero = spark.createDataFrame([("A1Y6U82N6TYZPI",0)], columns)

        self.uid_dict_df = zero.union(self.train_df
                                      .groupBy('review_id')
                                      .count()
                                      .withColumn('uid', row_number().over(Window.orderBy(desc('count'))))
                                      .drop("count"))

        # NOTE(review): array_union() removes duplicate elements, so an item/category that
        # occurs several times in one sample's history is counted once for that sample -
        # confirm this matches the intended vocabulary ordering.
        columns = ['movie_id', 'mid']
        zero = spark.createDataFrame([("default_mid",0)], columns)
        self.mid_dict_df = zero.union(self.train_df
                                      .withColumn("concated_movie_id", array_union(col("concated_movie_id"), array(col("movie_id"))))
                                      .select(explode(col("concated_movie_id")).alias("movie_id"))
                                      .groupBy('movie_id')
                                      .count()
                                      .filter(col('movie_id') != "default_mid")
                                      .withColumn('mid', row_number().over(Window.orderBy(desc('count'))))
                                      .drop("count"))

        columns = ['category', 'cat']
        zero = spark.createDataFrame([("default_cat",0)], columns)
        self.cat_dict_df = zero.union(self.train_df
                                      .withColumn("concated_category", array_union(col("concated_category"), array(col("category"))))
                                      .select(explode(col("concated_category")).alias("category"))
                                      .groupBy('category')
                                      .count()
                                      .filter(col('category') != "default_cat")
                                      .withColumn('cat', row_number().over(Window.orderBy(desc('count'))))
                                      .drop("count"))

In [None]:
import findspark
findspark.init()
 
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
from timeit import default_timer as timer

# ---- 1. start (or attach to) the Spark session --------------------------------------
t0 = timer()
spark = (
    SparkSession.builder
    .master('yarn')
    .appName("DIEN_DATA_PREPARE")
    .getOrCreate()
)

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# ---- 2. load reviews_info / item_info and build all DataFrames ----------------------
t1 = timer()
reviews_info_df, item_info_df = load_csv()
data_processor = DataProcessor(spark, reviews_info_df, item_info_df)

# ---- 3. write vocabularies and train/test samples -----------------------------------
t2 = timer()
data_processor.process()

# ---- 4. rename the part files and pickle the vocabularies ---------------------------
t3 = timer()
result_rename_or_convert()
t4 = timer()

# ---- 5. timing report ------------------------------------------------------------------
print("\n==================== Process Time =======================\n")
print("Total process took %.3f secs" % (t4 - t0))
print("Details:")
print("start spark took %.3f secs" % (t1 - t0))
print("pre process took %.3f secs" % (t2 - t1))
print("process and save took %.3f secs" % (t3 - t2))
print("rename took %.3f secs" % (t4 - t3))
print("\n==========================================================")

==================== Process Time =======================

Total process took 119.847 secs

Details:

start spark took 29.739 secs

pre process took 55.900 secs

process and save took 26.244 secs

rename took 7.964 secs

==========================================================

### Option 2: convert using the original plain-Python script (no need to run)

In [None]:
# Load original json as CSV
import sys
import random
import time
from time import time

def process_meta(file):
    """Write one `asin<TAB>category` line per record of `file` to ./item-info.

    Args:
        file: path of the metadata dump; every line must be a Python dict literal
            with an "asin" key and a non-empty "categories" list of lists.

    The category written is the last element of the first `categories` entry.
    The output file `item-info` is created in the current working directory.
    """
    # SECURITY: eval() executes whatever expression each line contains. Only run this
    # on a trusted dump. NOTE(review): ast.literal_eval may be a safe drop-in if every
    # line is a plain literal - confirm against the data before switching.
    # `with` closes both files even if a malformed line raises.
    with open(file, "r") as fi, open("item-info", "w") as fo:
        for line in fi:
            obj = eval(line)
            cat = obj["categories"][0][-1]
            print(obj["asin"] + "\t" + cat, file=fo)

def process_reviews(file):
    """Write one `reviewerID<TAB>asin<TAB>overall<TAB>unixReviewTime` line per record
    of `file` to ./reviews-info.

    Args:
        file: path of the reviews dump; every line must be a Python dict literal with
            the keys "reviewerID", "asin", "overall" and "unixReviewTime".

    The output file `reviews-info` is created in the current working directory.
    """
    # SECURITY: eval() executes whatever expression each line contains. Only run this
    # on a trusted dump. NOTE(review): ast.literal_eval may be a safe drop-in if every
    # line is a plain literal - confirm against the data before switching.
    # `with` closes both files even if a malformed line raises.
    with open(file, "r") as fi, open("reviews-info", "w") as fo:
        for line in fi:
            obj = eval(line)
            userID = obj["reviewerID"]
            itemID = obj["asin"]
            rating = obj["overall"]
            # Named `review_time` so the imported `time` is not shadowed.
            review_time = obj["unixReviewTime"]
            print(userID + "\t" + itemID + "\t" + str(rating) + "\t" + str(review_time), file=fo)

# `timer` is imported here so this cell does not depend on an earlier cell having been run.
from timeit import default_timer as timer

# NOTE: this rebinds the global `local_prefix` to a plain filesystem path (no `file://`
# scheme) one directory above `pyspark_output/`. The Spark cells read `local_prefix` at
# call time, so re-run their configuration cell before using them again after this cell.
local_prefix = "/mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/"
t0 = timer()
# Both calls write their output (item-info, reviews-info) to the current working directory.
process_meta(local_prefix + 'meta_Books.json')
process_reviews(local_prefix + 'reviews_Books.json')
t1 = timer()

print("Convert initial csv from json took %.3f secs" % (t1 - t0))

In [None]:
# Shut down the SparkSession bound to `spark` now that all processing cells have run.
spark.stop()