In [None]:
# Stop the Spark session held in the notebook-level name `spark`.
# NOTE(review): nothing above this cell defines `spark`, so on a fresh kernel this
# presumably raises NameError — confirm it is only meant to be run after a session exists.
spark.stop()

### Workflow:
---

A. Convert from json to csv using spark

B. process csv data for train
    1. load data from csv-alike files
    2. create a new negative records table (user with false item_record and category) 
    3. join category by item_id as a positive records table
    4. union the negative table and the positive table into one; a `positive` label column indicates which kind each record is.
    5. aggregate each user's history items into two new columns: items_history and category_history
    6. save the last two records (one positive / one negative) of each user to local_test; the remaining records are saved to local_train
    7. split local_test with random 1:9 into local_test_splitByUser and local_train_splitByUser
    8. build 3 dictionaries: mid_voc, uid_voc and cat_voc from local_train_splitByUser using aggregate_count and sort
    9. create a new dictionary map for mid and cat, create a mid list according to reviews_info
    10. save all files: mid_voc, uid_voc, cat_voc, local_train_splitByUser and local_test_splitByUser

### Convert Json to CSV
#### This may take about 2 minutes; skip it if you only want to check the data processing step

In [2]:
# Convert Json to CSV

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
from time import time
from timeit import default_timer as timer
import random
import os.path

# Data directory, given as a file:// URI with a trailing slash. load_json() reads
# meta_Books.json / reviews_Books.json from it and the CSV output directories
# (reviews-info-spark, item-info-spark) are written under it.
# NOTE(review): this is an absolute, machine-specific path — adjust before running elsewhere.
local_prefix = "file:///mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/"

def load_json():
    """Read the raw Books JSON dumps and keep only the columns used downstream.

    Reads ``meta_Books.json`` and ``reviews_Books.json`` from the notebook-level
    ``local_prefix`` through the notebook-level ``spark`` session.

    Returns:
        A ``(reviews_info_df, item_info_df)`` tuple. The reviews frame keeps
        reviewerID, asin, overall and unixReviewTime; the item frame keeps asin
        and the last entry of the first ``categories`` list (column ``categories``).
    """
    raw_items = spark.read.json(local_prefix + 'meta_Books.json')
    raw_reviews = spark.read.json(local_prefix + 'reviews_Books.json')

    # Last element of the first category path, kept under the original column name.
    leaf_category = expr("categories[0][size(categories[0]) - 1] as categories")
    review_columns = ['reviewerID', 'asin', 'overall', 'unixReviewTime']

    return raw_reviews.select(*review_columns), raw_items.select('asin', leaf_category)

# Create (or reuse) the Spark session. Session start-up is deliberately not timed:
# the timer is started right before the JSON load below, so "Total process" equals
# load time + write time. (A timer() call that used to sit here was overwritten
# before ever being read.)
spark = SparkSession\
    .builder\
    .master('yarn')\
    .appName("DIEN_DATA_PROCESS") \
    .getOrCreate()

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

t0 = timer()
reviews_info_df, item_info_df = load_json()
t1 = timer()
reviews_info_df.write.format('csv').mode('overwrite').save(local_prefix + 'reviews-info-spark')
item_info_df.write.format('csv').mode('overwrite').save(local_prefix + 'item-info-spark')
t2 = timer()

# The DataFrames are not cached, so each count() below runs another job over the
# JSON input; these sanity checks happen after t2 and are not part of the timings.
# should be 2370585
print("Total length of item-info is ", item_info_df.count())
# should be 22507155
print("Total length of reviews-info is ", reviews_info_df.count())

print("\n==================== Convert Time =======================\n")
print("Total process took %.3f secs" % (t2 - t0))
print("Details:")
print("load from json took %.3f secs" % (t1 - t0))
print("write as csv took %.3f secs" % (t2 - t1))
print("\n==========================================================")

Total length of item-info is  2370585
Total length of reviews-info is  22507155


Total process took 109.148 secs
Details:
load from json took 101.544 secs
write as csv took 7.604 secs



### DIEN data process

In [15]:
# Data Process

import findspark
findspark.init()
 
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
from time import time
import random
import os.path

# Directory the loaders read from (reviews-info / item-info and their "-spark" variants).
local_prefix_src = "file:///mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data"
# Root under which DataProcessor writes everything it produces (<local_prefix>/dien/output/...).
# NOTE(review): both are absolute, machine-specific paths — adjust before running elsewhere.
local_prefix = "file:///mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/pyspark_output"

def load_csv():
    """Load the tab-separated ``reviews-info`` and ``item-info`` files.

    Both files are read from the notebook-level ``local_prefix_src`` with an
    explicit schema (no header, no inference).

    Returns:
        ``(reviews_info_df, item_info_df)`` with columns
        (review_id, movie_id, overall, unix_review_time) and (movie_id, category).
    """
    reviews_schema = StructType()\
        .add('review_id', StringType())\
        .add('movie_id', StringType())\
        .add('overall', FloatType())\
        .add('unix_review_time', IntegerType())
    items_schema = StructType()\
        .add('movie_id', StringType())\
        .add('category', StringType())

    def read_tsv(schema, relative_path):
        # Every input of this loader shares the same separator and source directory.
        return spark.read.schema(schema).option('sep', '\t').csv(local_prefix_src + relative_path)

    return read_tsv(reviews_schema, "/reviews-info"), read_tsv(items_schema, "/item-info")

def load_pyspark_processed_csv():
    """Load the comma-separated CSV directories written by the JSON-to-CSV cell.

    Reads ``reviews-info-spark`` and ``item-info-spark`` from the notebook-level
    ``local_prefix_src`` with an explicit schema.

    Returns:
        ``(reviews_info_df, item_info_df)`` with columns
        (review_id, movie_id, overall, unix_review_time) and (movie_id, category).
    """
    review_columns = [
        ('review_id', StringType()),
        ('movie_id', StringType()),
        ('overall', FloatType()),
        ('unix_review_time', IntegerType()),
    ]
    item_columns = [
        ('movie_id', StringType()),
        ('category', StringType()),
    ]
    reviews_info_schema = StructType([StructField(name, dtype) for name, dtype in review_columns])
    item_info_schema = StructType([StructField(name, dtype) for name, dtype in item_columns])

    comma_reader_reviews = spark.read.schema(reviews_info_schema).option('sep', ',')
    reviews_info_df = comma_reader_reviews.csv(local_prefix_src + "/reviews-info-spark")

    comma_reader_items = spark.read.schema(item_info_schema).option('sep', ',')
    item_info_df = comma_reader_items.csv(local_prefix_src + "/item-info-spark")

    return reviews_info_df, item_info_df


class DataProcessor:
    """Build the DIEN train/test splits and vocabularies from review and item tables.

    The constructor does the heavy lifting: it materialises intermediate tables as
    parquet under ``<local_prefix>/dien/output`` and leaves five DataFrames on the
    instance (``train_df``, ``test_df``, ``uid_dict_df``, ``mid_dict_df``,
    ``cat_dict_df``). ``process()`` then writes those five to disk.

    The class reads the notebook-level ``local_prefix`` for every output path and
    relies on the notebook's star import of ``pyspark.sql.functions``.
    """

    def rand_ordinal_n(self, df, n, name = 'ordinal'):
        """Return ``df`` with an extra int column ``name`` holding a random value in [0, n)."""
        return df.withColumn(name, (rand() * n).cast("int"))

    def _write_split(self, df, output_name):
        """Flatten the two history arrays of ``df`` and write it as tab-separated CSV.

        The arrays are joined with the 0x02 control character so that each history
        fits into a single CSV field. ``df`` itself is left untouched.
        """
        flattened_df = df.select('positive',
                                 'review_id',
                                 'movie_id',
                                 'category',
                                 expr("concat_ws('\x02', concated_movie_id)"),
                                 expr("concat_ws('\x02', concated_category)"),
                                 'row_number')
        flattened_df.repartition(1)\
                    .write.option("sep", '\t').format('csv').mode('overwrite')\
                    .save(local_prefix + '/dien/output/' + output_name)

    def process(self):
        """Write the three vocabularies (parquet) and the two splits (CSV), one file each.

        ``self.test_df`` / ``self.train_df`` are no longer replaced by their
        flattened form, so calling this method more than once works.
        """
        vocabularies = ((self.uid_dict_df, 'uid_voc'),
                        (self.mid_dict_df, 'mid_voc'),
                        (self.cat_dict_df, 'cat_voc'))
        for voc_df, voc_name in vocabularies:
            voc_df.repartition(1).write.format('parquet').mode('overwrite')\
                  .save(local_prefix + '/dien/output/' + voc_name)

        self._write_split(self.test_df, 'local_test_splitByUser')
        self._write_split(self.train_df, 'local_train_splitByUser')

    @staticmethod
    def _local_fs_path(path):
        """Translate ``path`` into something ``os.path`` understands.

        Returns the plain filesystem path for ``file:`` URIs, ``path`` unchanged
        when it carries no URI scheme, and ``None`` for any other scheme
        (e.g. ``hdfs://``), which cannot be checked through the local filesystem.
        """
        from urllib.parse import urlparse

        parsed = urlparse(path)
        if parsed.scheme == 'file':
            return parsed.path
        # An empty scheme is a plain path; a one-letter "scheme" is a drive letter.
        if len(parsed.scheme) <= 1:
            return path
        return None

    def if_local_test_exists(self):
        """Tell whether a previous run already produced the ``local_test`` table.

        Looks for Spark's ``_SUCCESS`` marker. The check used to hand the
        ``file://`` URI straight to ``os.path.isfile``, which never matches, so the
        cached table was never reused; the URI is now converted to a local path
        first. An empty ``local_prefix`` or a non-file scheme is checked with
        ``hadoop fs -test -e`` instead.
        """
        import subprocess

        path = local_prefix + "/dien/output/local_test/_SUCCESS"
        local_path = self._local_fs_path(path)
        if local_prefix == "" or local_path is None:
            proc = subprocess.Popen(['hadoop', 'fs', '-test', '-e', path])
            proc.communicate()
            return proc.returncode == 0
        return os.path.isfile(local_path)

    def shuffle_data_by_user(self, df):
        """Number each user's records, ordered by ``positive``, in a ``row_number`` column.

        Partitioning the window by ``review_id`` keeps the records of one user together.
        """
        window_spec = Window.partitionBy(df.review_id).orderBy(df.positive)
        return df.withColumn("row_number", row_number().over(window_spec))

    def _build_local_test(self, spark):
        """Create ``<local_prefix>/dien/output/local_test``.

        For every user with at least two reviews the table holds two records that
        share the same history (all reviews but the most recent one): the most
        recent review as the positive sample and a randomly drawn item as the
        negative sample. Each user also gets a random ``ordinal`` in [0, 10) that
        is used later for the train/test split.
        """
        # same as item_list in process_data.py
        item_list_df = self.reviews_info_df.select('movie_id').distinct()

        # same as user_map in process_data.py
        user_map_df = self.reviews_info_df

        asin_list = [row['movie_id'] for row in item_list_df.collect()]
        # Taken from the collected list instead of running a separate count() job.
        asin_len = len(asin_list)
        broadcast_movie_id_list = spark.sparkContext.broadcast(asin_list)

        def get_random_id(asin_total_len, asin):
            # Draw until the sampled item differs from the one actually reviewed.
            item_list = broadcast_movie_id_list.value
            while True:
                asin_neg = item_list[random.randint(0, asin_total_len - 1)]
                if asin_neg != asin:
                    return asin_neg

        # Marked non-deterministic so Spark never evaluates the draw twice for one
        # row; otherwise the joined category and the selected id could disagree.
        get_random_id_udf = udf(get_random_id, StringType()).asNondeterministic()

        ## manual join ##
        # same as line 66-75 in process_data.py
        # movie_id is taken from the sampled id, not from the joined metadata
        # column, which is NULL whenever the sampled item has no metadata row.
        negative_df = user_map_df\
                        .withColumn('positive', lit(0))\
                        .withColumn('rand_false_mid', get_random_id_udf(lit(asin_len), "movie_id"))\
                        .join(self.meta_map_df, col("rand_false_mid") == self.meta_map_df.movie_id, 'left_outer')\
                        .withColumn('category', when(col('category').isNotNull(), col('category')).otherwise("default_cat"))\
                        .select("positive", "review_id", col("rand_false_mid").alias("movie_id"), "overall", "unix_review_time", "category")

        positive_df = user_map_df\
                        .withColumn('positive', lit(1))\
                        .join(self.meta_map_df, 'movie_id', 'left_outer')\
                        .withColumn('category', when(col('category').isNotNull(), col('category')).otherwise("default_cat"))\
                        .select("positive", "review_id", "movie_id", "overall", "unix_review_time", "category")

        ## split_test ##
        # last two records of one user(one positive one negative) set tag 20190119
        # other previous records of one user set tag 20180118

        ## local_aggregator
        # all tag with 20190119 will be wrote to local_test
        # last()/collect_list() give no ordering guarantee after a shuffle, so the
        # reviews are collected as structs and sorted by review time explicitly
        # (ties fall back to movie_id, then category).
        user_history_df = positive_df\
                            .groupby('review_id')\
                            .agg(sort_array(collect_list(struct('unix_review_time', 'movie_id', 'category'))).alias('history'))\
                            .withColumn("numItemsByUser", size(col('history')) - 1)\
                            .filter(col("numItemsByUser") > 0)
        last_positive_with_concat_df = user_history_df\
                            .select('review_id',\
                                    element_at(col('history'), -1)['movie_id'].alias('movie_id'),\
                                    element_at(col('history'), -1)['category'].alias('category'),\
                                    expr("slice(history.movie_id, 1, numItemsByUser)").alias("concated_movie_id"),\
                                    expr("slice(history.category, 1, numItemsByUser)").alias("concated_category"),\
                                    'numItemsByUser')

        ## save data ############################################################################################
        last_positive_with_concat_df.write.format('parquet').mode('overwrite').save(local_prefix + '/dien/output/aggregated_records')
        reload_last_positive_with_concat_df = spark.read.parquet(local_prefix + '/dien/output/aggregated_records')

        # by saving to local, we can ensure negative record and positive record will have same history sequence
        #########################################################################################################

        # One negative per user; which of the user's random draws gets picked is
        # not pinned down, since the picked item is a random one either way.
        negative_per_user_df = negative_df\
                                .groupby('review_id')\
                                .agg(last('positive').alias('positive'),\
                                     last('movie_id').alias('movie_id'),\
                                     last('category').alias('category'))
        last_negative_record_of_user_df = reload_last_positive_with_concat_df\
                                .join(negative_per_user_df, 'review_id', 'inner')\
                                .select(\
                                        'review_id',\
                                        'positive',\
                                        negative_per_user_df.movie_id.alias('movie_id'),\
                                        negative_per_user_df.category.alias('category'),\
                                        'concated_movie_id',\
                                        'concated_category',\
                                        'numItemsByUser')
        last_positive_record_of_user_df = reload_last_positive_with_concat_df\
                                .select('review_id',\
                                        lit(1).alias('positive'),\
                                        'movie_id',\
                                        'category',\
                                        'concated_movie_id',\
                                        'concated_category',\
                                        'numItemsByUser')
        # union() matches columns by position; both selects above use the same order.
        union_records_df = last_negative_record_of_user_df\
                            .union(last_positive_record_of_user_df)

        ## all local_test will be split with random 1:9 to local_train_splitByUser and local_test_splitByUser
        reviews_groupby_user_df = self.rand_ordinal_n(reload_last_positive_with_concat_df, 10).select('review_id', 'ordinal')
        union_concated_df = reviews_groupby_user_df\
                            .join(union_records_df, 'review_id', 'inner')\
                            .select('positive',\
                                    'review_id',\
                                    'movie_id',\
                                    'category',\
                                    'concated_movie_id',\
                                    'concated_category',\
                                    'numItemsByUser',\
                                    'ordinal')
        ## save data ############################################################################################
        union_concated_df.write.format('parquet').mode('overwrite').save(local_prefix + '/dien/output/local_test')
        #########################################################################################################

    def __init__(self, spark, reviews_info_df, item_info_df):
        """Prepare the splits and vocabularies.

        Args:
            spark: the active SparkSession.
            reviews_info_df: reviews with columns review_id, movie_id, overall,
                unix_review_time.
            item_info_df: items with columns movie_id, category.

        Side effects: writes ``meta_map`` and, unless a previous run's ``local_test``
        is found, ``aggregated_records`` and ``local_test`` as parquet under
        ``<local_prefix>/dien/output``.
        """
        self.reviews_info_df = reviews_info_df
        self.item_info_df = item_info_df

        # same as meta_map in process_data.py
        self.meta_map_df = self.item_info_df\
                        .groupby("movie_id")\
                        .agg(first("category").alias("category"))
        self.meta_map_df.write.format('parquet').mode('overwrite').save(local_prefix + '/dien/output/meta_map')
        self.meta_map_df = spark.read.parquet(local_prefix + '/dien/output/meta_map')

        if not self.if_local_test_exists():
            self._build_local_test(spark)

        ## end if local_test exists ################################################################################
        reload_union_concated_df = spark.read.parquet(local_prefix + '/dien/output/local_test')

        ## split aggregated_labled_df by 1:9
        # ordinal takes ten values; the users who drew 2 form the test split.
        self.test_df = reload_union_concated_df.filter(col("ordinal") == 2).drop("ordinal")
        self.train_df = reload_union_concated_df.filter(col("ordinal") != 2).drop("ordinal")

        ## use window function to make sure same user records stay together
        self.test_df = self.shuffle_data_by_user(self.test_df)
        self.train_df = self.shuffle_data_by_user(self.train_df)

        ## build uid_dict, mid_dict and cat_dict
        # Each vocabulary starts with a fixed row at index 0, followed by the
        # values seen in train_df numbered by descending occurrence count.
        # NOTE(review): the uid row for index 0 is a hard-coded user id and, unlike
        # the mid/cat defaults, it is not filtered out of the counted part — confirm.
        columns = ['review_id', 'uid']
        zero = spark.createDataFrame([("A1Y6U82N6TYZPI",0)], columns)

        self.uid_dict_df = zero.union(self.train_df\
                            .groupBy('review_id')\
                            .count()\
                            .withColumn('uid', row_number().over(Window.orderBy(desc('count'))))\
                            .drop("count"))

        columns = ['movie_id', 'mid']
        zero = spark.createDataFrame([("default_mid",0)], columns)
        self.mid_dict_df = zero.union(self.train_df\
                            .withColumn("concated_movie_id", array_union(col("concated_movie_id"), array(col("movie_id"))))\
                            .select(explode(col("concated_movie_id")).alias("movie_id"))\
                            .groupBy('movie_id')\
                            .count()\
                            .filter(col('movie_id') != "default_mid")\
                            .withColumn('mid', row_number().over(Window.orderBy(desc('count'))))\
                            .drop("count"))

        columns = ['category', 'cat']
        zero = spark.createDataFrame([("default_cat",0)], columns)
        self.cat_dict_df = zero.union(self.train_df\
                            .withColumn("concated_category", array_union(col("concated_category"), array(col("category"))))\
                            .select(explode(col("concated_category")).alias("category"))\
                            .groupBy('category')\
                            .count()\
                            .filter(col('category') != "default_cat")\
                            .withColumn('cat', row_number().over(Window.orderBy(desc('count'))))\
                            .drop("count"))

In [16]:
import findspark
findspark.init()
 
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
from timeit import default_timer as timer

# Time how long it takes to obtain the Spark session. getOrCreate() hands back an
# already running session when there is one, in which case this is close to zero.
t0 = timer()
spark = SparkSession\
    .builder\
    .master('yarn')\
    .appName("DIEN_DATA_PREPARE") \
    .getOrCreate()

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
t1 = timer()
# Load reviews_info and item_info. The active loader reads the comma-separated
# directories written by the JSON-to-CSV cell; the commented-out load_csv() reads
# the tab-separated reviews-info / item-info files instead.
#reviews_info_df, item_info_df = load_csv()
reviews_info_df, item_info_df = load_pyspark_processed_csv()
# The constructor builds the splits and vocabularies; process() writes them out.
%time data_processor = DataProcessor(spark, reviews_info_df, item_info_df)
%time data_processor.process()
t2 = timer()
print("\n==================== Process Time =======================\n")
print("Total process took %.3f secs" % (t2 - t0))
print("Details:")
print("start spark took %.3f secs" % (t1 - t0))
print("process took %.3f secs" % (t2 - t1))
print("\n==========================================================")

CPU times: user 9.3 s, sys: 592 ms, total: 9.89 s
Wall time: 37.8 s
CPU times: user 31.5 ms, sys: 12 ms, total: 43.5 ms
Wall time: 24 s


Total process took 61.873 secs
Details:
start spark took 0.003 secs
process took 61.870 secs



### Option2: using original python to convert (No need to run)

In [9]:
# Load original json as CSV
import sys
import random
import time
from time import time

def process_meta(file):
    """Convert the item metadata dump into a tab-separated ``item-info`` file.

    Each input line is evaluated as a Python literal; one output line
    ``<asin>\\t<category>`` is written per input line, where the category is the
    last entry of the first ``categories`` list.

    Args:
        file: path of the metadata dump (one record per line).

    Side effects: creates/overwrites ``item-info`` in the current working directory.
    Both files are closed on return, including when a line fails to parse.
    """
    with open(file, "r") as fi, open("item-info", "w") as fo:
        for line in fi:
            # SECURITY: eval() executes whatever the line contains. Only run this on
            # trusted dumps; ast.literal_eval is the safer choice for literal records.
            obj = eval(line)
            cat = obj["categories"][0][-1]
            print(obj["asin"] + "\t" + cat, file=fo)

def process_reviews(file):
    """Convert the review dump into a tab-separated ``reviews-info`` file.

    Each input line is evaluated as a Python literal; one output line
    ``<reviewerID>\\t<asin>\\t<overall>\\t<unixReviewTime>`` is written per input line.

    Args:
        file: path of the review dump (one record per line).

    Side effects: creates/overwrites ``reviews-info`` in the current working
    directory. Both files are closed on return, including when a line fails to parse.
    """
    with open(file, "r") as fi, open("reviews-info", "w") as fo:
        for line in fi:
            # SECURITY: eval() executes whatever the line contains. Only run this on
            # trusted dumps; ast.literal_eval is the safer choice for literal records.
            obj = eval(line)
            userID = obj["reviewerID"]
            itemID = obj["asin"]
            rating = obj["overall"]
            # Named review_time so the imported `time` is not shadowed inside the loop.
            review_time = obj["unixReviewTime"]
            print(userID + "\t" + itemID + "\t" + str(rating) + "\t" + str(review_time), file=fo)

# This cell's own imports do not provide `timer`; import it here so the cell also
# runs on a fresh kernel instead of relying on an earlier cell having been executed.
from timeit import default_timer as timer

# Plain filesystem path (no file:// scheme) because the files are opened with open().
# NOTE: this rebinds the notebook-level `local_prefix` that DataProcessor also reads;
# re-run the cell that defines the Spark output prefix before using DataProcessor again.
local_prefix = "/mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/"
t0 = timer()
# Outputs (item-info, reviews-info) are written to the current working directory.
process_meta(local_prefix + 'meta_Books.json')
process_reviews(local_prefix + 'reviews_Books.json')
t1 = timer()

print("Convert initial csv from json took %.3f secs" % (t1 - t0))

Convert initial csv from json took 868.023 secs


In [3]:
# Shut down the Spark session now that processing is finished.
spark.stop()