In [45]:
import copy
class person:
    """
    A shopper, together with the history of every purchase, view and
    add-to-cart action recorded for that shopper.
    """

    def __init__(self, id):
        self.person_id = id

        # One history list per kind of event; entries are plain tuples kept
        # in the order the recording methods were called.
        self.purchased_items = list()
        self.viewed_items = list()
        self.cart_items = list()

    def purchase(self, item_id, transcation_id, timestamp):
        """
        remember that this person bought an item
        :param item_id: str
        :param transcation_id: str
        :param timestamp: str
        :return: None
        """
        entry = (item_id, transcation_id, timestamp)
        self.purchased_items.append(entry)

    def view(self, item_id, timestamp):
        """
        remember that this person looked at an item
        :param item_id: str
        :param timestamp: str
        :return: None
        """
        entry = (item_id, timestamp)
        self.viewed_items.append(entry)

    def add_to_cart(self, item_id, timestamp):
        """
        remember that this person put an item into the cart
        :param item_id: str
        :param timestamp: str
        :return: None
        """
        entry = (item_id, timestamp)
        self.cart_items.append(entry)

    def purchases_record(self):
        """
        purchase history as (item_id, transcation_id, timestamp) tuples;
        this is the live internal list, not a copy
        :return: list
        """
        return self.purchased_items

    def carts_record(self):
        """
        add-to-cart history as (item_id, timestamp) tuples;
        this is the live internal list, not a copy
        :return: list
        """
        return self.cart_items

    def views_record(self):
        """
        view history as (item_id, timestamp) tuples;
        this is the live internal list, not a copy
        :return: list
        """
        return self.viewed_items


In [46]:
class item:
    """
    A catalogue item: the history of its property values plus the people
    who bought it, viewed it or added it to their cart.
    """

    def __init__(self, id):
        self.item_id = id
        # property name -> list of (value, timestamp) in recording order
        self.properties = dict()
        # interaction histories, each a list of tuples in recording order
        self.customers = list()
        self.viewers = list()
        self.cart_adders = list()

    def property_change(self, pro_name, value, timestamp):
        """
        append a new (value, timestamp) entry to the history of a property,
        starting that history if the property has not been seen before
        :param pro_name: str
        :param value: str
        :param timestamp: str
        :return:
        """
        history = self.properties.setdefault(pro_name, [])
        history.append((value, timestamp))

    def purchase(self, customerID, transcation_id, timestamp):
        """
        remember that a customer bought this item
        :param customerID: str
        :param transcation_id: str
        :param timestamp: str
        :return: None
        """
        entry = (customerID, transcation_id, timestamp)
        self.customers.append(entry)

    def view(self, customerID, timestamp):
        """
        remember that a customer looked at this item
        :param customerID: str
        :param timestamp: str
        :return: None
        """
        entry = (customerID, timestamp)
        self.viewers.append(entry)

    def add_to_cart(self, customerID, timestamp):
        """
        remember that a customer put this item into the cart
        :param customerID: str
        :param timestamp: str
        :return: None
        """
        entry = (customerID, timestamp)
        self.cart_adders.append(entry)

    def customers_record(self):
        """
        buyers of the item as (customerID, transcation_id, timestamp) tuples;
        this is the live internal list, not a copy
        :return: list
        """
        return self.customers

    def carts_record(self):
        """
        people who added the item to the cart, as (customerID, timestamp)
        tuples; this is the live internal list, not a copy
        :return: list
        """
        return self.cart_adders

    def viewers_record(self):
        """
        people who viewed the item, as (customerID, timestamp) tuples;
        this is the live internal list, not a copy
        :return: list
        """
        return self.viewers

    def property_record(self, pro_name):
        """
        history of one property as (value, timestamp) tuples;
        raises KeyError when the property was never recorded
        :param pro_name: str
        :return: list
        """
        return self.properties[pro_name]


In [47]:
class analyse():
    """
    Reads the event log and the item-property files and indexes them in
    memory.

    persons maps person id (str) -> person, items maps item id (str) -> item.
    Both files are parsed with a plain split on ","; no type conversion is
    done, so every stored field is a str.  The last column of each line keeps
    its trailing newline exactly as read from the file (the categoryid
    handling further down this notebook strips it itself).
    """
    def __init__(self):
        self.persons = {}
        self.items = {}

    def read_property(self, property_file):
        """
        load property changes from a CSV file whose columns are
        timestamp, item id, property name, value (first line is a header)

        Only items that are already present in self.items get their
        properties recorded; lines about any other item are skipped, so
        read_event has to run first for anything to be stored.
        :param property_file: str, path of the property CSV
        :return: None
        """
        # "with" guarantees the handle is closed even if a line fails to parse.
        with open(property_file, "r") as f:
            next(f, None)  # skip the header row (no-op on an empty file)
            for line in f:
                if not line.strip():
                    # tolerate blank lines, e.g. a stray one at end of file
                    continue
                property_info = line.split(",")
                timestamp, item_id, pro_name, value = property_info[0], property_info[1], property_info[2], property_info[3]
                if item_id not in self.items:
                    # item never appeared in the event log: ignore it
                    continue
                self.items[item_id].property_change(pro_name, value, timestamp)

    def read_event(self, event_file):
        """
        load events from a CSV file whose columns are
        timestamp, person id, event, item id, transaction id
        (first line is a header)

        Unknown persons and items are created on first sight.  An event of
        "transaction" is stored as a purchase, "view" as a view, and every
        other event value is stored as an add-to-cart.
        :param event_file: str, path of the event CSV
        :return: None
        """
        with open(event_file, "r") as f:
            next(f, None)  # skip the header row (no-op on an empty file)
            for line in f:
                if not line.strip():
                    # tolerate blank lines, e.g. a stray one at end of file
                    continue
                event_info = line.split(",")
                timestamp, person_id, event, item_id, transaction_id = event_info[0], event_info[1], event_info[2], event_info[3], event_info[4]
                if person_id not in self.persons:
                    self.persons[person_id] = person(person_id)
                if item_id not in self.items:
                    self.items[item_id] = item(item_id)
                # every event is mirrored on both sides of the relation
                shopper, product = self.persons[person_id], self.items[item_id]
                if event == "transaction":
                    shopper.purchase(item_id, transaction_id, timestamp)
                    product.purchase(person_id, transaction_id, timestamp)
                elif event == "view":
                    shopper.view(item_id, timestamp)
                    product.view(person_id, timestamp)
                else:
                    shopper.add_to_cart(item_id, timestamp)
                    product.add_to_cart(person_id, timestamp)



# Folder that holds the dataset CSV files.  Edit this single value to run the
# notebook against a copy of the data stored somewhere else.
DATA_DIR = "/Users/Kiki/Downloads/retailrocket-recommender-system-dataset"

# Events must be loaded before properties: read_property only records
# properties for items that already exist in a.items, and those items are
# created by read_event.
a = analyse()
a.read_event(DATA_DIR + "/events.csv")
a.read_property(DATA_DIR + "/item_properties_part1.csv")
a.read_property(DATA_DIR + "/item_properties_part2.csv")

In [48]:
def subset(threshold):
    """
    pick the heavy users out of a.persons

    A user is kept when the number of viewed items plus the number of carted
    items is strictly greater than threshold AND the number of purchased
    items is strictly greater than threshold.  Kept users are deep-copied, so
    changing the result does not touch a.persons.
    :param threshold: int
    :return: {} mapping person id -> copied person
    """
    def is_heavy_user(candidate):
        browsing = len(candidate.viewed_items) + len(candidate.cart_items)
        return browsing > threshold and len(candidate.purchased_items) > threshold

    return {
        person_id: copy.deepcopy(candidate)
        for person_id, candidate in a.persons.items()
        if is_heavy_user(candidate)
    }

In [49]:
def subset_item(persons):
    """
    collect every item the given users interacted with and count how often

    Each entry of a user's viewed, purchased and carted lists contributes one
    to the count of the item id stored in its first field.  (Previously a
    "not in" guard around the increment froze every count at 1.)
    :param persons: {} mapping person id -> person
    :return: {} mapping item id -> number of interactions
    """
    res = {}
    for person in persons.values():
        for records in (person.viewed_items, person.purchased_items, person.cart_items):
            for record in records:
                item_id = record[0]
                res[item_id] = res.get(item_id, 0) + 1
    return res

In [59]:

def merge(item_after_subset):
    """
    merge items with same categoryid

    Every item id in item_after_subset is looked up in a.items.  An item
    without a "categoryid" property is copied into the result under its own
    item id.  Items that do have one are folded into a single entry per
    category: the first item seen is deep-copied and relabelled with the
    category id, later ones have their property histories, viewers,
    customers and cart adders appended to that copy.  a.items is not
    modified.

    The category is taken from the first recorded "categoryid" value (first
    in load order, not necessarily the earliest timestamp).  Only a trailing
    newline is removed from that value; slicing off the last character
    unconditionally would eat a digit whenever the value has no newline.

    NOTE(review): uncategorised items (keyed by item id) and merged entries
    (keyed by category id) share one dict, so an item id equal to a category
    id would collide -- confirm the two id spaces do not overlap.
    :param item_after_subset: iterable of item ids (e.g. the dict returned by subset_item)
    :return: {} mapping category id or item id -> item
    """
    sec_subset_item = {}
    for item_id in item_after_subset:
        source = a.items[item_id]
        if "categoryid" not in source.properties:
            sec_subset_item[item_id] = copy.deepcopy(source)
            continue
        category_id = source.property_record("categoryid")[0][0].rstrip("\n")
        if category_id not in sec_subset_item:
            # first item of this category: its copy becomes the merged entry
            merged = copy.deepcopy(source)
            merged.item_id = category_id
            sec_subset_item[category_id] = merged
            continue
        merged = sec_subset_item[category_id]
        for pro_name, values in source.properties.items():
            merged.properties.setdefault(pro_name, []).extend(values)
        merged.viewers.extend(source.viewers)
        merged.customers.extend(source.customers)
        merged.cart_adders.extend(source.cart_adders)
    return sec_subset_item


In [91]:
# Heavy-user subsets at two activity thresholds, followed by their sizes.
subset_persons1 = subset(100)
subset_persons2 = subset(200)
print("%d %d" % (len(subset_persons1), len(subset_persons2)))

15 3


In [87]:
# Items touched by each user subset, followed by the number of distinct items.
item_sub_dic1 = subset_item(subset_persons1)
item_sub_dic2 = subset_item(subset_persons2)
print("%d %d" % (len(item_sub_dic1), len(item_sub_dic2)))

11786 6323


In [88]:
# Collapse the items of each subset by category, followed by the entry counts.
merged_items1 = merge(item_sub_dic1)
merged_items2 = merge(item_sub_dic2)
print("%d %d" % (len(merged_items1), len(merged_items2)))

1546 1304


In [92]:
def update_items_in_persons(subset_persons):
    """
    update items' ids recorded in filtered users with categoryid

    For every entry in each user's viewed, purchased and carted lists, the
    item id in the first field is replaced by the item's category id when
    a.items has a "categoryid" property for it; all remaining fields of the
    entry are kept as they are.  Purchases therefore stay
    (id, transaction id, timestamp) triples -- previously the transaction id
    was stored in place of the timestamp and the timestamp was lost.
    Entries of items without a category are left untouched.

    The lists are rewritten in place.  The function is not meant to be run
    twice on the same users: on a second run the stored ids are category
    ids, which are then looked up in a.items as if they were item ids.
    :param subset_persons: {} mapping person id -> person
    :return: None
    """
    def category_of(item_id):
        # Category id of an item, or None when the item has no categoryid.
        properties = a.items[item_id].properties
        if "categoryid" not in properties:
            return None
        # First recorded value; drop only the newline left over from parsing.
        return properties["categoryid"][0][0].rstrip("\n")

    for person in subset_persons.values():
        for records in (person.viewed_items, person.purchased_items, person.cart_items):
            for index, record in enumerate(records):
                category_id = category_of(record[0])
                if category_id is None:
                    continue
                records[index] = (category_id,) + tuple(record[1:])
   

In [93]:
# Replace item ids with category ids on the threshold-100 user subset.
# This rewrites the users' history lists in place.
update_items_in_persons(subset_persons1)

In [94]:
# Replace item ids with category ids on the threshold-200 user subset.
# This rewrites the users' history lists in place.
update_items_in_persons(subset_persons2)