In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import time
import gc
from ast import literal_eval


def merge_select_topI(doc, aux_data, content,onKey = 'document_id'):
    """Left-join ``aux_data`` onto ``doc`` and derive a ``<content>_topI`` column.

    Parameters
    ----------
    doc : DataFrame
        Left-hand frame of the join.
    aux_data : DataFrame
        Frame joined onto ``doc``.
    content : str
        Column prefix; ``<content>_conf`` is read and ``<content>_topI``
        is written.
    onKey : str
        Join key passed to ``DataFrame.merge``.

    Returns
    -------
    DataFrame
        The merged frame with every missing value replaced by -99 and the
        new ``<content>_topI`` column filled by ``getTopOne``.
    """
    # Missing values are filled *before* picking the top item so that
    # unmatched rows reach getTopOne as the -99 placeholder.
    merged = doc.merge(aux_data, on=onKey, how='left').fillna(-99)
    source_column = content + "_conf"
    target_column = content + "_topI"
    merged[target_column] = merged[source_column].apply(getTopOne)
    return merged

def getTopOne(l):
    """Return the id of the entry with the highest score.

    Parameters
    ----------
    l : list of (id, score) pairs, or the scalar placeholder -99
        -99 is what the callers' ``fillna(-99)`` leaves in rows that had
        no match in the joined table.

    Returns
    -------
    The first element of the pair whose second element is largest, or -99
    when ``l`` is the placeholder or an empty list. On ties the pair that
    appears first in ``l`` wins.
    """
    # An empty list carries no candidate; report it as missing instead of
    # failing with IndexError.
    if l == -99 or len(l) == 0:
        return -99
    # max() returns the first maximal element, which matches what a stable
    # descending sort followed by [0] would select, without sorting.
    return max(l, key=lambda pair: pair[1])[0]
    
def convert_ts(timestamp_ms_relative):
    """Convert a relative millisecond timestamp into a ``datetime``.

    The input is cast to ``int``, shifted by a fixed offset of
    1465876799998 ms, truncated to whole seconds and handed to
    ``datetime.fromtimestamp``.

    NOTE(review): ``fromtimestamp`` yields a naive datetime in the local
    timezone of the machine running the code, so results differ between
    hosts in different timezones -- confirm this is intended.
    """
    offset_ms = 1465876799998
    absolute_ms = int(timestamp_ms_relative) + offset_ms
    whole_seconds = absolute_ms // 1000
    return datetime.fromtimestamp(whole_seconds)

def freq_concat(x, dtp = 'numerical'):
    """Build a ``value:frequency`` string from an indexable record.

    ``x[1]`` is the feature value and ``x[2]`` its frequency. With
    ``dtp='numerical'`` the value is truncated to an integer first; with
    ``dtp='str'`` it is used as is. The frequency is always rendered as an
    integer. Any other ``dtp`` returns ``None`` without reading ``x``.
    """
    if dtp not in ('numerical', 'str'):
        return None
    value = x[1]
    if dtp == 'numerical':
        value = int(value)
    return ':'.join([str(value), str(int(x[2]))])
    
def group_concat(x, dtp = 'numerical'):
    """Summarise a group of feature values as ``value:count`` pairs.

    Used to describe, for one user, every distinct value of a feature
    together with how often it occurred.

    Parameters
    ----------
    x : iterable
        The feature values of one group.
    dtp : str
        'numerical' -- each value is converted with ``int`` and then ``str``;
        'str'       -- each value is converted with ``str``;
        anything else -- ``x`` is passed to ``np.unique`` unchanged.

    Returns
    -------
    str
        ``"v1:c1|v2:c2|..."`` in the order returned by ``np.unique``
        (sorted). For 'numerical' and 'str' the values are strings at that
        point, so the ordering is lexicographic ('10' sorts before '2').
    """
    # Materialise real lists: under Python 3 ``map`` returns an iterator,
    # which np.unique would treat as a single opaque object.
    if dtp == 'numerical':
        x = [str(int(value)) for value in x]
    elif dtp == 'str':
        x = [str(value) for value in x]
    unique, counts = np.unique(x, return_counts=True)
    return '|'.join(str(value) + ':' + str(count)
                    for value, count in zip(unique, counts))

# In[2]:

# Document metadata table.
meta = pd.read_csv("../data/documents_meta.csv.gz", index_col=False)

# Read dictionary doc files
categories = pd.read_csv("../data/documents_categories_dic.csv.gz", index_col=False)
entities = pd.read_csv("../data/documents_entities_dic.csv.gz", index_col=False)
topics = pd.read_csv("../data/documents_topics_dic.csv.gz", index_col=False)

# The *_conf columns are read from CSV as text; literal_eval turns each
# cell into the Python literal it spells out.
categories["categ_conf"] = categories["categ_conf"].apply(literal_eval)
entities["entity_conf"] = entities["entity_conf"].apply(literal_eval)
topics["topic_conf"] = topics["topic_conf"].apply(literal_eval)





In [4]:
features_summary = ['platform','geo_location'] # selected features for summary
features_summary_dtypes = ['numerical','str'] # corresponding feature types
uuid_hash_range = [str(x) for x in range(0, 10)] # '0', '1', ..., '9'

for uuid_hash_idx in uuid_hash_range:
    # NOTE(review): this guard ends the loop as soon as the index is not
    # '0', so only the first partition is processed. Remove it to run all
    # partitions listed in uuid_hash_range.
    if uuid_hash_idx != '0':
        break
    start_time = time.time()
    # Single formatted string so the line prints the same text under
    # Python 2 and Python 3.
    print("process uuid hash idx:  %s" % uuid_hash_idx)

    # Read partition page view
    filename = "../input/pages_view_uid" + uuid_hash_idx + ".csv.gz"
    partition00 = pd.read_csv(filename, index_col=False)
    partition00["timestamp"] = partition00["timestamp"].map(convert_ts)

    # Restrict the metadata to documents that occur in this partition.
    # An inner join is required here: a left join from `meta` against a
    # frame holding only the key column keeps every row of `meta`, so the
    # expensive steps below would run over all documents. The final left
    # join onto partition00 only ever uses rows for documents in the
    # partition, so its matches are unchanged.
    doc_ids = partition00[["document_id"]].drop_duplicates()
    doc_meta = meta.merge(doc_ids, how='inner', on='document_id')

    # Attach the top topic / category / entity of each document; each call
    # left-joins, fills missing values with -99 and adds <content>_topI.
    doc_topics = merge_select_topI(doc_meta, topics, content="topic")
    doc_categ = merge_select_topI(doc_topics, categories, content="categ")
    doc_entity = merge_select_topI(doc_categ, entities, content="entity")
    doc_entity.drop(['topic_conf', 'categ_conf', 'entity_conf'], axis=1, inplace=True)
    partition00 = partition00.merge(doc_entity, how='left', on='document_id')

    partition00_gb = partition00.groupby(['uuid']) # group by object

    # One row per uuid with the number of non-null timestamps.
    uu_summary = (partition00_gb['timestamp'].count()
                  .reset_index()
                  .rename(columns={'timestamp': 'num_visits'}))

    for feature_freq, feature_dtype in zip(features_summary, features_summary_dtypes):
        print(feature_freq)
        # Both sides carry a fresh 0..n-1 index produced by reset_index()
        # on the same groupby, so the assignment lines up row by row.
        per_user_values = partition00_gb[feature_freq].apply(list).reset_index()[feature_freq]
        uu_summary[feature_freq + "-freq"] = per_user_values.apply(
            lambda x: group_concat(x, dtp=feature_dtype))

    # Save uu_summary to file
    filename_output = "../data/pages_view_feature/page_views_uusum_uid" + uuid_hash_idx + ".csv.gz"
    uu_summary.to_csv(filename_output, index=False, compression="gzip")

    # Memory clear -- left disabled because the cells that follow inspect
    # partition00 and uu_summary. Enable when looping over all partitions.
    # del partition00_gb, partition00, doc_ids, doc_meta, doc_topics, doc_categ, doc_entity, uu_summary
    # gc.collect()
    print('It took %i seconds to process %s' % (time.time()-start_time, uuid_hash_idx) )

process uuid hash idx:  0
platform
geo_location
It took 1536 seconds to process 0


In [9]:
# DataFrame.sort is deprecated (and removed in later pandas releases);
# sort_values is its replacement and returns the same ordering.
partition00.sort_values(by=['uuid']).head(20)

  if __name__ == '__main__':


Unnamed: 0,uuid,document_id,timestamp,platform,geo_location,traffic_source,source_id,publisher_id,publish_time,topic_topI,categ_topI,entity_topI
11707848,100001174c8854,1438654,2016-06-22 16:30:57,1,US>CA>803,1,5315.0,1046.0,2016-05-14 00:00:00,265.0,1408.0,3e2329d0532b88665cdd7c9625786738
1695938,100001174c8854,1856810,2016-06-15 17:43:38,1,US>CA>803,1,574.0,1161.0,2016-06-14 20:00:00,147.0,1902.0,543a19352803579575fe4a63dae66498
4576057,100001174c8854,2110889,2016-06-17 18:44:01,1,US>CA>803,1,694.0,1161.0,2016-06-17 14:00:00,183.0,1909.0,-99
11014727,1000018be5d703,2240510,2016-06-22 10:07:40,2,US>FL>528,1,663.0,61.0,2016-06-19 12:00:00,216.0,1702.0,9ebd449f947b6a9358f11aa77f3ce0d2
17836402,10000380c5ae26,2819961,2016-06-27 18:21:39,2,US>CA>803,1,1893.0,160.0,2016-06-28 00:00:00,184.0,1902.0,9e6e6bfa404b26cfb3012bd457fbdb68
6585486,10000380c5ae26,2185512,2016-06-19 00:37:10,2,US>CA>825,1,752.0,43.0,2016-06-18 10:00:00,51.0,1908.0,-99
4790241,10000380c5ae26,2112875,2016-06-17 18:27:58,2,US>CA>803,1,465.0,160.0,2016-06-17 08:00:00,77.0,1702.0,f53e449e38e3c2b4f1eadbd1e76a5715
13712587,10000489b154a3,2655482,2016-06-24 14:46:36,1,US>MI>563,1,7344.0,874.0,2016-06-23 00:00:00,45.0,1903.0,-99
10239173,100007fce4ea8f,394689,2016-06-21 17:44:56,1,US>MA>506,1,7343.0,874.0,-99,147.0,2006.0,06c0312fcb3a1445daeb4efd64952503
2584751,10000937259378,1477062,2016-06-15 11:50:57,2,US>NY>501,1,11339.0,1216.0,-99,174.0,1503.0,-99


In [8]:
# Show the first 20 rows of the per-uuid summary built in the loop cell.
uu_summary.head(20)

Unnamed: 0,uuid,num_visits,platform-freq,geo_location-freq
0,100001174c8854,3,1:3,US>CA>803:3
1,1000018be5d703,1,2:1,US>FL>528:1
2,10000380c5ae26,3,2:3,US>CA>803:2|US>CA>825:1
3,10000489b154a3,1,1:1,US>MI>563:1
4,100007fce4ea8f,1,1:1,US>MA>506:1
5,10000937259378,1,2:1,US>NY>501:1
6,10000a7dccaa45,1,1:1,US>WA>819:1
7,10000b8813b1b2,1,3:1,US>CA>855:1
8,10000e5b05def7,1,1:1,US>NE>652:1
9,100015b47f138f,14,3:14,US>CO>751:14
