In [1]:
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
import collections
warnings.filterwarnings('ignore')

In [2]:
# Directory prefix for the input CSVs (original note: Tianchi platform path).
# It is concatenated directly with file names, so it must end with a separator.
data_path = 'e:/tmp/'
# Recommendation file to evaluate; it lives under the same directory prefix.
csv_path = data_path + 'itemcf_baseline_11-23.csv'

In [3]:
# Standard memory-saving helper: downcast numeric columns to the narrowest dtype that fits.
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` and print the memory saved.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    converted to the smallest integer (int8..int64) or float (float16..float64)
    dtype whose range contains the column's min and max. Columns whose min or
    max is null (e.g. all-NaN) and columns of any other dtype are left untouched.

    Note: the float branch only checks the value *range*, so downcasting to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a dtype's min/max still fits it.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range exceeds float32 (e.g. +/-inf).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard against a zero-byte starting footprint (possible for an empty frame).
    reduction = 100*(start_mem-end_mem)/start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

In [4]:
# Debug mode: carve a subset of users out of the training set to test the code quickly.
def get_all_click_sample(data_path, sample_nums=10000):
    """Load the training click log restricted to a random sample of users.

    Args:
        data_path: Path prefix of the raw data; 'train_click_log.csv' is appended
            to it directly, so it must end with a path separator.
        sample_nums: Number of distinct users to sample (sampling by user keeps
            memory usage low). If the log contains fewer users than this, all
            users are kept.

    Returns:
        DataFrame with every click of the sampled users, de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    all_click = pd.read_csv(data_path + 'train_click_log.csv')
    all_user_ids = all_click.user_id.unique()

    # np.random.choice(..., replace=False) raises ValueError when asked for more
    # items than the population holds, so clamp the request to what is available.
    sample_size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_size, replace=False)
    all_click = all_click[all_click['user_id'].isin(sample_user_ids)]

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click

# Read the click data for either the offline (validation) or online (submission) setting.
def get_all_click_df(data_path='./data_raw/', offline=True):
    """Load one click log and drop duplicate click records.

    Args:
        data_path: Path prefix of the raw data; the file name is appended to it
            directly, so it must end with a path separator.
        offline: If True, read 'train_click_log.csv'. If False, read
            'testA_click_log.csv' only; the training log is NOT merged in.

    Returns:
        DataFrame of clicks de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    # Both branches bind the same variable; previously the offline branch left the
    # name used by the de-duplication step unbound and raised UnboundLocalError.
    file_name = 'train_click_log.csv' if offline else 'testA_click_log.csv'
    all_click = pd.read_csv(data_path + file_name)

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click

In [5]:
# Load the de-duplicated click log; offline=False selects 'testA_click_log.csv'.
test_click_df = get_all_click_df(data_path, offline=False)

In [6]:
# Display the loaded click log.
test_click_df

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,249999,160974,1506959142820,4,1,17,1,13,2
1,249999,160417,1506959172820,4,1,17,1,13,2
2,249998,160974,1506959056066,4,1,12,1,13,2
3,249998,202557,1506959086066,4,1,12,1,13,2
4,249997,183665,1506959088613,4,1,17,1,15,5
...,...,...,...,...,...,...,...,...,...
518005,221924,70758,1508211323220,4,3,2,1,25,2
518006,207823,331116,1508211542618,4,3,2,1,25,1
518007,207823,234481,1508211850103,4,3,2,1,25,1
518008,207823,211442,1508212189949,4,3,2,1,25,1


In [7]:
# Build each user's list of clicked articles: one row per user, list in `news_id`.
def get_user_item_time(click_df):
    """Collect, for every user, the ids of the articles they clicked.

    The input is sorted by 'click_article_id' before grouping, so each user's
    list is in ascending article-id order. The input frame is not modified
    (``sort_values`` returns a new frame, which is why its result must be kept).

    NOTE(review): the function name and the original notes talk about ordering
    by 'click_timestamp' and returning (item, time) pairs, but the code sorts by
    article id and returns ids only -- confirm which is intended.

    Args:
        click_df: Click-log DataFrame with at least 'user_id' and
            'click_article_id' columns.

    Returns:
        DataFrame with columns 'user_id' and 'news_id', where 'news_id' holds
        the list of article ids clicked by that user.
    """
    ordered_clicks = click_df.sort_values('click_article_id')
    articles_per_user = ordered_clicks.groupby('user_id')['click_article_id'].apply(list)
    return articles_per_user.rename('news_id').reset_index()

In [8]:
# Despite the `_dict` suffix, this is a DataFrame with columns 'user_id' and 'news_id'.
user_item_time_dict = get_user_item_time(test_click_df)

In [9]:
# Display the per-user clicked-article lists.
user_item_time_dict

Unnamed: 0,user_id,news_id
0,200000,"[191971, 194300, 195839]"
1,200001,[175040]
2,200002,"[70335, 156654, 159762, 207714, 297906, 298310..."
3,200003,"[17217, 57748, 107039, 156560, 156624, 159195,..."
4,200004,"[95716, 129165, 140659, 145472, 156624, 177155..."
...,...,...
49995,249995,"[16129, 25325, 30730, 32082, 48403, 58193, 590..."
49996,249996,[160974]
49997,249997,"[74719, 96755, 123909, 124337, 124667, 181686,..."
49998,249998,"[160974, 202557, 235105, 236207, 237524]"


In [10]:
# Read the recommendation file, using its 'user_id' column as the index.
# At this point `csv_dict` is still a DataFrame, not a dict.
csv_dict = pd.read_csv(csv_path,index_col='user_id')

In [11]:
# Check the type returned by read_csv.
type(csv_dict)

pandas.core.frame.DataFrame

In [12]:
# Turn the DataFrame into {user_id: [values of that row as a plain list]}.
# This rebinds `csv_dict` from a DataFrame to a dict, so the cell cannot be
# re-run without first re-running the read_csv cell.
csv_dict = dict(zip(csv_dict.index, csv_dict.values.tolist()))
        

In [13]:
# Display the user_id -> row-values mapping.
csv_dict

{200000: [237870, 194619, 194935, 314048, 195773],
 200001: [64329, 272143, 199198, 324823, 166581],
 200002: [300128, 297906, 300923, 61375, 293301],
 200003: [337143, 272143, 156619, 235230, 158536],
 200004: [336221, 234698, 235870, 95716, 235616],
 200005: [69932, 160974, 156964, 160417, 158536],
 200006: [199197, 284547, 235230, 183176, 206934],
 200007: [336254, 289003, 157478, 50864, 97530],
 200008: [233717, 234698, 293301, 235870, 159762],
 200009: [199198, 64329, 198659, 166581, 324823],
 200010: [158536, 162655, 160974, 218337, 64329],
 200011: [272143, 198659, 182394, 64329, 166581],
 200012: [123818, 123290, 124228, 124352, 199198],
 200013: [20691, 272143, 96210, 336245, 57616],
 200014: [156560, 162765, 272143, 235230, 158536],
 200015: [336221, 156964, 300082, 234698, 235870],
 200016: [20691, 96210, 336245, 198659, 64329],
 200017: [162839, 159279, 65831, 10145, 208322],
 200018: [234698, 96210, 288440, 20691, 336245],
 200019: [261612, 121834, 15226, 16911, 187067],
 

In [None]:
# Sum of reciprocal ranks: every clicked article that appears in the user's
# recommendation list contributes 1 / (its 1-based position in that list).
score = 0.0
# `user_item_time_dict` is a DataFrame, so iterate its rows explicitly.
# DataFrame.items() would yield (column_name, Series) pairs instead of
# (user_id, news_list) pairs, and the score would silently stay at 0.
for user_id, news_list in zip(user_item_time_dict['user_id'], user_item_time_dict['news_id']):
    # Users without a recommendation row contribute nothing to the score.
    recommended = csv_dict.get(int(user_id), [])
    for news_id in news_list:
        try:
            rank = recommended.index(news_id) + 1.0
        except ValueError:
            # Clicked article was not recommended to this user.
            continue
        score += 1.0 / rank

n_users = len(user_item_time_dict)
print(score)
print(score / n_users if n_users else 0.0)
print(n_users)
print(len(csv_dict))
            