In [1]:
import re
import numpy as np
import pandas as pd
from pandas import DataFrame

pd.options.display.max_columns = None  # show every column instead of folding wide frames


# 时间列数据格式修改
def date_struct(string_date):
    """Normalise a ``-`` or ``/`` separated date string to ``20YY-MM-DD``.

    The year is rebuilt as ``'20'`` plus the last two characters of the first
    field, so ``'17'`` and ``'2017'`` both become ``'2017'`` (any other century
    is folded into the 2000s as well). A month or day field that is not exactly
    two characters long gets a single ``'0'`` prepended. Fields after the third
    one are ignored.

    Raises:
        IndexError: if the string contains fewer than three fields.
    """
    def _two_chars(field):
        # Same rule as for month and day: only an exact length of 2 is left alone.
        return field if len(field) == 2 else '0' + field

    parts = re.split('[-/]', string_date)
    year = '20' + parts[0][-2:]
    month = _two_chars(parts[1])
    day = _two_chars(parts[2])
    return '-'.join((year, month, day))


# 输入string/None, 正则表达式匹配shop数据集中avg_price的纯数字
def match_avg_price(avg_price):
    """Extract the first run of digits from a raw ``avg_price`` value.

    Args:
        avg_price: raw value of the shop ``avg_price`` column (string, number,
            NaN or None). It is converted with ``str()`` before matching.

    Returns:
        The first digit run as a string (e.g. ``"85"``), or ``None`` when the
        input is ``None`` or contains no digit at all (e.g. ``"[]"`` or NaN).
        Returning ``None`` in the digit-less case lets callers detect the value
        as missing with ``isnull()`` instead of receiving a non-numeric string
        that cannot be converted with ``int()``.
    """
    if avg_price is None:
        return None
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    digit_runs = re.findall(r"\d+", str(avg_price))
    return digit_runs[0] if digit_runs else None


# 输入shop dataframe，修正avg_price，如果为[]则按照同区域中(先同小类型POI后同大类型POI)的平均价格；非[]则取其中数字
def process_avg_price(df_shop):
    """Fill the ``avg_price`` column of the shop frame with numeric prices.

    Reads ``match_avg_price`` (the digits extracted from the raw price) and
    writes ``avg_price`` in place:

    * rows with a usable ``match_avg_price`` get that value as a number;
    * rows without one get 0 when the shop is a "景点" whose ``avg_price`` is
      already 0, or when its ``small_category`` is "图书馆" or "大学";
    * every other row without a price gets the truncated mean price of the
      shops sharing its ``small_category``, falling back to the shops sharing
      its ``big_category`` when no shop of the small category has a price.

    Values in ``match_avg_price`` that are not numeric are treated as missing.
    Rows that cannot be imputed (no priced shop in either category) are left
    as NaN; in that case the column is float, otherwise it is ``int64``.

    Args:
        df_shop: shop DataFrame with ``avg_price``, ``match_avg_price``,
            ``small_category`` and ``big_category`` columns.

    Returns:
        The same ``df_shop`` object, modified in place.
    """
    # One vectorised pass replaces the per-row re-filtering of the whole frame.
    prices = pd.to_numeric(df_shop['match_avg_price'], errors='coerce').astype('float64')
    prices = np.trunc(prices)  # mirrors the int() conversion of individual prices
    missing = prices.isna()

    def _category_mean(column):
        # Mean known price per category value, broadcast back to every row.
        # Categories without any priced shop (or a NaN category) map to NaN.
        means = prices.groupby(df_shop[column]).mean()
        return df_shop[column].map(means).astype('float64')

    # Small category first, big category only where the small one has no price.
    imputed = np.trunc(_category_mean('small_category').fillna(_category_mean('big_category')))

    free_entry = (
        ((df_shop['big_category'] == "景点") & (df_shop['avg_price'] == 0))
        | df_shop['small_category'].isin(["图书馆", "大学"])
    )

    result = prices.fillna(imputed).mask(missing & free_entry, 0)
    if result.notna().all():
        result = result.astype('int64')
    df_shop['avg_price'] = result
    return df_shop


# 输入shop dataframe，处理其他属性(score1, score2, score3),先对小类型取平均,若小类型均为空则对大类型取平均
def process_attributes(df_shop):
    """Convert ``score1``/``score2``/``score3`` to floats and impute missing ones.

    A row whose ``score1`` is missing gets all three scores replaced by the
    mean scores (rounded to one decimal) of the shops in the same
    ``small_category`` that have all three scores; when no such shop exists,
    the shops of the same ``big_category`` are used instead. All other rows
    keep their own scores, converted to float.

    Imputed values are stored as floats, so each score column ends up with a
    single float dtype. Rows that cannot be imputed (no fully scored shop in
    either category) are left as NaN.

    Args:
        df_shop: shop DataFrame with the three score columns plus
            ``small_category`` and ``big_category``.

    Returns:
        The same ``df_shop`` object, modified in place.

    Raises:
        ValueError: if a score value cannot be converted to float.
    """
    score_cols = ["score1", "score2", "score3"]
    scores = df_shop[score_cols].astype('float64')
    needs_fill = scores["score1"].isna()

    # Only shops with all three scores contribute to the category means.
    is_complete = scores.notna().all(axis=1)
    complete = scores[is_complete]

    def _category_means(column):
        # Per-category mean scores, broadcast back onto every row of df_shop.
        means = complete.groupby(df_shop.loc[is_complete, column]).mean()
        return pd.DataFrame(
            {col: df_shop[column].map(means[col]) for col in score_cols}
        ).astype('float64')

    # Small category first; big category only where the small one has no data.
    fill = _category_means("small_category").fillna(_category_means("big_category"))

    for col in score_cols:
        # Python's round() matches the one-decimal rounding of format(x, '.1f').
        rounded = fill[col].map(lambda value: round(float(value), 1))
        df_shop[col] = scores[col].where(~needs_fill, rounded)
    return df_shop


# 处理review数据集中的userID.先保存ID:index的映射,然后逐个处理
def transform_user_id(df_review):
    """Build the ``userID -> index`` mapping for the review frame.

    Each distinct value of ``df_review["userID"]`` is numbered 0, 1, 2, ... in
    order of first appearance.

    Returns:
        dict mapping every distinct userID to its index.
    """
    distinct_users = df_review["userID"].unique()
    return {user: position for position, user in enumerate(distinct_users)}


def process_user_id(userID_map_dict, user_id):
    """Return the index stored for ``user_id`` in ``userID_map_dict``.

    Raises:
        KeyError: if ``user_id`` is not a key of the mapping.
    """
    mapped_index = userID_map_dict[user_id]
    return mapped_index


# 处理shop数据集和review数据集中的shopID.原理同上
def transform_shop_id(df_shop):
    """Build the ``shop_id -> index`` mapping for the shop frame.

    Each distinct value of ``df_shop["shop_id"]`` is numbered 0, 1, 2, ... in
    order of first appearance.

    Returns:
        dict mapping every distinct shop_id to its index.
    """
    distinct_shops = df_shop["shop_id"].unique()
    return dict(zip(distinct_shops, range(len(distinct_shops))))


def process_shop_id(shopID_map_dict, shop_id):
    """Return the index stored for ``shop_id`` in ``shopID_map_dict``.

    Raises:
        KeyError: if ``shop_id`` is not a key of the mapping.
    """
    mapped_index = shopID_map_dict[shop_id]
    return mapped_index


# 分割review数据集中的训练集与测试集
def split_review_data(df_review, train_ratio=0.8):
    """Split the review frame into train and test sets, user by user.

    For every ``userID`` group, the first ``int(n * train_ratio)`` rows (in the
    order they appear in ``df_review``) go to the train set and the remaining
    rows to the test set. Groups are emitted in the key order produced by
    ``groupby``, and the original row labels are kept.

    The returned frames list ``userID``, ``username``, ``shopID``, ``date`` and
    ``score`` first, followed by any other column of ``df_review``.

    Args:
        df_review: review DataFrame with at least a ``userID`` column.
        train_ratio: share of each user's rows assigned to the train set.

    Returns:
        Tuple ``(df_train_data, df_test_data)``. Both are empty frames when
        ``df_review`` has no rows.
    """
    leading_cols = ['userID', 'username', 'shopID', 'date', 'score']
    columns = leading_cols + [col for col in df_review.columns if col not in leading_cols]

    # Collect the slices and concatenate once: growing a frame with pd.concat
    # inside the loop copies all previously collected rows on every iteration.
    train_parts, test_parts = [], []
    for _, group in df_review.groupby("userID"):
        cutoff = int(group.shape[0] * train_ratio)
        train_parts.append(group.iloc[:cutoff])
        test_parts.append(group.iloc[cutoff:])

    def _assemble(parts):
        # pd.concat raises on an empty list, so fall back to an empty slice.
        frame = pd.concat(parts) if parts else df_review.iloc[0:0]
        return frame.reindex(columns=columns)

    return _assemble(train_parts), _assemble(test_parts)


# 获取用户的历史访问数据
# def get_user_history(df_train_data):
#     user_history_dict = {}
#     grouped = df_train_data.groupby("userID")
#     for userID, group in grouped:
#         for shopID in group["shopID"]:
#             if userID not in user_history_dict.keys():
#                 user_history_dict[userID] = [shopID]
#             else:
#                 user_history_dict[userID].append(shopID)
#     return user_history_dict

In [2]:
print("******Start clean dataset******")
# Keep the shops that are still open (status 0, attributes complete) and have at least 5 reviews.
df_shop = pd.read_csv("newData/dianping_shops1.csv", encoding="utf-8", low_memory=False)
df_shop = df_shop[(df_shop["status"] == 0) & (df_shop["review_count"] >= 5)]

******Start clean dataset******


In [3]:
# From the review data, find the shops that were visited at least k times.
df_review = pd.read_csv("newData/review_data.csv", encoding="utf-8", low_memory=False)
new_col = ['shop_id', 'appearance_counts']
# value_counts() gives one row per shopID with its number of reviews; the two
# resulting columns are renamed positionally.
df_reviewCountOfShop = df_review["shopID"].value_counts().reset_index()
df_reviewCountOfShop.columns = new_col
# A shop must appear in at least k = 10 reviews.
df_reviewCountOfShop = df_reviewCountOfShop.query("appearance_counts >= 10")

In [4]:
# In the shop data, keep the rows of the shops selected above (visited at least k times).
df_cleanShop = df_shop[df_shop["shop_id"].isin(df_reviewCountOfShop["shop_id"])]
print("数据集Shop数量为：", len(df_cleanShop))

数据集Shop数量为： 21040


In [5]:
# Keep the reviews whose shopID is in df_cleanShop, restricted to the columns used below.
# .copy() makes the result an independent frame, so the assignment to 'date'
# does not write into a slice of the loaded review data.
df_review = df_review.loc[
    df_review["shopID"].isin(df_cleanShop["shop_id"]),
    ["userID", "username", "shopID", "shopname", "score", "date"],
].copy()

# Normalise the date strings, then drop duplicated visit records and records without a score.
df_review["date"] = df_review["date"].map(date_struct)
df_review = (
    df_review
    .drop_duplicates(subset=["userID", "shopID", "score", "date"], keep="first")
    .dropna(subset=["score"])
)

# Keep the users with at least 10 remaining reviews. size() counts rows per
# user directly, independent of the content of any other column.
review_counts = df_review.groupby("userID").size()
tags = review_counts[review_counts >= 10].index
df_review = df_review[df_review["userID"].isin(tags)]

# Order every user's reviews chronologically so the train/test split can cut by position.
# A multi-column sort is stable, so reviews sharing a date keep their relative order.
df_review = df_review.sort_values(["userID", "date"], ascending=True)
df_review = df_review.reset_index(drop=True)
print("数据集Review数量为：", df_review.shape[0])
print("数据集User数量为：", len(tags))

数据集Review数量为： 1100867
数据集User数量为： 26566


In [7]:
print("******Start preprocess dataset******")
# Work on an independent copy: the shop_id re-indexing done later assigns into
# df_shop, and without the copy it would also rewrite df_cleanShop (which is
# itself a filtered slice of the loaded shop data).
df_shop = df_cleanShop.copy()
# # 处理平均价格与属性
# df_shop['match_avg_price'] = df_shop.apply(lambda x: match_avg_price(x['avg_price']), axis=1)
# df_shop = process_avg_price(df_shop)
# print("******Adjust shop avg_price successful******")
# df_shop = process_attributes(df_shop)
# print("******Adjust shop attributes successful******")

******Start preprocess dataset******


In [8]:
# Re-index the IDs of the review data to 0, 1, ...; keep only the review columns
# that are still needed, to reduce memory use.
userID_map_dict = transform_user_id(df_review)
# .copy() gives an independent frame, so the column assignments below never write into a slice.
df_review = df_review[["userID", "username", "shopID", "date", "score"]].copy()
# Series.map looks up one column value at a time; a row-wise DataFrame.apply
# would build a full row object for every review just to read a single field.
df_review["userID"] = df_review["userID"].map(lambda uid: process_user_id(userID_map_dict, uid))

shopID_map_dict = transform_shop_id(df_shop)
df_shop["shop_id"] = df_shop["shop_id"].map(lambda sid: process_shop_id(shopID_map_dict, sid))
df_review["shopID"] = df_review["shopID"].map(lambda sid: process_shop_id(shopID_map_dict, sid))
print("******Adjust Shop ID & User ID successful******")

******Adjust Shop ID & User ID successful******


In [13]:
# df_shop = df_shop[["shop_id", "name", 'avg_price', "regionname", "big_category", "small_category", "star", "longitude", "latitude", "score1", "score2", "score3", "review_count", "bookable"]]
# df_shop.to_csv("newData/Shop_data_10.csv", index=False, header=True)
df_review = df_review.reset_index(drop=True)
df_train_data, df_test_data = split_review_data(df_review)
# Write each split to its own CSV file, without the index and with a header row.
for split_frame, csv_path in (
    (df_train_data, "newData/Review_train.csv"),
    (df_test_data, "newData/Review_test.csv"),
):
    split_frame.to_csv(csv_path, index=False, header=True)
print("******Split train & test dataset successful******")

******Split train & test dataset successful******
