In [240]:
import pandas as pd

In [241]:
DIR = '/Users/lcq-mac/pycharm_projects/algorithms/tf_practice/tf_learning/deep_recommend/ml-1m/'
users_file = 'users.dat'
movies_file = 'movies.dat'
ratings_file = 'ratings.dat'

In [242]:
users_title = ['UserID', 'Gender', 'Age', 'OccupationID', 'Zip-code']
df_users = pd.read_table(DIR+users_file, sep='::', header=None, names=users_title, engine='python')
df_users.head()

Unnamed: 0,UserID,Gender,Age,OccupationID,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [243]:
movies_title = ['MovieID', 'Title', 'Genres']
df_movies = pd.read_table(DIR+movies_file, sep='::', header=None, names=movies_title, engine = 'python')
df_movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [244]:
ratings_title = ['UserID','MovieID', 'Rating', 'timestamps']
df_ratings = pd.read_table(DIR+ratings_file, sep='::', header=None, names=ratings_title, engine = 'python')
df_ratings.head()
# df_ratings.dtypes

Unnamed: 0,UserID,MovieID,Rating,timestamps
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [245]:
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [246]:
def load_data():
    """
    Load data from file
    """
    # 读取用户数据
    users_title = ['UserID', 'Gender', 'Age', 'OccupationID', 'Zip-code']
    users = pd.read_table(DIR+users_file, sep='::', header=None, names=users_title, engine='python')
    users = users.filter(regex='UserID|Gender|Age|OccupationID')
    users_origin = users.values
    gender_map = {'M': 1, 'F': 0}
    users['Gender'] = users['Gender'].map(gender_map)
    age_map = {age: x for x, age in enumerate(set(users['Age']))}
    users['Age'] = users['Age'].map(age_map)
    
    # 读取评分数据
    ratings_title = ['UserID','MovieID', 'Rating', 'timestamps']
    ratings = pd.read_table(DIR+ratings_file, sep='::', header=None, names=ratings_title, engine = 'python')
#     ratings['irank'] = ratings.groupby('UserID').sorted_values('timestamps', ascending=False)
    ratings = ratings.filter(regex='UserID|MovieID|Rating')
    
    # 读取电影数据
    movies_title = ['MovieID', 'Title', 'Genres']
    movies = pd.read_table(DIR+movies_file, sep='::', header=None, names=movies_title, engine = 'python')
    movies_origin = movies.values
    # 将Title中的年份去除
    pattern = re.compile(r'^(.*)\((\d+)\)$')
    title_rm_year = {title: pattern.match(title).group(1) for i, title in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_rm_year)
    # 将电影类型转数字字典
    gener_type = set()
    for val in movies['Genres'].str.split('|'):
        gener_type.update(val)
    gener_type.add('<PAD>')
    movies_map = {movies_type: i for i, movies_type in enumerate(gener_type)}
    # 将电影类型换成等长数字类型，长度为18
    gener_map = {movie_types:[movies_map[movie_type] for movie_type in movie_types.split('|')] for i, movie_types in enumerate(set(movies['Genres']))}
    for key in gener_map:
        for cnt in range(max(movies_map.values()) - len(gener_map[key])):
            gener_map[key].insert(len(gener_map[key]) + cnt, movies_map['<PAD>'])
    movies['Genres'] = movies['Genres'].map(gener_map) 
    
    # 将电影title转数字字典
    title_set = set()
    for title_word in movies['Title'].str.split():
        title_set.update(title_word)
    title_set.add('<PAD>')
    title_int = {word: i for i, word in enumerate(title_set)}
    # 将电影title转成等长数字列表，长度为15
    title_count = 15
    title_map = {title: [title_int[word] for word in title.split()] for i, title in enumerate(set(movies['Title']))}
    for key in title_map.keys():
        for cnt in range(title_count - len(title_map[key])):
            title_map[key].insert(len(title_map[key]) + cnt, title_int['<PAD>'])
    movies['Title'] = movies['Title'].map(title_map)
    
    # 合并三个表
    data = pd.merge(pd.merge(users, ratings, how='inner', on='UserID'), movies, how='inner', on='MovieID')
    
    # 将数据分成X和Y两张表
    target_fields = ['Rating']
    features_pd, target_pd = data.drop(target_fields, axis=1), data[target_fields]
    features = features_pd.values
    target_values = target_pd.values
    
    return title_count, title_set, movies_map, features, target_values, ratings, users, movies, data, movies_origin, users_origin



In [247]:
title_count, title_set, movies_map, features, target_values, ratings, users, movies, data, movies_origin, users_origin = load_data()
import pickle
pickle.dump([title_count, title_set, movies_map, features, target_values, ratings, users, movies, data, movies_origin, users_origin], open('preprocess.p', 'wb'))



In [248]:
users.head()

Unnamed: 0,UserID,Gender,Age,OccupationID
0,1,0,0,10
1,2,1,5,16
2,3,1,6,15
3,4,1,2,7
4,5,1,6,20


In [249]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,"[2202, 1345, 2816, 2816, 2816, 2816, 2816, 281...","[18, 13, 6, 17, 17, 17, 17, 17, 17, 17, 17, 17..."
1,2,"[5041, 2816, 2816, 2816, 2816, 2816, 2816, 281...","[5, 13, 2, 17, 17, 17, 17, 17, 17, 17, 17, 17,..."
2,3,"[1397, 4291, 30, 2816, 2816, 2816, 2816, 2816,...","[6, 12, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17..."
3,4,"[3805, 5183, 3061, 2816, 2816, 2816, 2816, 281...","[6, 9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,..."
4,5,"[2194, 1040, 3558, 1324, 526, 4704, 2816, 2816...","[6, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17..."


In [250]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [251]:
title_count, title_set, gener_type, features, target_values, ratings, users, movies, data, movies_origin, users_origin = pickle.load(open('preprocess.p', mode='rb'))


In [252]:
data.head()

Unnamed: 0,UserID,Gender,Age,OccupationID,MovieID,Rating,Title,Genres
0,1,0,0,10,1193,5,"[1630, 3804, 1344, 3558, 5048, 3283, 2816, 281...","[9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17..."
1,2,1,5,16,1193,5,"[1630, 3804, 1344, 3558, 5048, 3283, 2816, 281...","[9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17..."
2,12,1,6,12,1193,4,"[1630, 3804, 1344, 3558, 5048, 3283, 2816, 281...","[9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17..."
3,15,1,6,7,1193,4,"[1630, 3804, 1344, 3558, 5048, 3283, 2816, 281...","[9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17..."
4,17,1,3,1,1193,5,"[1630, 3804, 1344, 3558, 5048, 3283, 2816, 281...","[9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17..."


In [253]:
features[0]

array([1, 0, 0, 10, 1193,
       list([1630, 3804, 1344, 3558, 5048, 3283, 2816, 2816, 2816, 2816, 2816, 2816, 2816, 2816, 2816]),
       list([9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17])],
      dtype=object)

In [254]:
target_values[0]

array([5])

In [276]:
# 嵌入矩阵的维度
embed_dim = 32
# 用户id个数
uid_max = max(features.take(0, 1)) + 1 # 6040
# 性别个数
gender_max = max(features.take(1, 1)) + 1 # 1+1=2
# 年龄个数
age_max = max(features.take(2, 1)) + 1 # 6+1=7
# 职业个数
occu_max = max(features.take(3, 1)) + 1 # 20+1=21

# 电影ID个数
movie_id_max = max(features.take(4, 1)) + 1 # 3952
# 电影类型个数
movie_categories_max = max(movies_map.values()) + 1 # 18+1=19
# 电影名单词个数
movie_title_max = len(title_set)

#对电影类型嵌入向量做 加和操作 或者 mean操作 的标志，
combiner = "sum"

# 电影名长度
sentences_size = title_count # 15
# 文本卷积滑动窗口，分别为2， 3， 4， 5个单词
windows_size = {2, 3, 4, 5}
# 文本卷积核数量
filter_num = 8
# 将电影id转下表的字典，因为电影id所在的行跟其的id不一定完全一致，例如第5行的电影id不一定是5
movieid2idx = {val[0]: i for i, val in enumerate(movies.values)}

In [277]:
##超参数

num_epoch = 5
batch_size = 256
dropout_keep = 0.5
learning_rate = 0.0001
show_every_n_batches = 20

In [278]:
import tensorflow as tf
data.dtypes

UserID           int64
Gender           int64
Age              int64
OccupationID     int64
MovieID          int64
Rating           int64
Title           object
Genres          object
dtype: object

In [279]:
## 定义输入的占位符
def get_input():
    uid = tf.placeholder(tf.int32, [None, 1], name='uid')
    user_age = tf.placeholder(tf.int32, [None, 1], name='user_age')
    user_gender = tf.placeholder(tf.int32, [None, 1], name='user_gender')
    user_occu = tf.placeholder(tf.int32, [None, 1], name='user_occu')
    
    movie_id = tf.placeholder(tf.int32, [None, 1], name='movie_id')
    move_categories = tf.placeholder(tf.int32, [None, 18], name='move_categories')
    movie_titles = tf.placeholder(tf.int32, [None, 15], name='movie_titles')
    targets = tf.placeholder(tf.int32, [None, 1], name='targets')
    LearningRate = tf.placeholder(tf.int32, name='LearningRate')
    dropout_keep_prob = tf.placeholder(tf.int32, name='dropout_keep_prob')
    
    return uid, user_age, user_gender, user_occu, movie_id, move_categories, movie_titles, targets, LearningRate, dropout_keep_prob

    

In [280]:
## 构建user的嵌入矩阵
def get_user_embedding(uid, user_age, user_gender, user_occu):
    with tf.name_scope('user_embedding'):
        uid_embed_matrix = tf.Variable(tf.random_uniform([uid_max, embed_dim], -1, 1), name='uid_embed_matrix')
        uid_mebed_layer = tf.nn.embedding_lookup(uid_embed_matrix, uid, name='uid_embed_layer')
        
        age_embed_matrix = tf.Variable(tf.random_uniform([age_max, embed_dim//2], -1, 1), name='age_embed_matrix')
        age_embed_layer = tf.nn.embedding_lookup(age_embed_matrix, user_age, name='age_embed_layer')
        
        gender_embed_matrix = tf.Variable(tf.random_uniform([gender_max, embed_dim//2], -1, 1), name='gender_embed_matrix')
        gender_embed_layer = tf.nn.embedding_lookup(gender_embed_matrix, user_gender, name='gender_embed_layer')
        
        occu_embed_matrix = tf.Variable(tf.random_uniform([occu_max, embed_dim//2], -1, 1), name='occu_embed_matrix')
        occu_embed_layer = tf.nn.embedding_lookup(occu_embed_matrix, user_occu, name='occu_embed_layer')
        
    return uid_mebed_layer, age_embed_layer, gender_embed_layer, occu_embed_layer
        

In [281]:
## 将user的嵌入矩阵一起全连接生成user的特征
def get_user_feature_layer(uid_mebed_layer, age_embed_layer, gender_embed_layer, occu_embed_layer):
    with tf.name_scope('user_fc'):
        # 第一层全连接
        uid_fc_layer = tf.layers.dense(uid_mebed_layer, embed_dim, activation=tf.nn.relu)
        gender_fc_layer = tf.layers.dense(age_embed_layer, embed_dim, activation=tf.nn.relu)
        age_fc_layer = tf.layers.dense(gender_embed_layer, embed_dim, activation=tf.nn.relu)
        occu_fc_layer = tf.layers.dense(gender_embed_layer, embed_dim, activation=tf.nn.relu)
        # 第二层全连接
        user_combine_layer = tf.concat([uid_fc_layer, gender_fc_layer, age_fc_layer, occu_fc_layer], 2)
        user_combine_layer = tf.contrib.layers.fully_connected(user_combine_layer, 200, tf.tanh)
        
        user_combine_layer_flat = tf.reshape(user_combine_layer, [-1, 200])
        
    return user_combine_layer, user_combine_layer_flat
        

In [282]:

def get_movie_id_embed_layer(movie_id):
    with tf.name_scope('movie_embed'):
        movie_embed_matrix = tf.Variable(tf.random_uniform([movie_id_max, embed_dim], -1, 1), name='movie_embed_matrix')
        movie_embed_layer = tf.nn.embedding_lookup(movie_embed_matrix, movie_id, name='movie_embed_layer')
    
    return movie_embed_layer
        
        

In [283]:
# 对电影类型的多个向量做加和
def get_movie_categories_layer(movie_categories):
    with tf.name_scope('movie_categories_layer'):
        movie_categories_embed_matrix = tf.Variable(tf.random_uniform([movie_categories_max, embed_dim], -1, 1), name='movie_categories_embed_matrix')
        movie_categories_embed_layer = tf.nn.embedding_lookup(movie_categories_embed_matrix, movie_categories, name='movie_categories_embed_layer')
        if combiner == 'sum':
            movie_categories_embed_layer = tf.reduce_sum(movie_categories_embed_layer, axis=1, keep_dims=True)
            
    return movie_categories_embed_layer
    

In [287]:
# Movie title 的卷积神经网络实现
def get_movie_cnn_layer(movie_titles):
    #从嵌入矩阵中得到电影名对应的各个单词的嵌入向量
    with tf.name_scope("movie_embedding"):
        movie_title_embed_matrix = tf.Variable(tf.random_uniform([movie_title_max, embed_dim], -1, 1), name = "movie_title_embed_matrix")
        movie_title_embed_layer = tf.nn.embedding_lookup(movie_title_embed_matrix, movie_titles, name = "movie_title_embed_layer")
        movie_title_embed_layer_expand = tf.expand_dims(movie_title_embed_layer, -1)
    
    #对文本嵌入层使用不同尺寸的卷积核做卷积和最大池化
    pool_layer_lst = []
    for window_size in windows_size:
        with tf.name_scope("movie_txt_conv_maxpool_{}".format(window_size)):
            filter_weights = tf.Variable(tf.truncated_normal([window_size, embed_dim, 1, filter_num],stddev=0.1),name = "filter_weights")
            filter_bias = tf.Variable(tf.constant(0.1, shape=[filter_num]), name="filter_bias")
            
            conv_layer = tf.nn.conv2d(movie_title_embed_layer_expand, filter_weights, [1,1,1,1], padding="VALID", name="conv_layer")
            relu_layer = tf.nn.relu(tf.nn.bias_add(conv_layer,filter_bias), name ="relu_layer")
            
            maxpool_layer = tf.nn.max_pool(relu_layer, [1,sentences_size - window_size + 1 ,1,1], [1,1,1,1], padding="VALID", name="maxpool_layer")
            pool_layer_lst.append(maxpool_layer)

    #Dropout层
    with tf.name_scope("pool_dropout"):
        pool_layer = tf.concat(pool_layer_lst, 3, name ="pool_layer")
        max_num = len(windows_size) * filter_num
        pool_layer_flat = tf.reshape(pool_layer , [-1, 1, max_num], name = "pool_layer_flat")
    
        dropout_layer = tf.nn.dropout(pool_layer_flat, dropout_keep, name = "dropout_layer")
    return pool_layer_flat, dropout_layer

In [288]:
# 将Movie的各个层一起做全连接
def get_movie_feature_layer(movie_id_embed_layer, movie_categories_embed_layer, dropout_layer):
    with tf.name_scope("movie_fc"):
        #第一层全连接
        movie_id_fc_layer = tf.layers.dense(movie_id_embed_layer, embed_dim, name = "movie_id_fc_layer", activation=tf.nn.relu)
        movie_categories_fc_layer = tf.layers.dense(movie_categories_embed_layer, embed_dim, name = "movie_categories_fc_layer", activation=tf.nn.relu)
    
        #第二层全连接
        movie_combine_layer = tf.concat([movie_id_fc_layer, movie_categories_fc_layer, dropout_layer], 2)  #(?, 1, 96)
        movie_combine_layer = tf.contrib.layers.fully_connected(movie_combine_layer, 200, tf.tanh)  #(?, 1, 200)
    
        movie_combine_layer_flat = tf.reshape(movie_combine_layer, [-1, 200])
    return movie_combine_layer, movie_combine_layer_flat

In [289]:
# 构建计算图
tf.reset_default_graph()
train_graph = tf.Graph()
with train_graph.as_default():
    #获取输入占位符
    uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles, targets, lr, dropout_keep_prob = get_input()
    #获取User的4个嵌入向量
    uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer = get_user_embedding(uid, user_gender, user_age, user_job)
    #得到用户特征
    user_combine_layer, user_combine_layer_flat = get_user_feature_layer(uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer)
    #获取电影ID的嵌入向量
    movie_id_embed_layer = get_movie_id_embed_layer(movie_id)
    #获取电影类型的嵌入向量
    movie_categories_embed_layer = get_movie_categories_layer(movie_categories)
    #获取电影名的特征向量
    pool_layer_flat, dropout_layer = get_movie_cnn_layer(movie_titles)
    #得到电影特征
    movie_combine_layer, movie_combine_layer_flat = get_movie_feature_layer(movie_id_embed_layer, 
                                                                                movie_categories_embed_layer, 
                                                                                dropout_layer)
    #计算出评分，要注意两个不同的方案，inference的名字（name值）是不一样的，后面做推荐时要根据name取得tensor
    with tf.name_scope("inference"):
        #将用户特征和电影特征作为输入，经过全连接，输出一个值的方案
#         inference_layer = tf.concat([user_combine_layer_flat, movie_combine_layer_flat], 1)  #(?, 200)
#         inference = tf.layers.dense(inference_layer, 1,
#                                     kernel_initializer=tf.truncated_normal_initializer(stddev=0.01), 
#                                     kernel_regularizer=tf.nn.l2_loss, name="inference")
        #简单的将用户特征和电影特征做矩阵乘法得到一个预测评分
#        inference = tf.matmul(user_combine_layer_flat, tf.transpose(movie_combine_layer_flat))
        inference = tf.reduce_sum(user_combine_layer_flat * movie_combine_layer_flat, axis=1)
        inference = tf.expand_dims(inference, axis=1)

    with tf.name_scope("loss"):
        # MSE损失，将计算值回归到评分
        cost = tf.losses.mean_squared_error(targets, inference )
        loss = tf.reduce_mean(cost)
    # 优化损失 
#     train_op = tf.train.AdamOptimizer(lr).minimize(loss)  #cost
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(lr)
    gradients = optimizer.compute_gradients(loss)  #cost
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

In [290]:
inference

<tf.Tensor 'inference/ExpandDims:0' shape=(?, 1) dtype=float32>

In [291]:
# 取得batch
def get_batch(Xs, ys, batch_size):
    for start in range(0, len(Xs), batch_size):
        end = min(start+batch_size, len(Xs))
        yield Xs[start, end], ys[start, end]

In [None]:
# 训练网络
%matplotlib inline
import matplotlib.pyplot as plt

losses = {"tran": [], "test": []}

with tf.Session() as sess:
    