In [119]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU, LSTM
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from keras_tqdm import TQDMNotebookCallback

import re
import datetime

# 加载数据集
ratings = pd.read_csv('ml-latest/ratings.csv')
print ('Shape of the ratings data frame:', ratings.shape)

tags = pd.read_csv('ml-latest/tags.csv')
print ('Shape of the tags data frame:', tags.shape)

movies = pd.read_csv('ml-latest/movies.csv')
print ('Shape of the movies data frame:', movies.shape)

tags = tags.sample(frac=0.2)
ratings = ratings.sample(frac=0.2)

('Shape of the ratings data frame:', (26024289, 4))
('Shape of the tags data frame:', (753170, 4))
('Shape of the movies data frame:', (45843, 3))


In [120]:
#除去时间，结合ratings和tags的数据
ratings = ratings.drop(['timestamp'],axis=1)
#ratings.head(n=5)
#Display summary statistics about data
#ratings.describe()
#Print sample tags data
#tags.head(n=5)
#Print sample movies data
#movies.head(n=5)
data = pd.merge(ratings, tags, how='inner')
print ('Shape of the joint data frame:', data.shape)
print (data.head(n=5))

('Shape of the joint data frame:', (26398, 5))
   userId  movieId  rating                      tag   timestamp
0  244193     1252     5.0                  cynical  1440325509
1   30344     7802     3.5                    gangs  1474401821
2   84574   117368     3.0         children cartoon  1422086833
3   84353    59615     3.5  worse than predecessors  1289238229
4   84353    59615     3.5            indiana jones  1289238199


## 清理 检查数据

In [121]:
#提取每个标签的数据个数
for column in data.columns:
    print (column, data[column].nunique())

('userId', 4117)
('movieId', 5534)
('rating', 10)
('tag', 7469)
('timestamp', 25956)


In [122]:
#从tag中删除任何特殊字符，以减少惟一标记的数量，并提高性能
data['rating'] = data['rating'].apply(lambda x: 1 if x > 4 else 0)
data['tag'] = data['tag'].apply(lambda x: str(x))
data['tag'] = data['tag'].map(lambda x: re.sub(r'([^\s\w]|_)+', '', x))
data['tag'] = data['tag'].str.lower()
#将时间转换为datetime格式
data['timestamp'] = data['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
data['timestamp'].astype('datetime64[ns]')[0:10]

0   2015-08-23 18:25:09
1   2016-09-21 04:03:41
2   2015-01-24 16:07:13
3   2010-11-09 01:43:49
4   2010-11-09 01:43:19
5   2016-09-27 15:28:37
6   2016-09-27 15:29:00
7   2006-02-19 23:32:14
8   2013-05-25 05:43:39
9   2016-04-14 20:51:09
Name: timestamp, dtype: datetime64[ns]

In [123]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tag'])
sequences = tokenizer.texts_to_sequences(data['tag'])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

pseq = pad_sequences(sequences)
pdseq = pd.DataFrame(pseq)

Found 6682 unique tokens.


In [124]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english',decode_error='ignore', analyzer='word')
corpus = data['tag'].values
wordvec = vectorizer.fit_transform(corpus.ravel())
wordvec = wordvec.toarray()

words = vectorizer.get_feature_names()
print("number of words in plot summary: ", len(words))
pdwordvec = pd.DataFrame(wordvec,columns=words)

('number of words in plot summary: ', 6508)


## 准备深度学习的数据
将斯坦福的glove.6B词汇嵌入作为预先训练的模型

In [125]:
embeddings_index = {}
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(words), 100))
for i in range(len(words)):
    embedding_vector = embeddings_index.get(words[i])
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
pdembedding = pd.DataFrame(embedding_matrix.T,columns=words)

Found 400000 word vectors.


In [126]:
# LSTM的深度学习只使用单词序列
dpdata = pd.concat([data, pdseq], axis=1)
dpdata = dpdata.drop(['tag'], axis=1)
dpdata = dpdata.drop(['userId'], axis=1)
dpdata = dpdata.drop(['movieId'], axis=1)

In [85]:
#构建训练和数据集
train = dpdata[(dpdata['timestamp'] < '2016-08-01') ]
test = dpdata[(dpdata['timestamp'] >= '2016-08-01') ]

print "Number of rows in training data set:",(len(train))
print "Number of rows in test data set:", (len(test))

Number of rows in training data set: 22128
Number of rows in test data set: 4105


In [86]:
#删除时间戳
train = train.drop(['timestamp'], axis=1)
test = test.drop(['timestamp'], axis=1)
y_train = train['rating']
y_test = test['rating']
x_train = train.drop(['rating'], axis=1)
x_test = test.drop(['rating'], axis=1)

In [131]:
y_test_matrix = to_categorical(y_test)
y_train_matrix = to_categorical(y_train)
x_train_array = np.array(x_train)
x_test_array = np.array(x_test)

epochs = 20
lrate = 0.001
sgd = SGD(lr=lrate)
early_stopping = EarlyStopping(monitor='acc',patience=2)

model = Sequential()
model.add(Embedding(len(word_index)+1, 100, mask_zero=True, trainable=False))
model.add(LSTM(10, return_sequences=False))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 100)         668300    
_________________________________________________________________
lstm_11 (LSTM)               (None, 10)                4440      
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 22        
Total params: 672,762
Trainable params: 4,462
Non-trainable params: 668,300
_________________________________________________________________


In [132]:
# Fit the model 
model.fit(x_train_array, y_train_matrix, validation_data=(x_test_array, y_test_matrix), epochs=epochs, batch_size=100, class_weight='balanced', callbacks=[early_stopping, TQDMNotebookCallback()])
# Final evaluation of the model
scores = model.evaluate(x_test_array, y_test_matrix, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 22128 samples, validate on 4105 samples


Epoch 1/20


Epoch 2/20


Epoch 3/20


Epoch 4/20


Epoch 5/20

Accuracy: 60.68%
