In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU, LSTM
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from keras_tqdm import TQDMNotebookCallback

import re
import datetime

# 加载数据集
ratings = pd.read_csv('ml-latest/ratings.csv')
print ('Shape of the ratings data frame:', ratings.shape)

tags = pd.read_csv('ml-latest/tags.csv')
print ('Shape of the tags data frame:', tags.shape)

movies = pd.read_csv('ml-latest/movies.csv')
print ('Shape of the movies data frame:', movies.shape)

tags = tags.sample(frac=0.2)
ratings = ratings.sample(frac=0.2)

Using TensorFlow backend.


('Shape of the ratings data frame:', (26024289, 4))
('Shape of the tags data frame:', (753170, 4))
('Shape of the movies data frame:', (45843, 3))


In [2]:
#除去时间，结合ratings和tags的数据
ratings = ratings.drop(['timestamp'],axis=1)
#ratings.head(n=5)
#Display summary statistics about data
#ratings.describe()
#Print sample tags data
#tags.head(n=5)
#Print sample movies data
#movies.head(n=5)
data = pd.merge(ratings, tags, how='inner')
print ('Shape of the joint data frame:', data.shape)
print (data.head(n=5))

('Shape of the joint data frame:', (26193, 5))
   userId  movieId  rating              tag   timestamp
0  256510     5869     2.5       Bibliothek  1138659665
1  168200    69122     4.0           comedy  1292434606
2  172179    37733     1.5    disappointing  1264106059
3  190554    63072     4.5  Viggo Mortensen  1263758929
4  169568     3362     4.0        Al Pacino  1255086480


## 清理 检查数据

In [3]:
#提取每个标签的数据个数
for column in data.columns:
    print (column, data[column].nunique())

('userId', 4054)
('movieId', 5487)
('rating', 10)
('tag', 7416)
('timestamp', 25772)


In [4]:
#从tag中删除任何特殊字符，以减少惟一标记的数量，并提高性能
data['rating'] = data['rating'].apply(lambda x: 1 if x > 4 else 0)
data['tag'] = data['tag'].apply(lambda x: str(x))
data['tag'] = data['tag'].map(lambda x: re.sub(r'([^\s\w]|_)+', '', x))
data['tag'] = data['tag'].str.lower()
#将时间转换为datetime格式
data['timestamp'] = data['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
data['timestamp'].astype('datetime64[ns]')[0:10]

0   2006-01-31 06:21:05
1   2010-12-16 01:36:46
2   2010-01-22 04:34:19
3   2010-01-18 04:08:49
4   2009-10-09 19:08:00
5   2016-10-13 09:10:04
6   2016-10-13 09:07:06
7   2015-09-27 17:17:26
8   2017-01-04 02:32:53
9   2017-01-04 02:32:59
Name: timestamp, dtype: datetime64[ns]

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tag'])
sequences = tokenizer.texts_to_sequences(data['tag'])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

pseq = pad_sequences(sequences)
pdseq = pd.DataFrame(pseq)

Found 6575 unique tokens.


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english',decode_error='ignore', analyzer='word')
corpus = data['tag'].values
wordvec = vectorizer.fit_transform(corpus.ravel())
wordvec = wordvec.toarray()

words = vectorizer.get_feature_names()
print("number of words in plot summary: ", len(words))
pdwordvec = pd.DataFrame(wordvec,columns=words)

('number of words in plot summary: ', 6391)


## 准备深度学习的数据
将斯坦福的glove.6B词汇嵌入作为预先训练的模型

In [7]:
embeddings_index = {}
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(words), 100))
for i in range(len(words)):
    embedding_vector = embeddings_index.get(words[i])
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
pdembedding = pd.DataFrame(embedding_matrix.T,columns=words)

Found 400000 word vectors.


In [8]:
# LSTM的深度学习只使用单词序列
dpdata = pd.concat([data, pdseq], axis=1)
dpdata = dpdata.drop(['tag'], axis=1)
dpdata = dpdata.drop(['userId'], axis=1)
dpdata = dpdata.drop(['movieId'], axis=1)

In [52]:
#构建训练和数据集
#train = dpdata[(dpdata['timestamp'] < '2016-08-01') ]
#test = dpdata[(dpdata['timestamp'] >= '2016-08-01') ]
from sklearn.model_selection import train_test_split
train, test = train_test_split(dpdata, test_size=0.2, random_state=0)
print "Number of rows in training data set:",(len(train))
print "Number of rows in test data set:", (len(test))

Number of rows in training data set: 20954
Number of rows in test data set: 5239


In [54]:
#删除时间戳
train = train.drop(['timestamp'], axis=1)
test = test.drop(['timestamp'], axis=1)
y_train = train['rating']
y_test = test['rating']
x_train = train.drop(['rating'], axis=1)
x_test = test.drop(['rating'], axis=1)

In [72]:
y_test_matrix = to_categorical(y_test)
y_train_matrix = to_categorical(y_train)
x_train_array = np.array(x_train)
x_test_array = np.array(x_test)
epochs = 20
lrate = 0.01
sgd = SGD(lr=lrate)
early_stopping = EarlyStopping(monitor='acc',patience=2)

model = Sequential()
model.add(Embedding(len(word_index)+1, 100, mask_zero=True, trainable=False))
model.add(LSTM(10, return_sequences=False))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, None, 100)         657600    
_________________________________________________________________
lstm_18 (LSTM)               (None, 10)                4440      
_________________________________________________________________
dense_18 (Dense)             (None, 2)                 22        
Total params: 662,062
Trainable params: 4,462
Non-trainable params: 657,600
_________________________________________________________________


In [76]:
# Fit the model 
#model.fit(x_train_array, y_train_matrix, validation_data=(x_test_array, y_ test_matrix), epochs=epochs, batch_size=100, verbose=1, class_weight='balanced')
model.fit(x_train_array, y_train_matrix, validation_data=(x_test_array, y_test_matrix), epochs=epochs, batch_size=50, class_weight='balanced')
# Final evaluation of the model
scores = model.evaluate(x_test_array, y_test_matrix, verbose=0)
print("Accuracy: %.4f%%" % (scores[1]*100))

Train on 20954 samples, validate on 5239 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 62.3592%


In [77]:
model = Sequential()
model.add(Embedding(len(word_index)+1, 100, mask_zero=True, trainable=False))
model.add(GRU(10, return_sequences=False))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, None, 100)         657600    
_________________________________________________________________
gru_1 (GRU)                  (None, 10)                3330      
_________________________________________________________________
dense_19 (Dense)             (None, 2)                 22        
Total params: 660,952
Trainable params: 3,352
Non-trainable params: 657,600
_________________________________________________________________


In [79]:
model.fit(x_train_array, y_train_matrix, validation_data=(x_test_array, y_test_matrix), epochs=epochs, batch_size=50, class_weight='balanced')
scores = model.evaluate(x_test_array, y_test_matrix, verbose=0)
print("Accuracy: %.4f%%" % (scores[1]*100))

Train on 20954 samples, validate on 5239 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 62.4928%


## 和传统方法进行加权计算

In [85]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(class_weight='balanced')
RFC.set_params(n_estimators=100)
RFC.fit(x_train,y_train)
y_pred = RFC.predict_proba(x_test)
R2_rfc = RFC.score(x_test,y_test) 
print "Accuracy of the test set for random forest is: ", np.round(R2_rfc,2)

Accuracy of the test set for random forest is:  0.58


In [86]:
#Calculate weighted probabilities 
y_predlstm = model.predict_proba(x_test_array)
y_pre = (0.6*y_predlstm + 0.4*y_pred)

#Predict ratings using the weighted probabilities
y_predensem = np.zeros((len(y_pre)))
for i in range(len(y_pre)):
    if y_pre[i,1] >= 0.5:
        y_predensem[i] = 1

from sklearn.metrics import accuracy_score
print "Accuracy of the test set for Ensemble model is: ", np.round(accuracy_score(y_test, y_predensem),2)   

Accuracy of the test set for Ensemble model is:  0.63
