In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf

import os
import pickle
import re
# from tensorflow_core.python.ops import math_ops
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile
import hashlib

In [2]:
for module in np, pd, tf:
    print(module.__name__, module.__version__)

numpy 1.18.1
pandas 1.0.1
tensorflow 2.1.0


## 下载ml-1m 数据集
- 本项目使用的是MovieLens 1M 数据集，包含6040个用户在近3883部电影上的100w条评论。
- 数据集分为三个文件：用户数据users.dat，电影数据movies.dat和评分数据ratings.dat。

In [3]:
def _unzip(save_path, _, database_name, data_path):
    """
    Unzip wrapper with the same interface as _ungzip 解压
    :param save_path: The path of the gzip files
    :param database_name: Name of database
    :param data_path: Path to extract to
    :param _: HACK - Used to have to same interface as _ungzip
    """
    print('Extracting {}...'.format(database_name))
    with zipfile.ZipFile(save_path) as zf:
        zf.extractall(data_path)

def download_extract(database_name, data_path):
    """
    Download and extract database
    :param database_name: Database name
    """
    DATASET_ML1M = 'ml-1m'

    if database_name == DATASET_ML1M:
        url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
        hash_code = 'c4d9eecfca2ab87c1945afe126590906'
        extract_path = os.path.join(data_path, 'ml-1m')
        save_path = os.path.join(data_path, 'ml-1m.zip')
        extract_fn = _unzip

    if os.path.exists(extract_path):
        print('Found {} Data'.format(database_name))
        return

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(save_path):
        with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Downloading {}'.format(database_name)) as pbar:
            urlretrieve(
                url,
                save_path,
                pbar.hook)

    assert hashlib.md5(open(save_path, 'rb').read()).hexdigest() == hash_code, \
        '{} file is corrupted.  Remove the file and try again.'.format(save_path)

    os.makedirs(extract_path)
    try:
        extract_fn(save_path, extract_path, database_name, data_path)
    except Exception as err:
        shutil.rmtree(extract_path)  # Remove extraction folder if there is an error
        raise err

    print('Done.')
    # Remove compressed data
#     os.remove(save_path)

class DLProgress(tqdm):
    """
    Handle Progress Bar while Downloading 进度条
    """
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """
        A hook function that will be called once on establishment of the network connection and
        once after each block read thereafter.
        :param block_num: A count of blocks transferred so far
        :param block_size: Block size in bytes
        :param total_size: The total size of the file. This may be -1 on older FTP servers which do not return
                            a file size in response to a retrieval request.
        """
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

In [4]:
data_dir = './'
download_extract('ml-1m', data_dir)

Found ml-1m Data


## 用户数据
- 分别有用户ID、性别、年龄、职业ID和邮编等字段。
- 数据中的格式：UserID::Gender::Age::Occupation::Zip-code

In [5]:
users_title = ['UserID', 'Gender', 'Age', 'OccupationID', 'Zip-code']
users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine = 'python')
users.head()

Unnamed: 0,UserID,Gender,Age,OccupationID,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   UserID        6040 non-null   int64 
 1   Gender        6040 non-null   object
 2   Age           6040 non-null   int64 
 3   OccupationID  6040 non-null   int64 
 4   Zip-code      6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


## 电影数据
- 分别有电影ID、电影名和电影风格等字段。
- 数据中的格式：MovieID::Title::Genres

In [7]:
movies_title = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=movies_title, engine = 'python')
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 评分数据
- 分别有用户ID、电影ID、评分和时间戳等字段。
- 数据中的格式：UserID::MovieID::Rating::Timestamp

In [8]:
ratings_title = ['UserID','MovieID', 'Rating', 'timestamps']
ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,timestamps
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 实现数据预处理
- UserID、Occupation和MovieID不用变。
- Gender字段：需要将‘F’和‘M’转换成0和1。
- Age字段：要转成7个连续数字0~6。
- Genres字段：是分类字段，要转成数字。首先将Genres中的类别转成字符串到数字的字典，然后再将每个电影的Genres字段转成数字列表，因为有些电影是多个Genres的组合。
- Title字段：处理方式跟Genres字段一样，首先创建文本到数字的字典，然后将Title中的描述转成数字的列表。另外Title中的年份也需要去掉。
- Genres和Title字段需要将长度统一，这样在神经网络中方便处理。空白部分用‘< PAD >’对应的数字填充

In [9]:
def load_data():
    """
    Load Dataset from File
    """
    #读取User数据
    users_title = ['UserID', 'Gender', 'Age', 'JobID', 'Zip-code']
    users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine = 'python')
    users = users.filter(regex='UserID|Gender|Age|JobID')
    users_orig = users.values
    #改变User数据中性别和年龄
    gender_map = {'F':0, 'M':1}
    users['Gender'] = users['Gender'].map(gender_map)

    age_map = {val:ii for ii,val in enumerate(set(users['Age']))}
    users['Age'] = users['Age'].map(age_map)

    #读取Movie数据集
    movies_title = ['MovieID', 'Title', 'Genres']
    movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=movies_title, engine = 'python')
    movies_orig = movies.values
    #将Title中的年份去掉
    pattern = re.compile(r'^(.*)\((\d+)\)$')

    title_map = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_map)

    #电影类型转数字字典
    genres_set = set()
    for val in movies['Genres'].str.split('|'):
        genres_set.update(val)

    genres_set.add('<PAD>')
    genres2int = {val:ii for ii, val in enumerate(genres_set)}

    #将电影类型转成等长数字列表，长度是18
    genres_map = {val:[genres2int[row] for row in val.split('|')] for ii,val in enumerate(set(movies['Genres']))}

    for key in genres_map:
        for cnt in range(max(genres2int.values()) - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])
    
    movies['Genres'] = movies['Genres'].map(genres_map)

    #电影Title转数字字典
    title_set = set()
    for val in movies['Title'].str.split():
        title_set.update(val)
    
    title_set.add('<PAD>')
    title2int = {val:ii for ii, val in enumerate(title_set)}

    #将电影Title转成等长数字列表，长度是15
    title_count = 15
    title_map = {val:[title2int[row] for row in val.split()] for ii,val in enumerate(set(movies['Title']))}
    
    for key in title_map:
        for cnt in range(title_count - len(title_map[key])):
            title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])
    
    movies['Title'] = movies['Title'].map(title_map)

    #读取评分数据集
    ratings_title = ['UserID','MovieID', 'ratings', 'timestamps']
    ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')

    #合并三个表
    data = pd.merge(pd.merge(ratings, users), movies)
    
    #将数据分成X和y两张表
    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
    return title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig


## 加载数据并保存到本地

In [10]:
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()
pickle.dump((title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig), open('preprocess.p', 'wb'))


In [11]:
users.head()


Unnamed: 0,UserID,Gender,Age,JobID
0,1,0,0,10
1,2,1,5,16
2,3,1,6,15
3,4,1,2,7
4,5,1,6,20


In [12]:
movies.head()


Unnamed: 0,MovieID,Title,Genres
0,1,"[4188, 208, 3163, 3163, 3163, 3163, 3163, 3163...","[0, 18, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14,..."
1,2,"[3126, 3163, 3163, 3163, 3163, 3163, 3163, 316...","[5, 18, 4, 14, 14, 14, 14, 14, 14, 14, 14, 14,..."
2,3,"[2443, 461, 1602, 3163, 3163, 3163, 3163, 3163...","[3, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
3,4,"[3140, 4619, 4301, 3163, 3163, 3163, 3163, 316...","[3, 7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,..."
4,5,"[3721, 376, 3917, 3050, 1255, 1668, 3163, 3163...","[3, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."


## 从本地读取数据

In [13]:
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(open('preprocess.p', mode='rb'))


In [14]:
data

Unnamed: 0,UserID,MovieID,ratings,Gender,Age,JobID,Title,Genres
0,1,1193,5,0,0,10,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
1,2,1193,5,1,5,16,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
2,12,1193,4,1,6,12,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
3,15,1193,4,1,6,7,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
4,17,1193,5,1,3,1,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,1,4,17,"[5113, 3163, 3163, 3163, 3163, 3163, 3163, 316...","[9, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
1000205,5675,2703,3,1,1,14,"[199, 1166, 3163, 3163, 3163, 3163, 3163, 3163...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
1000206,5780,2845,1,1,4,17,"[3547, 3490, 3163, 3163, 3163, 3163, 3163, 316...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14..."
1000207,5851,3607,5,0,4,20,"[334, 4774, 2587, 3163, 3163, 3163, 3163, 3163...","[3, 7, 1, 14, 14, 14, 14, 14, 14, 14, 14, 14, ..."


In [25]:
def one_hot_obj_features(df, features):
    # 针对非list
    new_df = pd.get_dummies(df, columns=features, sparse=True)
    return new_df

In [26]:
def convert_str(var):
    return str(var).replace("[","").replace("]","").replace(",","")

def dump_series(df,var):
    return df.join(df[var].apply(convert_str).str.join('|').str.get_dummies())

In [28]:
none_list=['Age']
data_res = one_hot_obj_features(data, none_list)
data_res=dump_series(data_res,'Title')
#dummy_city    = pd.get_dummies(train2['city'],prefix = 'city')

# data_res=dump_series(data_res,'Genres')
# data_res.drop('Title',axis = 1,inplace = True)
# data_res.drop('Genres',axis = 1,inplace = True)
data_res

Unnamed: 0,UserID,MovieID,ratings,Gender,JobID,Title,Genres,Age_0,Age_1,Age_2,...,0,1,2,3,4,5,6,7,8,9
0,1,1193,5,0,10,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",1,0,0,...,0,1,1,1,1,1,1,1,1,1
1,2,1193,5,1,16,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",0,0,0,...,0,1,1,1,1,1,1,1,1,1
2,12,1193,4,1,12,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",0,0,0,...,0,1,1,1,1,1,1,1,1,1
3,15,1193,4,1,7,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",0,0,0,...,0,1,1,1,1,1,1,1,1,1
4,17,1193,5,1,1,"[334, 3695, 1171, 3917, 812, 937, 3163, 3163, ...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",0,0,0,...,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,1,17,"[5113, 3163, 3163, 3163, 3163, 3163, 3163, 316...","[9, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",0,0,0,...,0,1,0,1,0,1,1,0,0,0
1000205,5675,2703,3,1,14,"[199, 1166, 3163, 3163, 3163, 3163, 3163, 3163...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",0,1,0,...,0,1,0,1,0,0,1,0,0,1
1000206,5780,2845,1,1,17,"[3547, 3490, 3163, 3163, 3163, 3163, 3163, 316...","[7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14...",0,0,0,...,1,1,0,1,1,1,1,1,0,1
1000207,5851,3607,5,0,20,"[334, 4774, 2587, 3163, 3163, 3163, 3163, 3163...","[3, 7, 1, 14, 14, 14, 14, 14, 14, 14, 14, 14, ...",0,0,0,...,0,1,1,1,1,1,1,1,1,0


## 后续模型参考部分TODO