In [1]:
# 用LinearRegression做评分预测
# MAE: 0.61

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import  mean_absolute_error

In [3]:
# 读取数据
rating_file = '../jupyter_files/ml-latest-small/ratings.csv'
movie_file = '../jupyter_files/ml-latest-small/movies.csv'

ratings = pd.read_csv(rating_file)
movies = pd.read_csv(movie_file)

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [7]:
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [8]:
data = pd.merge(ratings, movies, how='left', on='movieId')

In [9]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [10]:
# 将timestamp转换为datetime，只保留年份
data['timestamp'] = data['timestamp'].apply(lambda x: datetime.datetime.utcfromtimestamp(x).year)

In [11]:
# 从genres中获取电影的类型
data['genres'] = data['genres'].apply(lambda x: x.split('|'))

In [12]:
genre_labels = set()
for g in data['genres'].values:
    genre_labels = genre_labels.union(set(g))

In [13]:
for g in genre_labels:
    data[g] = data['genres'].apply(lambda x: 1 if g in x else 0)

In [14]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Fantasy,Film-Noir,Musical,Sci-Fi,...,Adventure,Romance,Mystery,Crime,Horror,Western,War,(no genres listed),Comedy,Documentary
0,1,1,4.0,2000,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,3,4.0,2000,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,1,6,4.0,2000,Heat (1995),"[Action, Crime, Thriller]",0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,47,5.0,2000,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]",0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,50,5.0,2000,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]",0,0,0,0,...,0,0,1,1,0,0,0,0,0,0


In [15]:
# 求用户和电影偏差
group = data[['userId','rating']].groupby('userId').mean()
group.columns = ['user_bias']
group.reset_index(inplace=True)
data = pd.merge(data, group, on='userId', how='left')

group1 = data[['movieId','rating']].groupby('movieId').mean()
group1.columns = ['item_bias']
group1.reset_index(inplace=True)
data = pd.merge(data, group1, on='movieId', how='left')

In [16]:
data.drop(['userId', 'movieId', 'title', 'genres'], axis=1, inplace=True)  # 去除不用的列

In [17]:
data.head()

Unnamed: 0,rating,timestamp,Fantasy,Film-Noir,Musical,Sci-Fi,Action,IMAX,Children,Drama,...,Mystery,Crime,Horror,Western,War,(no genres listed),Comedy,Documentary,user_bias,item_bias
0,4.0,2000,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,4.366379,3.92093
1,4.0,2000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,4.366379,3.259615
2,4.0,2000,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,4.366379,3.946078
3,5.0,2000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,4.366379,3.975369
4,5.0,2000,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,4.366379,4.237745


In [18]:
# 划分训练集和测试集
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2020)

In [19]:
# 特征的归一化
for col in ['timestamp', 'user_bias', 'item_bias']:
    min_value = X_train[col].min()
    max_value = X_train[col].max()
    X_train[col] = (X_train[col] - min_value) / (max_value - min_value)
    X_test[col] = (X_test[col] - min_value) / (max_value - min_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
X_train.head()

Unnamed: 0,timestamp,Fantasy,Film-Noir,Musical,Sci-Fi,Action,IMAX,Children,Drama,Thriller,...,Mystery,Crime,Horror,Western,War,(no genres listed),Comedy,Documentary,user_bias,item_bias
86023,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0.539789,0.349206
6533,0.181818,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0.698239,0.722222
81687,0.954545,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0.298322,0.651515
8326,0.181818,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0.568552,0.577778
32633,0.818182,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0.507651,0.826584


In [21]:
# 训练模型
linear_model = LinearRegression().fit(X_train.values, y_train.values)

In [22]:
linear_model.coef_

array([-8.85281713e-02, -3.26468612e-03,  3.03050089e-03,  2.26541847e-02,
       -6.70933919e-03, -1.34813278e-02, -2.67754253e-02, -1.61038656e-02,
       -1.21281204e-03, -8.09735953e-03,  3.43053478e-02, -1.13244591e-02,
       -8.87471377e-03, -1.05032869e-02, -1.73425858e-03, -4.41327204e-03,
        1.75979651e-03,  8.45878841e-03,  6.99353585e-02,  5.97176040e-03,
        2.99888537e-02,  2.82465802e+00,  3.86565659e+00])

In [23]:
# 模型预测
y_test_pred = linear_model.predict(X_test.values)

In [24]:
# 模型评估
mae = mean_absolute_error(y_test.values, y_test_pred)

In [25]:
print("MAE:", mae)

MAE: 0.6135402556457775


In [26]:
# 观察一下，用所有评分的均值做预测的基准MAE是多少
y_base = y_train.mean()

In [27]:
erro_sum = 0
y_true = y_test.values
for i in range(len(y_true)):
    erro_sum += abs(y_true[i] - y_base)

In [28]:
base_mae = erro_sum / len(y_true)

In [29]:
print("BASE MAE:", base_mae)

BASE MAE: 0.829361433883728
