In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf

import os
import pickle
import re
# from tensorflow_core.python.ops import math_ops
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile
import hashlib

In [2]:
# Load the three MovieLens 1M tables. The files carry no header row, so the
# column names are supplied explicitly. The fields are separated by the
# two-character token '::', which pandas treats as a regular expression and
# can only parse with the python engine.
users_title = ['UserID', 'Gender', 'Age', 'OccupationID', 'Zip-code']
users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine='python')

movies_title = ['MovieID', 'Title', 'Genres']
# Some titles in movies.dat contain accented characters that are not valid
# UTF-8; with the default decoding they are mangled (or the read fails on
# newer pandas versions). Decode the file as ISO-8859-1 (Latin-1) instead.
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=movies_title,
                     engine='python', encoding='ISO-8859-1')

ratings_title = ['UserID','MovieID', 'Rating', 'timestamps']
ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=ratings_title, engine='python')

## 查看数据趋势

In [3]:
# Build one wide table with a row per rating: users and ratings are joined on
# their shared UserID column, then the result is joined to movies on MovieID.
# Both joins are inner joins (the pandas default).
data = users.merge(ratings, on='UserID').merge(movies, on='MovieID')

In [4]:
# Preview the merged table; the notebook renders only the first and last rows.
data

Unnamed: 0,UserID,Gender,Age,OccupationID,Zip-code,MovieID,Rating,timestamps,Title,Genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


## 不同性别对电影的平均评分
- 采用数据透视，建立以Title为行索引，Gender为列索引，mean为聚合方法来显示Rating中的数据。
- 这样就获得了一张我们自创的data_gender数据表，下面我们再对该数据表进行操作：

In [9]:
# Mean rating of every title, with one column per gender value.
# A cell is NaN when no user of that gender rated the title.
data_gender = pd.pivot_table(
    data,
    index='Title',
    columns='Gender',
    values='Rating',
    aggfunc='mean',
)

In [8]:
# Preview the per-gender mean ratings (rows are titles, columns are genders).
data_gender

Gender,F,M
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


- data_gender数据表中新插入了一列difference，用来存放男女用户评分的差值（difference = F − M，即女性平均分减去男性平均分）。
- 对difference列降序排列，即可看到女性评分明显高于男性的电影；升序排列则是男性评分明显高于女性的电影。只有单一性别评分的电影，其difference为NaN，排序时会排在最后。

In [11]:
# Movies on which female and male viewers disagree the most.
# 'difference' is the female mean minus the male mean; it is NaN whenever
# either of the two means is missing.
data_gender['difference'] = data_gender['F'].sub(data_gender['M'])
# Largest positive gap first; NaN rows are placed at the end by sort_values.
data_gender_sorted = data_gender.sort_values('difference', ascending=False)
data_gender_sorted

Gender,F,M,difference
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"James Dean Story, The (1957)",4.000000,1.000000,3.000000
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919)",4.000000,1.000000,3.000000
Country Life (1994),5.000000,2.000000,3.000000
Babyfever (1994),3.666667,1.000000,2.666667
"Woman of Paris, A (1923)",5.000000,2.428571,2.571429
...,...,...,...
With Friends Like These... (1998),,4.000000,
"Wooden Man's Bride, The (Wu Kui) (1994)",,3.000000,
Year of the Horse (1997),,3.250000,
Zachariah (1971),,3.500000,


## 平均分较高的电影

In [14]:

# Mean rating per title across all users, then ranked from best to worst.
data_mean_rating = pd.pivot_table(
    data,
    index='Title',
    values='Rating',
    aggfunc='mean',
)
data_mean_rating_sorted = data_mean_rating.sort_values('Rating', ascending=False)
data_mean_rating_sorted

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Ulysses (Ulisse) (1954),5.0
Lured (1947),5.0
Follow the Bitch (1998),5.0
Bittersweet Motel (2000),5.0
Song of Freedom (1936),5.0
...,...
"Fantastic Night, The (La Nuit Fantastique) (1949)",1.0
Cheetah (1989),1.0
Torso (Corpi Presentano Tracce di Violenza Carnale) (1973),1.0
Mutters Courage (1995),1.0


## 评分次数最多的热门电影

In [15]:
# How many ratings each title received (rows per title in the merged table),
# listed from the most-rated title down to the least-rated one.
data_rating_num = data.groupby(by='Title').size()
data_rating_num_sorted = data_rating_num.sort_values(ascending=False)
data_rating_num_sorted

Title
American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
                                                         ... 
Anna (1996)                                                 1
McCullochs, The (1975)                                      1
Shadows (Cienie) (1988)                                     1
Night Tide (1961)                                           1
Another Man's Poison (1952)                                 1
Length: 3706, dtype: int64

## 分析
- 为什么那些平均分高的电影，我从来没看过，甚至有些连听都没听过呢？这不符合常理：
- 国内外的好电影，按说大家都应该耳熟能详，所以这份排名一定存在问题。

- 有些电影只有极少数的人（1-2人）看过，并且觉得很好看，给了很高的评分，这个时候我们去分析数据
- 的时候，得到的就是这种极小众认为好看的电影。因此，我们应该对评分次数做出最小值限定，使数据更加合理

In [27]:
# Titles rated by only a handful of users dominate the raw mean-rating
# ranking, so restrict the ranking to titles with enough ratings.
# MIN_RATING_COUNT is the (exclusive) lower bound on the number of ratings;
# it is a named constant so the threshold can be tuned in one place.
MIN_RATING_COUNT = 400
popular_titles = data_rating_num[data_rating_num > MIN_RATING_COUNT].index
data_mean_rating_number = data_mean_rating.loc[popular_titles]
data_mean_rating_number_sorted = data_mean_rating_number.sort_values(by='Rating', ascending=False)
data_mean_rating_number_sorted

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.560510
"Shawshank Redemption, The (1994)",4.554558
"Godfather, The (1972)",4.524966
"Close Shave, A (1995)",4.520548
"Usual Suspects, The (1995)",4.517106
...,...
Judge Dredd (1995),2.308511
Batman & Robin (1997),2.257426
Congo (1995),2.238938
Wild Wild West (1999),2.158537


## 电影平均得分与热度综合起来

In [29]:
# Put each title's rating count next to its mean rating in a 'hot' column.
# The counts are looked up by title label, in the order of the ranked table.
data_mean_rating_number_sorted['hot'] = data_rating_num_sorted.loc[
    data_mean_rating_number_sorted.index
]
data_mean_rating_number_sorted

Unnamed: 0_level_0,Rating,hot
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.560510,628
"Shawshank Redemption, The (1994)",4.554558,2227
"Godfather, The (1972)",4.524966,2223
"Close Shave, A (1995)",4.520548,657
"Usual Suspects, The (1995)",4.517106,1783
...,...,...
Judge Dredd (1995),2.308511,564
Batman & Robin (1997),2.257426,606
Congo (1995),2.238938,565
Wild Wild West (1999),2.158537,902


## 总结：整体来说，评分高的电影，热度往往也较高（仅基于评分次数大于400的电影的直观观察，尚未计算评分与热度的相关性加以验证）