# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the TMDB top-rated movies export. The CSV is not bundled with this
# script: it has to be downloaded from the hosting site after registering.
df = pd.read_csv('top_rated_9000_movies_on_TMDB.csv')

# Cleaning: discard rows with any missing value, then exact duplicate rows.
df = df.dropna().drop_duplicates()

# Outlier check: keep only ratings inside the inclusive range [0, 10]
# (negative or otherwise implausible scores are removed).
df = df[df['vote_average'].between(0, 10)]

# Mean rating across all remaining movies.
vote_column = df['vote_average']
average_rating = vote_column.mean()
print(f"average: {average_rating}")

# Rows holding the highest and the lowest rating. idxmax/idxmin return a
# single index label, so when several movies tie only one of them is reported.
highest_rated = df.loc[vote_column.idxmax()]
lowest_rated = df.loc[vote_column.idxmin()]

print(f"highest vote: {highest_rated['title']}, vote: {highest_rated['vote_average']}")
print(f"lowest vote: {lowest_rated['title']}, vote: {lowest_rated['vote_average']}")

# Scatter plot of popularity (x) against rating (y).
# The colour of each point encodes only its row position in df.
indices = np.arange(len(df))

_scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(
    df['popularity'],
    df['vote_average'],
    alpha=0.3,
    c=indices,
    cmap='viridis',
)
scatter_ax.set_title('the relationship between popularity and vote')
scatter_ax.set_xlabel('popularity')
scatter_ax.set_ylabel('vote')
plt.show()

# Correlation between popularity and rating (Series.corr with its default
# method).
correlation = df['popularity'].corr(df['vote_average'])
print(f"correlation between popularity and vote: {correlation}")
# Analyse rating and popularity per movie genre.
# NOTE(review): genre_ids is assumed to hold a serialized list per movie
# (e.g. "[18, 80]") -- confirm against the CSV. Grouping on the raw value
# would treat every distinct *combination* of genres as its own category, so
# each movie is first expanded into one row per individual genre. A value that
# is already a single id passes through unchanged.
genre_rows = (
    df.assign(genre_ids=df['genre_ids'].astype(str).str.strip('[] ').str.split(','))
    .explode('genre_ids')
    .reset_index(drop=True)
)
# Remove padding/quotes left over from the serialized list and drop the empty
# token produced by movies that have no genre at all ("[]").
genre_rows['genre_ids'] = genre_rows['genre_ids'].str.strip(" '\"")
genre_rows = genre_rows[genre_rows['genre_ids'] != '']

# Mean rating and mean popularity per individual genre.
genre_groups = genre_rows.groupby('genre_ids').agg({'vote_average': 'mean', 'popularity': 'mean'}).reset_index()

# Bar chart: average rating per genre.
plt.figure(figsize=(12, 8))
sns.barplot(data=genre_groups, x='genre_ids', y='vote_average', palette='viridis')
plt.title('Average ratings for different types of movies')
plt.xlabel('type')
plt.ylabel('vote_average')
plt.show()

# Bar chart: average popularity per genre.
plt.figure(figsize=(12, 8))
sns.barplot(data=genre_groups, x='genre_ids', y='popularity', palette='viridis')
plt.title('Popularity of different types of movies')
plt.xlabel('type')
plt.ylabel('popularity')
plt.show()


# Extract the release year from each movie's release date.
df['release_year'] = pd.to_datetime(df['release_date']).dt.year

# Mean rating and mean popularity per release year.
yearly_trends = df.groupby('release_year').agg({'vote_average': 'mean', 'popularity': 'mean'}).reset_index()

# Plot both yearly trends. vote_average is bounded to 0-10 by the cleaning
# step, whereas popularity has no such bound, so sharing one y-axis can
# flatten the rating line and a single 'vote' label would be wrong for the
# popularity data. Each series therefore gets its own y-axis.
trend_fig, vote_ax = plt.subplots(figsize=(14, 8))
popularity_ax = vote_ax.twinx()

# Explicit colours: each Axes restarts the colour cycle, so without them both
# lines would be drawn in the same colour. legend=False stops seaborn from
# drawing one legend per Axes; a combined legend is built below.
sns.lineplot(data=yearly_trends, x='release_year', y='vote_average',
             label='vote_average', color='C0', legend=False, ax=vote_ax)
sns.lineplot(data=yearly_trends, x='release_year', y='popularity',
             label='popularity', color='C1', legend=False, ax=popularity_ax)

vote_ax.set_title('Annual trends in movie ratings and popularity')
vote_ax.set_xlabel('year')
vote_ax.set_ylabel('vote')
popularity_ax.set_ylabel('popularity')

# One legend covering the lines from both axes.
vote_handles, vote_labels = vote_ax.get_legend_handles_labels()
pop_handles, pop_labels = popularity_ax.get_legend_handles_labels()
vote_ax.legend(vote_handles + pop_handles, vote_labels + pop_labels)
plt.show()

# Rating distribution: 20-bin histogram with a KDE curve overlaid.
_rating_fig, rating_hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['vote_average'], bins=20, kde=True, color='blue', ax=rating_hist_ax)
rating_hist_ax.set(title='Rating distribution', xlabel='vote', ylabel='frequency')
plt.show()

# Popularity distribution: 20-bin histogram with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(df['popularity'], bins=20, kde=True, color='green')
plt.title('Popularity distribution')
plt.xlabel('popularity')
# Axis label spelling corrected (was 'frequncy').
plt.ylabel('frequency')
plt.show()

# Simple recommendation based purely on the average rating.
def recommend_movies(n=5, movies=None):
    """Return the ``n`` highest-rated movies.

    Args:
        n: Number of rows to return, forwarded to ``DataFrame.head``.
        movies: DataFrame with at least ``title`` and ``vote_average``
            columns. When omitted, the module-level ``df`` is used, which
            keeps existing ``recommend_movies(n)`` calls working.

    Returns:
        A DataFrame with the columns ``title`` and ``vote_average``, ordered
        by ``vote_average`` descending. Movies with equal ratings keep their
        original relative order.
    """
    source = df if movies is None else movies
    # A stable sort makes the order of tied ratings deterministic; the default
    # sort algorithm gives no such guarantee.
    ranked = source.sort_values(by='vote_average', ascending=False, kind='mergesort')
    return ranked.head(n)[['title', 'vote_average']]

# Recommend the five top-rated movies and print them under a heading line.
recommendations = recommend_movies(n=5)
print("recommend:", recommendations, sep="\n")