In [None]:
import pandas as pd
import re

# Load the raw data.
# NOTE: the path is on Google Drive, so the drive has to be mounted at
# /content/drive (see the `drive.mount` cell) before this cell is run.
file_path = '/content/drive/MyDrive/DMSC.csv'
df = pd.read_csv(file_path, encoding='utf-8')

# First look at the raw data.
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()

# Cleaning pipeline, written as a chain so that re-running the cell never
# depends on a half-mutated frame:
#   1. drop rows that have no comment or no star rating,
#   2. drop fully duplicated rows,
#   3. parse the Date column (unparseable values become NaT),
#   4. drop the rows whose date could not be parsed.
df = (
    df.dropna(subset=['Comment', 'Star'])
    .drop_duplicates()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .dropna(subset=['Date'])
)

# Comment cleaning: remove every character that is neither a word character
# nor whitespace (punctuation, symbols, emoji). The pattern is compiled once
# instead of being re-looked-up for every row; str() guards against a stray
# non-string value reaching re.
non_word_pattern = re.compile(r'[^\w\s]')
df = df.assign(
    Comment=df['Comment'].map(lambda text: non_word_pattern.sub('', str(text)))
)

# Keep Chinese text from rendering as garbage in matplotlib output.
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # default sans-serif font
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box in saved figures

# Show the first five rows of the cleaned table.
print(df.head())

# Write the complete cleaned table.
df.to_csv('/content/drive/MyDrive/cleaned_DMSC.csv', index=False, encoding='utf-8-sig')


In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used by the other cells resolve. Run this cell before any cell that
# reads from or writes to those paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.1/511.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wordcloud
Successfully installed wordcloud-1.9.3


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os
from matplotlib.font_manager import FontProperties

# Font used both by WordCloud and for the figure titles.
font_path = '/content/drive/MyDrive/SimHei.ttf'
font_properties = FontProperties(fname=font_path)

# Directory that receives the word cloud images (created if missing).
output_dir = '/content/drive/MyDrive/wordcloud'
os.makedirs(output_dir, exist_ok=True)

# Load the cleaned data.
file_path = '/content/drive/MyDrive/cleaned_DMSC.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Build one word cloud for each of the first three movies in the data.
# NOTE(review): the comments are passed to WordCloud as-is, with no word
# segmentation step - confirm that the resulting tokens are the intended
# "high-frequency words" for Chinese text.
unique_movies = df['Movie_Name_CN'].unique()[:3]
for movie in unique_movies:
    is_current_movie = df['Movie_Name_CN'].eq(movie)
    movie_data = df.loc[is_current_movie]

    # All non-missing comments of this movie, joined into one text.
    comments = ' '.join(movie_data['Comment'].dropna())

    # Generate the cloud with the configured font.
    cloud_builder = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color='white',
        max_words=200,
        collocations=False,
    )
    wordcloud = cloud_builder.generate(comments)

    # Render the cloud without axes and save it as a PNG.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for {movie}', fontsize=16, fontproperties=font_properties)
    fig.savefig(f'{output_dir}/{movie}_wordcloud.png')
    plt.close(fig)

# Completion message.
print("前三个电影的词云已保存至 /content/drive/MyDrive/wordcloud 目录")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from matplotlib.font_manager import FontProperties
import seaborn as sns

# Font used for the figure titles.
font_path = '/content/drive/MyDrive/SimHei.ttf'
font_properties = FontProperties(fname=font_path)

# Directory that receives the figures (created if missing).
output_dir = '/content/drive/MyDrive/visualizations'
os.makedirs(output_dir, exist_ok=True)

# Load the cleaned data.
file_path = '/content/drive/MyDrive/cleaned_DMSC.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Plot the distribution of star ratings for each of the first three movies.
unique_movies = df['Movie_Name_CN'].unique()[:3]
for movie in unique_movies:
    is_current_movie = df['Movie_Name_CN'].eq(movie)
    movie_data = df.loc[is_current_movie]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(movie_data['Star'], bins=10, kde=True, ax=ax)
    ax.set_title(f'Rating Distribution for {movie}', fontsize=16, fontproperties=font_properties)
    ax.set_xlabel('Rating')
    ax.set_ylabel('Number of Reviews')
    fig.savefig(f'{output_dir}/{movie}_rating_distribution.png')
    plt.close(fig)

# Completion message.
print("前三个电影的评分分布已保存至 /content/drive/MyDrive/visualizations 目录")

前三个电影的评分分布已保存至 /content/drive/MyDrive/visualizations 目录


In [None]:
# Convert the Date column of the per-movie subset to datetime.
# `movie_data` is not defined in this cell: it is whatever subset the loop of
# a previously executed cell left in the kernel, so that cell must be run first.
# assign() builds a new frame with the converted column. Writing through
# `movie_data.loc[:, 'Date'] = ...` instead sets the values into the existing
# column in place (pandas 2.x), which leaves its dtype unchanged, and on a
# filtered slice it can still trigger SettingWithCopyWarning.
movie_data = movie_data.assign(Date=pd.to_datetime(movie_data['Date']))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from matplotlib.font_manager import FontProperties

# Font used for the figure titles.
font_path = '/content/drive/MyDrive/SimHei.ttf'
font_properties = FontProperties(fname=font_path)

# Keep Chinese text from rendering as garbage in matplotlib output.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # use the SimHei font
    'axes.unicode_minus': False,  # keep the minus sign '-' from rendering as a box in saved figures
})

# Directory that receives the rating-over-time figures (created if missing).
time_output_dir = '/content/drive/MyDrive/time_change'
os.makedirs(time_output_dir, exist_ok=True)

# Load the cleaned data.
file_path = '/content/drive/MyDrive/cleaned_DMSC.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Parse the dates (unparseable values become NaT and their rows are dropped),
# then derive the monthly period each review falls into.
df = df.assign(Date=pd.to_datetime(df['Date'], errors='coerce')).dropna(subset=['Date'])
df = df.assign(YearMonth=df['Date'].dt.to_period('M'))

# Rating over time: one trend line of the monthly mean rating per movie.
unique_movies = df['Movie_Name_CN'].unique()[:3]  # only the first three movies are plotted
for movie in unique_movies:
    fig, ax = plt.subplots(figsize=(10, 6))

    is_current_movie = df['Movie_Name_CN'].eq(movie)
    movie_data = df.loc[is_current_movie]
    avg_rating_by_month = movie_data.groupby('YearMonth')['Star'].mean()

    # Periods are plotted as their string form, i.e. as categorical x values.
    month_labels = avg_rating_by_month.index.astype(str)
    ax.plot(month_labels, avg_rating_by_month.values, marker='o')
    ax.set_title(f'{movie} Average Rating Over Time', fontsize=16, fontproperties=font_properties)
    ax.set_xlabel('Year-Month')
    ax.set_ylabel('Average Rating')
    plt.xticks(rotation=45)  # acts on the current axes, which is `ax`
    fig.savefig(f'{time_output_dir}/{movie}_rating_over_time.png')
    plt.close(fig)

# Completion message.
print("前三个电影的评分随时间变化的图已保存至 /content/drive/MyDrive/time_change 目录")



前三个电影的评分随时间变化的图已保存至 /content/drive/MyDrive/time_change 目录


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from matplotlib.font_manager import FontProperties

# Load the cleaned data.
file_path = '/content/drive/MyDrive/cleaned_DMSC.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Font used for the figure title.
font_path = '/content/drive/MyDrive/SimHei.ttf'
font_properties = FontProperties(fname=font_path)

# Keep Chinese text from rendering as garbage in matplotlib output.
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box in saved figures

# Directory that receives the comment-length figure (created if missing).
length_output_dir = '/content/drive/MyDrive/comment_length_analysis'
os.makedirs(length_output_dir, exist_ok=True)

# Comment length in characters.
# A comment that is empty in the CSV is read back as NaN; it is counted as
# length 0 here. Calling len(str(x)) on it would report 3, the length of 'nan'.
df['Comment_Length'] = df['Comment'].fillna('').astype(str).str.len()

# Down-sample to at most 3000 rows (fixed seed, so the sample is repeatable).
# The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the population.
sample_size = min(3000, len(df))
df_sampled = df.sample(n=sample_size, random_state=42)

# Scatter plot of comment length against rating, drawn from the sample.
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Comment_Length', y='Star', data=df_sampled, alpha=0.3)
plt.title('Relationship between Comment Length and Rating (Sampled Data)', fontsize=16, fontproperties=font_properties)
plt.xlabel('Comment Length')
plt.ylabel('Rating')
plt.savefig(f'{length_output_dir}/comment_length_vs_rating_scatter_sampled.png')
plt.close()

# Completion message.
print("评论字数与评分的关系散点图（采样数据）已保存至 /content/drive/MyDrive/comment_length_analysis 目录")




评论字数与评分的关系散点图（采样数据）已保存至 /content/drive/MyDrive/comment_length_analysis 目录
