In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

# Directory that receives every artefact written by this cell.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
output_dir = r'D:\jupyter_project\individual_assignment\taobao'
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(output_dir, exist_ok=True)

# NLTK data needed below. 'punkt_tab' is included because recent NLTK releases
# load it (rather than 'punkt') for word_tokenize; without it this cell can
# fail on a fresh kernel.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load the raw reviews (path is relative to the notebook's working directory).
file_path = 'taobao_reviews_us_en.csv'
df = pd.read_csv(file_path)

# Keep an untouched copy of the raw data; df itself gains columns below.
df_original = df.copy()

# 文本清洗函数
def clean_text(text):
    """Normalise one raw review string for tokenisation.

    Steps, in order: lowercase; remove URLs; remove ``@user`` mentions; delete
    apostrophes; replace every other character that is not a word character,
    whitespace or one of ``. , ! ?`` with a space; remove digits; collapse
    runs of whitespace and trim.

    Args:
        text: Raw review text. Any non-``str`` value (for example the float
            NaN pandas uses for a missing cell) is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # 'http\S+' already matches https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Drop @mentions.
    text = re.sub(r'@\w+', '', text)

    # Apostrophes are deleted outright so a contraction stays one token ("i've" -> "ive").
    text = re.sub(r"['’‘]", '', text)

    # Every other special character becomes a space rather than vanishing;
    # deleting it would glue neighbours together ("app/website" -> "appwebsite").
    text = re.sub(r'[^\w\s\.\,\!\?]', ' ', text)

    # Drop digits.
    text = re.sub(r'\d+', '', text)

    # Collapse whitespace left behind by the substitutions above.
    return re.sub(r'\s+', ' ', text).strip()

# Apply the text cleaning to every review; the result goes into a new column
# so the raw '评论内容' column stays intact.
df['清洗后评论'] = df['评论内容'].apply(clean_text)

# Tokenisation and stop-word removal.
# Built once as a set; tokenize_and_remove_stopwords tests membership against it.
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    """Tokenise cleaned text with NLTK and drop uninformative tokens.

    Args:
        text: Cleaned review text. Non-strings and ``""`` yield an empty list.

    Returns:
        list[str]: Tokens from ``word_tokenize`` in their original order,
        excluding members of the module-level ``stop_words`` set,
        single-character tokens, and tokens with no letter or digit.
    """
    if not isinstance(text, str) or text == "":
        return []

    return [
        token
        for token in word_tokenize(text)
        if len(token) > 1
        and token not in stop_words
        # A punctuation-only token such as "..." is longer than one character,
        # so the length test alone would let it through.
        and any(ch.isalnum() for ch in token)
    ]

# Tokenise every cleaned review; each cell of '分词结果' holds a list of tokens.
df['分词结果'] = df['清洗后评论'].apply(tokenize_and_remove_stopwords)

# Lemmatisation: a single shared lemmatizer instance, used by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(tokens):
    """Return a new list holding the lemma of each token, in input order.

    Each token is passed to the module-level ``lemmatizer`` (a
    ``WordNetLemmatizer``) with its default arguments.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatise every token list.
df['词形还原'] = df['分词结果'].apply(lemmatize_text)

# Join the lemmas back into one space-separated string per review.
df['处理后文本'] = df['词形还原'].apply(lambda lemmas: ' '.join(lemmas))

# Persist the columns of interest (no index column) to the output directory.
cleaned_file_path = os.path.join(output_dir, 'cleaned_taobao_reviews.csv')
export_columns = ['评论ID', '用户名', '评论内容', '清洗后评论', '处理后文本', '评分', '点赞数', '评论时间']
df[export_columns].to_csv(cleaned_file_path, index=False)

# Before/after examples of the cleaning.
sample_size = 5
# A seeded generator makes the chosen examples reproducible across runs; the
# size is capped so a frame with fewer than sample_size rows does not raise.
rng = np.random.default_rng(42)
sample_indices = rng.choice(df.shape[0], size=min(sample_size, df.shape[0]), replace=False)
samples = df.iloc[sample_indices]

# First file: sample output.
with open(os.path.join(output_dir, 'text_cleaning_samples.txt'), 'w', encoding='utf-8') as f:
    f.write("文本清洗前后对比示例\n")
    f.write("=" * 80 + "\n\n")

    # Number the examples 1..n. The label yielded by iterrows() is the row's
    # index in df, which for a random sample is an arbitrary number.
    for example_no, (row_label, row) in enumerate(samples.iterrows(), start=1):
        f.write(f"示例 {example_no}:\n")
        f.write(f"原始文本: {row['评论内容']}\n")
        f.write(f"清洗后文本: {row['清洗后评论']}\n")
        f.write(f"分词结果: {', '.join(row['分词结果'])}\n")
        f.write(f"最终处理文本: {row['处理后文本']}\n")
        f.write("-" * 80 + "\n\n")



# Character-length statistics for the raw, cleaned and fully processed text.
# A non-string raw review (e.g. NaN for a missing comment) counts as length 0,
# matching clean_text's "non-string means empty" rule; a bare len() would
# raise TypeError on it.
df['原始长度'] = df['评论内容'].apply(lambda review: len(review) if isinstance(review, str) else 0)
df['清洗后长度'] = df['清洗后评论'].apply(len)
df['处理后长度'] = df['处理后文本'].apply(len)

length_stats = {
    label: {
        '平均长度': df[length_col].mean(),
        '最小长度': df[length_col].min(),
        '最大长度': df[length_col].max(),
    }
    for label, length_col in (
        ('原始文本', '原始长度'),
        ('清洗后文本', '清洗后长度'),
        ('处理后文本', '处理后长度'),
    )
}

# Actually write the statistics: the summary printed at the end of this cell
# reports this path, so the file has to exist.
stats_file_path = os.path.join(output_dir, 'text_length_stats.txt')
with open(stats_file_path, 'w', encoding='utf-8') as f:
    f.write("文本长度统计\n")
    f.write("=" * 80 + "\n\n")
    for label, stats in length_stats.items():
        f.write(f"{label}:\n")
        for stat_name, value in stats.items():
            f.write(f"  {stat_name}: {value:.2f}\n")
        f.write("\n")

# Recommended: keep file names and paths in English.
# Second file: cleaned_samples.txt
# NOTE(review): opened relative to the working directory, not output_dir.
with open("cleaned_samples.txt", "w", encoding="utf-8") as f:
    # Examples are numbered 1..n rather than by the row's index label in df.
    for example_no, (row_label, row) in enumerate(samples.iterrows(), start=1):
        f.write(f"示例 {example_no}:\n")
        f.write(f"原始文本: {row['评论内容']}\n")
        f.write(f"清洗后文本: {row['清洗后评论']}\n")
        f.write(f"分词结果: {', '.join(row['分词结果'])}\n")
        f.write("\n")


print("数据清洗和预处理完成，结果已保存到output目录")
print(f"清洗后的数据保存在: {cleaned_file_path}")
print(f"清洗前后的文本对比示例保存在: {os.path.join(output_dir, 'text_cleaning_samples.txt')}")
print(f"文本长度统计保存在: {stats_file_path}")


[nltk_data] Downloading package punkt to D:\Anaconda3\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to D:\Anaconda3\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to D:\Anaconda3\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


数据清洗和预处理完成，结果已保存到output目录
清洗后的数据保存在: D:\jupyter_project\individual_assignment\taobao\cleaned_taobao_reviews.csv
清洗前后的文本对比示例保存在: D:\jupyter_project\individual_assignment\taobao\text_cleaning_samples.txt
文本长度统计保存在: D:\jupyter_project\individual_assignment\taobao\text_length_stats.txt


In [2]:
!pip install nltk



In [3]:
import nltk

# Fetch the NLTK packages in the same order as before.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)
# Left as the cell's final bare expression so its return value is still displayed.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to D:\Anaconda3\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to D:\Anaconda3\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to D:\Anaconda3\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to D:\Anaconda3\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
import pandas as pd
import numpy as np
import re
import string
import os
import string
# Directory that receives every artefact written by this cell.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
output_dir = r'D:\jupyter_project\individual_assignment\taobao'
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(output_dir, exist_ok=True)

# Load the raw reviews (path is relative to the notebook's working directory).
file_path = 'taobao_reviews_us_en.csv'
df = pd.read_csv(file_path)

# Keep an untouched copy of the raw data; df itself gains columns below.
df_original = df.copy()

# 文本清洗函数
def clean_text(text):
    """Normalise one raw review string for tokenisation.

    Steps, in order: lowercase; remove URLs; remove ``@user`` mentions; delete
    apostrophes; replace every other character that is not a word character,
    whitespace or one of ``. , ! ?`` with a space; remove digits; collapse
    runs of whitespace and trim.

    Args:
        text: Raw review text. Any non-``str`` value (for example the float
            NaN pandas uses for a missing cell) is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # 'http\S+' already matches https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Drop @mentions.
    text = re.sub(r'@\w+', '', text)

    # Apostrophes are deleted outright so a contraction stays one token ("i've" -> "ive").
    text = re.sub(r"['’‘]", '', text)

    # Every other special character becomes a space rather than vanishing;
    # deleting it would glue neighbours together ("app/website" -> "appwebsite").
    text = re.sub(r'[^\w\s\.\,\!\?]', ' ', text)

    # Drop digits.
    text = re.sub(r'\d+', '', text)

    # Collapse whitespace left behind by the substitutions above.
    return re.sub(r'\s+', ' ', text).strip()

# Apply the text cleaning to every review; the result goes into a new column
# so the raw '评论内容' column stays intact.
df['清洗后评论'] = df['评论内容'].apply(clean_text)

# Simple tokenisation (no NLTK).
# Common English stop words, built once instead of on every call; a frozenset
# because it is only ever used for membership tests.
_COMMON_STOPWORDS = frozenset({
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
})


def simple_tokenize(text):
    """Split cleaned text into word tokens and drop stop words.

    Args:
        text: Cleaned review text. Non-strings and ``""`` yield an empty list.

    Returns:
        list[str]: Runs of word characters from ``text`` in their original
        order, excluding single-character tokens and common English stop words.
    """
    if not isinstance(text, str) or text == "":
        return []

    # Extracting \w+ runs splits on whitespace and on the . , ! ? that
    # clean_text keeps. A plain str.split() would yield "app," / "the," so
    # punctuation stuck to tokens and stop words before a comma slipped through.
    tokens = re.findall(r'\w+', text)

    return [word for word in tokens if len(word) > 1 and word not in _COMMON_STOPWORDS]

# Tokenise every cleaned review; each cell of '分词结果' holds a list of tokens.
df['分词结果'] = df['清洗后评论'].apply(simple_tokenize)

# Join the tokens back into one space-separated string per review.
df['处理后文本'] = df['分词结果'].apply(lambda tokens: ' '.join(tokens))

# Persist the columns of interest (no index column) to the output directory.
cleaned_file_path = os.path.join(output_dir, 'cleaned_taobao_reviews.csv')
export_columns = ['评论ID', '用户名', '评论内容', '清洗后评论', '处理后文本', '评分', '点赞数', '评论时间']
df[export_columns].to_csv(cleaned_file_path, index=False)

# Before/after examples of the cleaning.
sample_size = 5
# A seeded generator makes the chosen examples reproducible across runs; the
# size is capped so a frame with fewer than sample_size rows does not raise.
rng = np.random.default_rng(42)
sample_indices = rng.choice(df.shape[0], size=min(sample_size, df.shape[0]), replace=False)
samples = df.iloc[sample_indices]

with open(os.path.join(output_dir, 'text_cleaning_samples.txt'), 'w', encoding='utf-8') as f:

    f.write("文本清洗前后对比示例\n")
    f.write("=" * 80 + "\n\n")

    # Number the examples 1..n. The label yielded by iterrows() is the row's
    # index in df, which for a random sample is an arbitrary number.
    for example_no, (row_label, row) in enumerate(samples.iterrows(), start=1):
        f.write(f"示例 {example_no}:\n")
        f.write(f"原始文本: {row['评论内容']}\n")
        f.write(f"清洗后文本: {row['清洗后评论']}\n")
        f.write(f"分词结果: {', '.join(row['分词结果'])}\n")
        f.write(f"最终处理文本: {row['处理后文本']}\n")
        f.write("-" * 80 + "\n\n")

# Character-length statistics for the raw, cleaned and fully processed text.
# A non-string raw review (e.g. NaN for a missing comment) counts as length 0,
# matching clean_text's "non-string means empty" rule; a bare len() would
# raise TypeError on it.
df['原始长度'] = df['评论内容'].apply(lambda review: len(review) if isinstance(review, str) else 0)
df['清洗后长度'] = df['清洗后评论'].apply(len)
df['处理后长度'] = df['处理后文本'].apply(len)

length_stats = {
    label: {
        '平均长度': df[length_col].mean(),
        '最小长度': df[length_col].min(),
        '最大长度': df[length_col].max(),
    }
    for label, length_col in (
        ('原始文本', '原始长度'),
        ('清洗后文本', '清洗后长度'),
        ('处理后文本', '处理后长度'),
    )
}

# Actually write the statistics: the summary printed at the end of this cell
# reports this path, so the file has to exist.
stats_file_path = os.path.join(output_dir, 'text_length_stats.txt')
with open(stats_file_path, 'w', encoding='utf-8') as f:
    f.write("文本长度统计\n")
    f.write("=" * 80 + "\n\n")
    for label, stats in length_stats.items():
        f.write(f"{label}:\n")
        for stat_name, value in stats.items():
            f.write(f"  {stat_name}: {value:.2f}\n")
        f.write("\n")

# Written as UTF-8 so emoji and all other characters are supported.
# NOTE(review): opened relative to the working directory, not output_dir.
with open("cleaned_samples.txt", "w", encoding="utf-8") as f:
    # Examples are numbered 1..n rather than by the row's index label in df.
    for example_no, (row_label, row) in enumerate(samples.iterrows(), start=1):
        f.write(f"示例 {example_no}:\n")
        f.write(f"原始文本: {row['评论内容']}\n")
        f.write(f"清洗后文本: {row['清洗后评论']}\n")
        f.write(f"分词结果: {', '.join(row['分词结果'])}\n")
        f.write("\n")


print("数据清洗和预处理完成，结果已保存到output目录")
print(f"清洗后的数据保存在: {cleaned_file_path}")
print(f"清洗前后的文本对比示例保存在: {os.path.join(output_dir, 'text_cleaning_samples.txt')}")
print(f"文本长度统计保存在: {stats_file_path}")


数据清洗和预处理完成，结果已保存到output目录
清洗后的数据保存在: D:\jupyter_project\individual_assignment\taobao\cleaned_taobao_reviews.csv
清洗前后的文本对比示例保存在: D:\jupyter_project\individual_assignment\taobao\text_cleaning_samples.txt
文本长度统计保存在: D:\jupyter_project\individual_assignment\taobao\text_length_stats.txt


In [5]:
print("数据清洗和预处理完成，结果已保存到 taobao 目录")
# Report each artefact's path and flag any that is not on disk, so the
# summary cannot claim a file that was never written. When every file exists
# the printed lines are unchanged.
for label, filename in (
    ("清洗后的数据保存在", 'cleaned_taobao_reviews.csv'),
    ("清洗前后的文本对比示例保存在", 'text_cleaning_samples.txt'),
    ("文本长度统计保存在", 'text_length_stats.txt'),
):
    artefact_path = os.path.join(output_dir, filename)
    missing_note = "" if os.path.exists(artefact_path) else "  [文件不存在]"
    print(f"{label}: {artefact_path}{missing_note}")

数据清洗和预处理完成，结果已保存到 taobao 目录
清洗后的数据保存在: D:\jupyter_project\individual_assignment\taobao\cleaned_taobao_reviews.csv
清洗前后的文本对比示例保存在: D:\jupyter_project\individual_assignment\taobao\text_cleaning_samples.txt
文本长度统计保存在: D:\jupyter_project\individual_assignment\taobao\text_length_stats.txt


In [6]:
# Show every entry of the output directory, in the order os.listdir returns them.
print("以下是输出目录下的文件：")
directory_entries = os.listdir(output_dir)
for entry_name in directory_entries:
    print("✅", entry_name)

以下是输出目录下的文件：
✅ .ipynb_checkpoints
✅ Aclean2.1l去标点.ipynb
✅ cleaned_samples.txt
✅ cleaned_taobao_reviews.csv
✅ taobao_cleaner.ipynb
✅ taobao_crawler.ipynb
✅ taobao_reviews_cn_zh.csv
✅ taobao_reviews_us_en.csv
✅ text_cleaning_samples.txt
✅ 整理数据.ipynb


In [11]:
import pandas as pd
import string

# Read the input data. The path is relative, so it resolves against the
# notebook's working directory. This rebinds df, replacing whatever frame an
# earlier cell left in that name.
input_file = 'cleaned_taobao_reviews.csv'
df = pd.read_csv(input_file)

# Preview the first rows of the loaded data.
# NOTE(review): the frame includes a user-name column ('用户名'), so this
# output may contain personal data — consider redacting before sharing.
print("原始数据示例:")
print(df.head())

# 定义函数：移除标点符号
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned as is.

    Returns:
        The string without those punctuation characters, or the original
        object when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return text
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Strip punctuation from each target column that is present, and report any
# that is missing.
columns_to_clean = ['清洗后评论', '处理后文本']
for column_name in columns_to_clean:
    if column_name not in df.columns:
        print(f"未找到列：'{column_name}'，请确认列名")
        continue
    df[column_name] = df[column_name].apply(remove_punctuation)
    print(f"已移除列 '{column_name}' 中的标点符号")

# Save the result. output_file is the same path this cell read its input
# from, so that CSV is overwritten in place.
output_file = 'cleaned_taobao_reviews.csv'
df.to_csv(output_file, index=False)

print(f"清洗后的数据已保存为 {output_file}")



原始数据示例:
                                   评论ID                        用户名  \
0  fa046802-4a1c-45d3-84a6-68c02e875ed5  Maria Nelly Arevalo Toala   
1  588015dc-9dfe-408f-8bdf-f6a3c274780d              Rayna Maschan   
2  342b6a5f-1f2d-4ecb-af6a-35b3da4116e7                 Celina Lee   
3  bd012696-c2d0-436c-a163-970b89e8ae21                       Greg   
4  48835ad9-b550-4a5d-8775-7268349d4e01                 Linda Wong   

                                                评论内容  \
0  I've been using this app for about a month, an...   
1  Language Barrier and unfortunately, the only l...   
2  Keeps having you sign in repeatedly then claim...   
3  It's pretty satisfying and cheap. Only thing i...   
4  I bought more than two hundred products since ...   

                                               清洗后评论  \
0  ive been using this app for about a month, and...   
1  language barrier and unfortunately, the only l...   
2  keeps having you sign in repeatedly then claim...   
3  its pre