In [None]:
import os
import json
import pandas as pd
from typing import Optional, Dict, List, Any
def _load_data(filename: str) -> List[Dict]:
    """Load data as a list of dictionaries."""
    #file_path = os.path.join(data_dir, filename)
    with open(filename, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file]
user_data = {user['user_id']: user for user in _load_data('user.json')}
amazon_user = [user for user in user_data.values() if user.get('source')== 'yelp']


## Data analysis

依次分析三个json file，注意
1. 每一个json file都要注意查看不同的source，yelp，amazon，goodreads里的东西都不一样
2. 即使都是amazon的东西，json里面每行的键也可能是不一样的，需要check

- 经过检查，所有json文件，只要来源一样，键也都是一样的

3. 不同source的数据应该设计不同的分析方式

### item.json

In [1]:
import json  
from collections import defaultdict  

# 定义要读取的 JSON 文件路径  
file_path = 'item.json'  

# 使用字典来存储不同来源的数据  
source_data = defaultdict(list)  

# 逐行读取 JSON 文件  
with open(file_path, 'r', encoding='utf-8') as file:  
    for line in file:  
        try:  
            # 加载每行的 JSON 数据  
            review = json.loads(line.strip())  
            
            # 按照 source 分类  
            source = review.get('source')  
            source_data[source].append(review)  
        
        except json.JSONDecodeError as e:  
            print(f"解析错误: {e} - 在当前行: {line}")  

# 分析每个来源的数据特点  
for source, reviews in source_data.items():  
    print(f"\n{source.upper()} 数据分析:")  
    
    # 打印数据总数  
    print(f"总记录数: {len(reviews)}")  
    
    # 获取第一条记录的键  
    if reviews:  
        first_record = reviews[0]  
        print("键的组成:")  
        for key in first_record.keys():  
            print(f"- {key}")  
        
        # 打印第一条记录作为示例  
        print("\n第一条记录示例:")  
        print(json.dumps(first_record, ensure_ascii=False, indent=2))


YELP 数据分析:
总记录数: 32869
键的组成:
- item_id
- name
- address
- city
- state
- postal_code
- latitude
- longitude
- stars
- review_count
- is_open
- attributes
- categories
- hours
- source
- type

第一条记录示例:
{
  "item_id": "tUFrWirKiKi_TAnsVWINQQ",
  "name": "Target",
  "address": "5255 E Broadway Blvd",
  "city": "Tucson",
  "state": "AZ",
  "postal_code": "85711",
  "latitude": 32.223236,
  "longitude": -110.880452,
  "stars": 3.5,
  "review_count": 22,
  "is_open": 0,
  "attributes": {
    "BikeParking": "True",
    "BusinessAcceptsCreditCards": "True",
    "RestaurantsPriceRange2": "2",
    "CoatCheck": "False",
    "RestaurantsTakeOut": "False",
    "RestaurantsDelivery": "False",
    "Caters": "False",
    "WiFi": "u'no'",
    "BusinessParking": "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
    "WheelchairAccessible": "True",
    "HappyHour": "False",
    "OutdoorSeating": "False",
    "HasTV": "False",
    "RestaurantsReservations": "False",
  

分析：对于item，三个数据集键都不一样，要分别处理。yelp主要是target那种商家（所以才叫business，其它压根不是business），amazon主要是刻板印象中amazon上的物品，goodreads自然是卖书的。

In [2]:
import json  
from collections import defaultdict  

# 定义要读取的 JSON 文件路径  
file_path = 'review.json'  

# 使用字典来存储不同来源的数据  
source_data = defaultdict(list)  

# 逐行读取 JSON 文件  
with open(file_path, 'r', encoding='utf-8') as file:  
    for line in file:  
        try:  
            # 加载每行的 JSON 数据  
            review = json.loads(line.strip())  
            
            # 按照 source 分类  
            source = review.get('source')  
            source_data[source].append(review)  
        
        except json.JSONDecodeError as e:  
            print(f"解析错误: {e} - 在当前行: {line}")  

# 分析每个来源的数据特点  
for source, reviews in source_data.items():  
    print(f"\n{source.upper()} 数据分析:")  
    
    # 打印数据总数  
    print(f"总记录数: {len(reviews)}")  
    
    # 获取第一条记录的键  
    if reviews:  
        first_record = reviews[0]  
        print("键的组成:")  
        for key in first_record.keys():  
            print(f"- {key}")  
        
        # 打印第一条记录作为示例  
        print("\n第一条记录示例:")  
        print(json.dumps(first_record, ensure_ascii=False, indent=2))


YELP 数据分析:
总记录数: 1827321
键的组成:
- review_id
- user_id
- item_id
- stars
- useful
- funny
- cool
- text
- date
- source
- type

第一条记录示例:
{
  "review_id": "BiTunyQ73aT9WBnpR9DZGw",
  "user_id": "OyoGAe7OKpv6SyGZT5g77Q",
  "item_id": "7ATYjTIgM3jUlt4UM3IypQ",
  "stars": 5.0,
  "useful": 1,
  "funny": 0,
  "cool": 1,
  "text": "I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle. From the nice, clean space and amazing bikes, to the welcoming and motivating instructors, every class is a top notch work out.\n\nFor anyone who struggles to fit workouts in, the online scheduling system makes it easy to plan ahead (and there's no need to line up way in advanced like many gyms make you do).\n\nThere is no way I can write this review without giving Russell, the owner of Body Cycle, a shout out. Russell's passion for fitness and cycling is so evident, as is his desire for all of his clients to succeed. He is always dropping in to classes to check in/pr

In [3]:
import json  
from collections import defaultdict  

# 定义要读取的 JSON 文件路径  
file_path = 'user.json'  

# 使用字典来存储不同来源的数据  
source_data = defaultdict(list)  

# 逐行读取 JSON 文件  
with open(file_path, 'r', encoding='utf-8') as file:  
    for line in file:  
        try:  
            # 加载每行的 JSON 数据  
            review = json.loads(line.strip())  
            
            # 按照 source 分类  
            source = review.get('source')  
            source_data[source].append(review)  
        
        except json.JSONDecodeError as e:  
            print(f"解析错误: {e} - 在当前行: {line}")  

# 分析每个来源的数据特点  
for source, reviews in source_data.items():  
    print(f"\n{source.upper()} 数据分析:")  
    
    # 打印数据总数  
    print(f"总记录数: {len(reviews)}")  
    
    # 获取第一条记录的键  
    if reviews:  
        first_record = reviews[0]  
        
        # 特殊处理 friends 键  
        processed_record = first_record.copy()  
        if 'friends' in processed_record:  
            processed_record['friends'] = f"[friends列表，共 {len(first_record['friends'])} 个朋友]"  
        
        print("键的组成:")  
        for key in first_record.keys():  
            print(f"- {key}")  
        
        # 打印第一条记录作为示例  
        print("\n第一条记录示例:")  
        print(json.dumps(processed_record, ensure_ascii=False, indent=2))


YELP 数据分析:
总记录数: 558111
键的组成:
- user_id
- name
- review_count
- yelping_since
- useful
- funny
- cool
- elite
- friends
- fans
- average_stars
- compliment_hot
- compliment_more
- compliment_profile
- compliment_cute
- compliment_list
- compliment_note
- compliment_plain
- compliment_cool
- compliment_funny
- compliment_writer
- compliment_photos
- source

第一条记录示例:
{
  "user_id": "qVc8ODYU5SZjKXVBgXdI7w",
  "name": "Walker",
  "review_count": 585,
  "yelping_since": "2007-01-25 16:47:26",
  "useful": 7217,
  "funny": 1259,
  "cool": 5994,
  "elite": "2007",
  "friends": "[friends列表，共 359878 个朋友]",
  "fans": 267,
  "average_stars": 3.91,
  "compliment_hot": 250,
  "compliment_more": 65,
  "compliment_profile": 55,
  "compliment_cute": 56,
  "compliment_list": 18,
  "compliment_note": 232,
  "compliment_plain": 844,
  "compliment_cool": 467,
  "compliment_funny": 467,
  "compliment_writer": 239,
  "compliment_photos": 180,
  "source": "yelp"
}

AMAZON 数据分析:
总记录数: 194216
键的组成:
- user_id


In [6]:
## 检查是否键的格式都一样
import json  
from collections import defaultdict  

file_path = 'item.json'  
source_data = defaultdict(list)  
source_keys = defaultdict(set)  # 新增：用于记录每个来源的所有可能键  

# 逐行读取 JSON 文件  
with open(file_path, 'r', encoding='utf-8') as file:  
    for line in file:  
        try:  
            review = json.loads(line.strip())  
            source = review.get('source')  
            
            # 记录每个来源的所有可能键  
            source_keys[source].update(review.keys())  
            
            source_data[source].append(review)  
        
        except json.JSONDecodeError as e:  
            print(f"解析错误: {e} - 在当前行: {line}")  

# 分析每个来源的数据特点  
for source, reviews in source_data.items():  
    print(f"\n{source.upper()} 数据分析:")  
    
    # 打印数据总数  
    print(f"总记录数: {len(reviews)}")  
    
    # 打印该来源所有可能的键  
    print("所有可能的键:")  
    for key in sorted(source_keys[source]):  
        print(f"- {key}")  
    
    # 统计每个键的缺失情况  
    print("\n键的缺失情况:")  
    total_records = len(reviews)  
    for key in sorted(source_keys[source]):  
        present_count = sum(1 for review in reviews if key in review)  
        missing_percentage = (total_records - present_count) / total_records * 100  
        print(f"- {key}: 缺失 {missing_percentage:.2f}%")  


YELP 数据分析:
总记录数: 32869
所有可能的键:
- address
- attributes
- categories
- city
- hours
- is_open
- item_id
- latitude
- longitude
- name
- postal_code
- review_count
- source
- stars
- state
- type

键的缺失情况:
- address: 缺失 0.00%
- attributes: 缺失 0.00%
- categories: 缺失 0.00%
- city: 缺失 0.00%
- hours: 缺失 0.00%
- is_open: 缺失 0.00%
- item_id: 缺失 0.00%
- latitude: 缺失 0.00%
- longitude: 缺失 0.00%
- name: 缺失 0.00%
- postal_code: 缺失 0.00%
- review_count: 缺失 0.00%
- source: 缺失 0.00%
- stars: 缺失 0.00%
- state: 缺失 0.00%
- type: 缺失 0.00%

AMAZON 数据分析:
总记录数: 76047
所有可能的键:
- author
- average_rating
- bought_together
- categories
- description
- details
- features
- images
- item_id
- main_category
- price
- rating_number
- source
- store
- subtitle
- title
- type
- videos

键的缺失情况:
- author: 缺失 0.00%
- average_rating: 缺失 0.00%
- bought_together: 缺失 0.00%
- categories: 缺失 0.00%
- description: 缺失 0.00%
- details: 缺失 0.00%
- features: 缺失 0.00%
- images: 缺失 0.00%
- item_id: 缺失 0.00%
- main_category: 缺失 0.00%
- 