In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast
import itertools
import networkx as nx
import matplotlib.font_manager as fm
import plotly.express as px

## 1. 연도별 최고 관객수 동원 배우

In [2]:
# Load the dataset and derive the cleaned numeric/date columns in one chain.
# Each entry sees the columns produced by the entries above it.
df = pd.read_csv('5.2_dataset_forviz_add_gener.csv').assign(**{
    # Revenue and audience: commas are stripped before the numeric cast.
    '매출액': lambda frame: frame['매출액'].str.replace(',', '').astype(float),
    '관객수': lambda frame: frame['관객수'].str.replace(',', '').astype(int),
    # Release date parsed to datetime; the year is taken from the parsed value.
    '개봉일': lambda frame: pd.to_datetime(frame['개봉일']),
    '연도': lambda frame: frame['개봉일'].dt.year,
})

# Actor data clean-up
def safe_eval(x):
    """Parse the text form of a Python list (e.g. "['A', 'B']") into a list.

    Uses ``ast.literal_eval`` rather than ``eval`` so that cell contents read
    from the CSV can only ever be interpreted as literals, never executed.

    Args:
        x: Value to parse; normally a string, but any object is accepted.

    Returns:
        list: The parsed items when ``x`` is the literal of a list or tuple;
        an empty list for anything else (missing values, malformed text,
        non-sequence literals such as a bare number).
    """
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid Python literal (or not a string at all, e.g. NaN/None).
        return []
    # Callers iterate over the result, so only sequence literals are passed on.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []
# Turn each actor cell into a Python list via safe_eval.
df['actor_main_name'] = df['actor_main_name'].apply(safe_eval)

# Per year and per actor: accumulate the total audience and remember the
# film with the largest audience (its name, audience and revenue).
actor_audience = {}
for index, row in df.iterrows():
    year, audience, revenue, movie_name, actors = (
        row[column] for column in ('연도', '관객수', '매출액', '영화명', 'actor_main_name')
    )
    for actor in actors:
        # Year and actor entries are created lazily on first sight.
        yearly = actor_audience.setdefault(year, {})
        stats = yearly.setdefault(
            actor,
            {'Total Audience': 0, 'Top Movie Audience': 0, 'Top Movie Revenue': 0, 'Top Movie': ''},
        )
        stats['Total Audience'] += audience
        # Strictly greater: on a tie the earlier film is kept.
        if stats['Top Movie Audience'] < audience:
            stats.update({
                'Top Movie Audience': audience,
                'Top Movie Revenue': revenue,
                'Top Movie': movie_name,
            })

# Build the result frame: one record per year, holding the actor with the
# largest total audience that year plus that actor's top-film details.
results = []
for year, actors in actor_audience.items():
    top_actor, actor_info = max(
        actors.items(),
        key=lambda entry: entry[1]['Total Audience'],
    )
    results.append({
        'Year': year,
        'Actor': top_actor,
        **{
            field: actor_info[field]
            for field in ('Total Audience', 'Top Movie Audience', 'Top Movie Revenue', 'Top Movie')
        },
    })
result_df = pd.DataFrame(results)

# Helper that renders numbers with Korean units (만 = 10^4, 억 = 10^8)
def korean_number_format(num):
    """Format a number using the Korean units 만 (10^4) and 억 (10^8).

    The value is normalised to a whole number first, so float inputs (such
    as a revenue column cast to float) render as ``1234만 5678`` rather than
    ``1234.0만 5678.0``. Any fractional part is truncated.

    Args:
        num: Integer or float to format.

    Returns:
        str: ``str(value)`` below 10^4; ``"<n>만"`` or ``"<n>만 <rest>"``
        below 10^8; ``"<n>억"`` (remainder dropped) from 10^8 upwards.
        Values that cannot be converted to an integer (NaN, infinity) are
        returned as their plain ``str`` form.
    """
    try:
        value = int(num)
    except (ValueError, OverflowError):
        # NaN raises ValueError and infinity raises OverflowError.
        return str(num)

    if value < 10000:
        return str(value)
    if value < 100000000:
        man, rest = divmod(value, 10000)
        return f'{man}만 {rest}' if rest else f'{man}만'
    # From 10^8 upwards only the whole 억 count is shown.
    return f'{value // 100000000}억'

# Replace the top-film audience and revenue figures with Korean-unit strings.
for column in ('Top Movie Audience', 'Top Movie Revenue'):
    result_df[column] = result_df[column].apply(korean_number_format)

# Visualisation: one bar per year, coloured and labelled by the top actor.
axis_labels = {
    'Year': '연도',
    'Actor': '배우',
    'Total Audience': '총 관객수',
    'Top Movie Audience': '최고 히트작 관객수',
    'Top Movie Revenue': '최고 히트작 매출액',
    'Top Movie': '최고 히트작',
}
hover_columns = {
    name: True
    for name in ('Top Movie Audience', 'Top Movie Revenue', 'Top Movie')
}
fig = px.bar(
    result_df,
    x='Year',
    y='Total Audience',
    color='Actor',
    text='Actor',
    title='연도별 가장 많은 관객을 동원한 배우',
    hover_data=hover_columns,
    labels=axis_labels,
)
fig.update_traces(textposition='outside')
fig.show()
# Also save the figure as an HTML file.
fig.write_html('jh_actor_audience_year.html')