#### 目次
1. movielensのデータを取得する
2. movielensのデータを加工し、保存する
3. TMDBのデータを取得する
4. TMDBのデータを保存する
5. 上記よりtrain, valid, testを作成する

In [1]:
import requests
import zipfile
from io import BytesIO
import os
from tqdm import tqdm
from dotenv import load_dotenv

import json
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', 100)

In [3]:
load_dotenv()
tmdb_key = os.getenv('TMDB_KEY')

#### 1.movielensのデータを取得する

下記のデータをダウンロードし、ml-20mフォルダに入れる  
https://grouplens.org/datasets/movielens/20m/

In [4]:
url = "http://files.grouplens.org/datasets/movielens/ml-20m.zip"

response = requests.get(url)
with zipfile.ZipFile(BytesIO(response.content)) as z:
    ml_20m_dir = [info.filename for info in z.infolist() if info.filename.endswith('ml-20m/')][0]
    z.extractall(path=".", members=[info for info in z.infolist() if info.filename.startswith(ml_20m_dir)])

print("MovieLens 20M データセットが ml-20m/ ディレクトリに保存されました。")

MovieLens 20M データセットが ml-20m/ ディレクトリに保存されました。


#### 2.movielensのデータを加工し、保存する

In [5]:
movies_path = './ml-20m/movies.csv'
ratings_path = './ml-20m/ratings.csv'
links_path = './ml-20m/links.csv'
tags_path = './ml-20m/tags.csv'
genome_scores_path = './ml-20m/genome-scores.csv'
genome_tags_path = './ml-20m/genome-tags.csv'

In [6]:
movies_df = pd.read_csv(movies_path)
ratings_df = pd.read_csv(ratings_path)
links_df = pd.read_csv(links_path)
tags_df = pd.read_csv(tags_path)
genome_scores_df = pd.read_csv(genome_scores_path)
genome_tags_df = pd.read_csv(genome_tags_path)

In [7]:
print(f"Length of movies_df: {len(movies_df)}")
print(f"Length of ratings_df: {len(ratings_df)}")
print(f"Length of links_df: {len(links_df)}")
print(f"Length of tags_df: {len(tags_df)}")
print(f"Length of genome_scores_df: {len(genome_scores_df)}")
print(f"Length of genome_tags_df: {len(genome_tags_df)}")

Length of movies_df: 27278
Length of ratings_df: 20000263
Length of links_df: 27278
Length of tags_df: 465564
Length of genome_scores_df: 11709768
Length of genome_tags_df: 1128


In [8]:
movies_df.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


movies_df(映画情報)
- 映画ID
- タイトル
- ジャンル

In [9]:
ratings_df.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027


ratings_df(ユーザーごとの映画評価値)
- ユーザーID
- 映画ID
- 評価値
- タイムスタンプ

In [10]:
links_df.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0


links_df(各プラットフォーム毎のID)
- 映画ID
- IMDB映画ID
- TMDB映画ID

In [11]:
tags_df.head(1)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180


tags_df(ユーザー入力のタグ)
- ユーザーID
- 映画ID
- タグ
- タイムスタンプ

In [12]:
genome_scores_df.head(1)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025


genome_scores_df(映画とタグの関連度)
- 映画ID
- タグID
- 関連度

In [13]:
genome_tags_df.head(1)

Unnamed: 0,tagId,tag
0,1,7


genome_tags_df(タグIDとタグ)
- タグID
- タグ

1. ratings_dfのmovieIdをlinks_dfのtmdbIdに変更する
2. 重複と欠損値の行を削除する
3. それ以外は使用しない

In [14]:
ratings_df = ratings_df.merge(links_df, on='movieId')

In [15]:
ratings_df = ratings_df[['userId', 'tmdbId', 'rating', 'timestamp']]

In [16]:
# 各列の統計値を個別に計算して表示
for column in ratings_df.columns:
    print(f"Column: {column}")
    print(f"最大値: {ratings_df[column].max()}")
    print(f"最小値: {ratings_df[column].min()}")
    print(f"平均値: {ratings_df[column].mean()}" if pd.api.types.is_numeric_dtype(ratings_df[column]) else "平均値: N/A")
    print(f"欠損値の数: {ratings_df[column].isnull().sum()}")
    print('---')

Column: userId
最大値: 138493
最小値: 1
平均値: 69045.87258292554
欠損値の数: 0
---
Column: tmdbId
最大値: 420743.0
最小値: 2.0
平均値: 9379.592749253903
欠損値の数: 12582
---
Column: rating
最大値: 5.0
最小値: 0.5
平均値: 3.5255285642993797
欠損値の数: 0
---
Column: timestamp
最大値: 1427784002
最小値: 789652004
平均値: 1100917921.6771142
欠損値の数: 0
---


In [17]:
ratings_df['tmdbId'] = pd.to_numeric(ratings_df['tmdbId'], errors='coerce')
ratings_df.dropna(subset=['tmdbId'], inplace=True)
ratings_df['tmdbId'] = ratings_df['tmdbId'].astype(int)

In [18]:
ratings_df.head(1)

Unnamed: 0,userId,tmdbId,rating,timestamp
0,1,8844,3.5,1112486027


In [19]:
ratings_df.to_csv('./data/ratings_df.csv', index=False)

#### 3.TMDBのデータを取得する

- tmdbIdのユニークな値からデータフレームを作成
- かなり時間がかかるので注意（データ数を減らしたり、取得する項目を減らして調整してください）

In [20]:
unique_tmdbIds = ratings_df['tmdbId'].unique()
unique_tmdbIds.sort()
movies_df = pd.DataFrame(unique_tmdbIds, columns=['tmdbId'])
print(f"Length of movies_df: {len(movies_df)}")

Length of movies_df: 26483


In [21]:
column_types = {
    'is_adult': ('boolean', np.nan),
    'budget': ('Int64', 0),
    'genre_names': ('string', np.nan),
    'genre_ids': ('string', np.nan),
    'original_language': ('string', np.nan),
    'original_title': ('string', np.nan),
    'overview': ('string', np.nan),
    'popularity': ('float64', 0),
    'production_companies_names': ('string', np.nan),
    'production_companies_ids': ('string', np.nan),
    'production_companies_origin_country': ('string', np.nan),
    'production_countries_names': ('string', np.nan),
    'release_date': ('datetime64[ns]', pd.NaT),
    'revenue': ('Int64', 0),
    'runtime': ('Int64', 0),
    'spoken_languages_english_name': ('string', np.nan),
    'status': ('string', np.nan),
    'tagline': ('string', np.nan),
    'title': ('string', np.nan),
    'video': ('boolean', np.nan),
    'vote_average': ('float64', 0),
    'vote_count': ('Int64', 0),
    'backdrop_path': ('string', np.nan),
    'poster_path': ('string', np.nan),
    'overview_jp': ('string', np.nan),
    'tagline_jp': ('string', np.nan),
    'title_jp': ('string', np.nan)
}

for column, (dtype, initial_value) in column_types.items():
    movies_df[column] = initial_value
    if dtype == 'datetime':
        movies_df[column] = pd.to_datetime(movies_df[column])
    else:
        movies_df[column] = movies_df[column].astype(dtype)

print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26483 entries, 0 to 26482
Data columns (total 28 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   tmdbId                               26483 non-null  int32         
 1   is_adult                             0 non-null      boolean       
 2   budget                               26483 non-null  Int64         
 3   genre_names                          0 non-null      string        
 4   genre_ids                            0 non-null      string        
 5   original_language                    0 non-null      string        
 6   original_title                       0 non-null      string        
 7   overview                             0 non-null      string        
 8   popularity                           26483 non-null  float64       
 9   production_companies_names           0 non-null      string        
 10  production

In [22]:
processed_ids = set()

In [26]:
for movie_id in tqdm(movies_df['tmdbId'].tolist()):
    if movie_id in processed_ids:
        continue

    try:
        url = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_key}'
        response = requests.get(url)
        data = json.loads(response.text)
        
        is_adult = data.get('adult')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'is_adult'] = is_adult
        
        budget = data.get('budget')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'budget'] = budget
        
        genres = data.get('genres')
        if isinstance(genres, list):
          genre_names = [genre['name'] for genre in genres]
          genre_ids = [genre['id'] for genre in genres]
        else:
          genre_names = []
          genre_ids = []
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'genre_names'] = [','.join(genre_names)]
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'genre_ids'] = [','.join(str(genre_ids))]
        
        original_language = data.get('original_language')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'original_language'] = original_language
        
        original_title = data.get('original_title')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'original_title'] = original_title
        
        overview = data.get('overview')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'overview'] = overview
        
        popularity = data.get('popularity')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'popularity'] = popularity
        
        production_companies = data.get('production_companies')
        if isinstance(production_companies, list):
          production_companies_names = [pc['name'] for pc in production_companies]
          production_companies_ids = [pc['id'] for pc in production_companies]
          production_companies_origin_country = [pc['origin_country'] for pc in production_companies]
        else:
          production_companies_names = []
          production_companies_ids = []
          production_companies_origin_country = []
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'production_companies_names'] = [','.join(production_companies_names)]
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'production_companies_ids'] = [','.join(str(production_companies_ids))]
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'production_companies_origin_country'] = [','.join(production_companies_origin_country)]
        
        production_countries = data.get('production_countries')
        if isinstance(genres, list):
          production_countries_names = [pc['name'] for pc in production_countries]
          # production_countries_isos = [pc['iso_3166_1'] for pc in production_countries]
        else:
          genre_names = []
          # genre_ids = []
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'production_countries_names'] = [','.join(production_countries_names)]
        # movies_df.loc[movies_df['tmdbId'] == movie_id, 'production_countries_isos'] = [','.join(str(production_countries_isos))]
        
        release_date = data.get('release_date')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'release_date'] = release_date
        
        revenue = data.get('revenue')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'revenue'] = revenue
        
        runtime = data.get('runtime')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'runtime'] = runtime
        
        spoken_languages = data.get('spoken_languages')
        if isinstance(genres, list):
          # spoken_languages_names = [sl['name'] for sl in spoken_languages]
          spoken_languages_english_name = [sl['english_name'] for sl in spoken_languages]
        else:
          # spoken_languages_names = []
          spoken_languages_english_name = []
        # movies_df.loc[movies_df['tmdbId'] == movie_id, 'spoken_languages_names'] = [','.join(spoken_languages_names)]
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'spoken_languages_english_name'] = [','.join(spoken_languages_english_name)]
        
        status = data.get('status')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'status'] = status
        
        tagline = data.get('tagline')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'tagline'] = tagline
        
        title = data.get('title')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'title'] = title
        
        video = data.get('video')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'video'] = video
        
        vote_average = data.get('vote_average')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'vote_average'] = vote_average
        
        vote_count = data.get('vote_count')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'vote_count'] = vote_count
        
        poster_path = None
        backdrop_path = None
        poster_path = data.get('poster_path')
        backdrop_path = data.get('poster_path')
        if poster_path is not None and backdrop_path is not None:
          movie_backdrop_path = "https://image.tmdb.org/t/p/w500" + backdrop_path
          movie_poster_path = "https://image.tmdb.org/t/p/w500" + poster_path
        
          movies_df.loc[movies_df['tmdbId'] == movie_id, 'backdrop_path'] = movie_backdrop_path
          movies_df.loc[movies_df['tmdbId'] == movie_id, 'poster_path'] = movie_poster_path
        
        url = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_key}&language=ja'
        response = requests.get(url)
        data = json.loads(response.text)
        
        overview = data.get('overview')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'overview_jp'] = overview
        
        tagline = data.get('tagline')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'tagline_jp'] = tagline
        
        title = data.get('title')
        movies_df.loc[movies_df['tmdbId'] == movie_id, 'title_jp'] = title

        processed_ids.add(movie_id)
    except Exception as e:
        print(f"エラー発生: {e} - movie_id: {movie_id}")

100%|████████████████████████████████████████████████████████████████████████████| 26483/26483 [09:57<00:00, 44.29it/s]


In [27]:
movies_df.head(1)

Unnamed: 0,tmdbId,is_adult,budget,genre_names,genre_ids,original_language,original_title,overview,popularity,production_companies_names,production_companies_ids,production_companies_origin_country,production_countries_names,release_date,revenue,runtime,spoken_languages_english_name,status,tagline,title,video,vote_average,vote_count,backdrop_path,poster_path,overview_jp,tagline_jp,title_jp
0,2,False,0,"Drama,Comedy,Romance,Crime","[,1,8,,, ,3,5,,, ,1,0,7,4,9,,, ,8,0,]",fi,Ariel,After the coal mine he works at closes and his...,16.786,Villealfa Filmproductions,"[,2,3,0,3,]",FI,Finland,1988-10-21,0,73,Finnish,Released,,Ariel,False,7.1,275,https://image.tmdb.org/t/p/w500/ojDg0PGvs6R9xY...,https://image.tmdb.org/t/p/w500/ojDg0PGvs6R9xY...,,,真夜中の虹


In [28]:
movies_df = movies_df.replace('', np.nan)

In [29]:
missing_values = movies_df.isnull().sum()
print(missing_values)

tmdbId                                     0
is_adult                                 273
budget                                   273
genre_names                              436
genre_ids                                  0
original_language                        273
original_title                           273
overview                                 357
popularity                               273
production_companies_names              2556
production_companies_ids                   0
production_companies_origin_country     4848
production_countries_names               996
release_date                             275
revenue                                  273
runtime                                  273
spoken_languages_english_name            791
status                                   273
tagline                                 9268
title                                    273
video                                    273
vote_average                             273
vote_count

#### 4. TMDBのデータを保存する

- ジャンルをワンホットエンコーディング

In [30]:
genres = movies_df['genre_names'].str.get_dummies(',')
genres = genres.add_prefix('genre_')
movies_df = pd.concat([movies_df, genres], axis=1)

In [31]:
movies_df.head(1)

Unnamed: 0,tmdbId,is_adult,budget,genre_names,genre_ids,original_language,original_title,overview,popularity,production_companies_names,production_companies_ids,production_companies_origin_country,production_countries_names,release_date,revenue,runtime,spoken_languages_english_name,status,tagline,title,video,vote_average,vote_count,backdrop_path,poster_path,overview_jp,tagline_jp,title_jp,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western
0,2,False,0,"Drama,Comedy,Romance,Crime","[,1,8,,, ,3,5,,, ,1,0,7,4,9,,, ,8,0,]",fi,Ariel,After the coal mine he works at closes and his...,16.786,Villealfa Filmproductions,"[,2,3,0,3,]",FI,Finland,1988-10-21,0,73,Finnish,Released,,Ariel,False,7.1,275,https://image.tmdb.org/t/p/w500/ojDg0PGvs6R9xY...,https://image.tmdb.org/t/p/w500/ojDg0PGvs6R9xY...,,,真夜中の虹,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [32]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26483 entries, 0 to 26482
Data columns (total 47 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   tmdbId                               26483 non-null  int32         
 1   is_adult                             26210 non-null  boolean       
 2   budget                               26210 non-null  Int64         
 3   genre_names                          26047 non-null  string        
 4   genre_ids                            26483 non-null  string        
 5   original_language                    26210 non-null  string        
 6   original_title                       26210 non-null  string        
 7   overview                             26126 non-null  string        
 8   popularity                           26210 non-null  float64       
 9   production_companies_names           23927 non-null  string        
 10  production

In [33]:
movies_df.to_csv('./data/movies_df.csv', index=False)