In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)


In [2]:
df_incomplete = pd.read_json("data/dataset_nottoyé.json")
df = pd.read_json("data/dataset_donnees_complementaires.json")

df.drop(["jpbox_fr_title"], axis = 1, inplace=True)

In [3]:
df[df["jpbox_country"] == "Japon"]["imdb_budget"].value_counts()
df[df["jpbox_country"] == "Corée du Sud"]["imdb_budget"].value_counts()

# df["imdb_budget"].value_counts().head(20)

imdb_budget
10 000 000 $US (estimé)      2
10 000 000 000 ₩ (estimé)    2
1 300 000 000 ₩ (estimé)     1
6 420 000 $US (estimé)       1
19 000 000 000 ₩ (estimé)    1
100 000 $US (estimé)         1
14 750 000 $US (estimé)      1
4 200 000 000 ₩ (estimé)     1
5 000 000 $US (estimé)       1
12 215 500 000 ₩ (estimé)    1
16 000 000 $US (estimé)      1
39 200 000 $US (estimé)      1
11 400 000 $US (estimé)      1
Name: count, dtype: int64

In [4]:
# extraction des devises

import re

def extraire_devise(budget_str):
    if pd.isnull(budget_str):
        return None

    budget_str = budget_str.replace(" (estimé)", "").replace("\u202f", "").replace("\xa0", "").strip()
    match = re.search(r"[^\d\s]+$", budget_str)
    if match:
        return match.group()
    return None

# Appliquer à la colonne
df["devise"] = df["imdb_budget"].apply(extraire_devise)

df["devise"].unique()

array([None, '$US', '€', 'F', 'ATS', 'FIM', '$BN', '$CA', 'HKD', '£GB',
       '$AU', 'CNY', '₩', 'R$', '₹', 'CZK', '$NZ', 'SEK', '₪', 'NOK',
       '$SG', 'DKK', 'HUF', 'DEM', '$AR', 'RUR', 'PLN', 'EGP', 'ESP',
       'JPY', '$CO', 'CHF'], dtype=object)

In [5]:
# conversion des devises : 

conversion_rates = {
    "$US": 1,
    "€": 1, 
    # Devises standards
    "CNY": 1/7.2,
    "JPY": 1/150,
    "HKD": 1/7.8,
    "₩": 1/1350,
    "₹": 1/83,
    "CZK": 1/23,
    "$NZ": 1/0.60,
    "$AU": 1/0.65,
    "$CA": 1/0.73,
    "$SG": 1/0.74,
    "SEK": 1/10.6,
    "DKK": 1/6.9,
    "HUF": 1/370,
    "₪": 1/3.7,
    "NOK": 1/11,
    "£GB": 1/1.25,
    "R$": 1/5.1,
    "CHF": 1/0.91,
    "$AR": 1/850,
    "$CO": 1/3900,
    "EGP": 1/50,
    "PLN": 1/4.1,
    "RUR": 1/90,
    # Devises obsolètes (taux approximatifs)
    "F": 1/6.5,      # Franc français
    "ATS": 1/12.7,   # Schilling autrichien
    "FIM": 1/5.4,    # Markka finlandais
    "ESP": 1/140,    # Peseta espagnole
    "DEM": 1/1.7,    # Deutsche Mark
    # Cas douteux
    "$BN": 1, 
}


# fonction de conversion

def convert_to_usd(budget_str):
    if pd.isnull(budget_str):
        return None

    # Nettoyage
    budget_str = budget_str.replace(" (estimé)", "").replace("\u202f", "").replace("\xa0", "").strip()

    # Extraction
    match = re.match(r"([\d.,]+)\s*([^\d\s]+)$", budget_str)
    if not match:
        return None

    montant_str, devise = match.groups()

    # Conversion du montant
    try:
        montant = int(montant_str.replace(",", "").replace(".", "").strip())
    except:
        return None

    # Conversion
    taux = conversion_rates.get(devise)
    if taux is None:
        print(f"⚠️ Devise non reconnue : '{devise}' dans '{budget_str}'")
        return None

    return montant * taux

df["budget_dollar"] = df["imdb_budget"].apply(convert_to_usd)
df["budget_dollar"] = df["budget_dollar"].apply(lambda x: '{:.0f}'.format(x) if pd.notnull(x) else None)




In [6]:
df[df["jpbox_country"] == "Japon"][["imdb_budget", "budget_dollar"]].value_counts()


imdb_budget                 budget_dollar
30 000 000 $US (estimé)     30000000         2
120 000 $US (estimé)        120000           1
10 000 000 $US (estimé)     10000000         1
15 000 000 $US (estimé)     15000000         1
16 000 000 $US (estimé)     16000000         1
20 000 000 $US (estimé)     20000000         1
132 000 000 $US (estimé)    132000000        1
29 000 000 $US (estimé)     29000000         1
2 000 000 000 JPY (estimé)  13333333         1
2 500 000 $US (estimé)      2500000          1
40 000 000 $US (estimé)     40000000         1
4 000 000 € (estimé)        4000000          1
4 500 000 $US (estimé)      4500000          1
4 530 000 € (estimé)        4530000          1
60 000 000 CNY (estimé)     8333333          1
Name: count, dtype: int64

In [7]:
df["jpbox_budget"] = pd.to_numeric(df["jpbox_budget"])
df.isna().sum()

allocine_fr_title             0
jpbox_released_year           0
jpbox_actors                525
jpbox_directors             995
allocine_writer            1427
allocine_distribution      1577
jpbox_country                 0
jpbox_budget               2699
jpbox_category                0
jpbox_released_date           0
allocine_classification       0
jpbox_duration                0
jpbox_weekly_entrances        0
duration_minutes              0
imdb_url                      0
imdb_title                    0
imdb_released_year            0
imdb_directors                8
imdb_writer                  24
imdb_actors                   0
imdb_distribution            34
imdb_budget                3225
devise                     3225
budget_dollar              3225
dtype: int64

In [8]:
df.shape
df["imdb_actors"].value_counts()


imdb_actors
[]                                                          119
[Ariane Ascaride, Jean-Pierre Darroussin, Gérard Meylan]      7
[Daniel Radcliffe, Emma Watson, Rupert Grint]                 6
[Kristen Stewart, Robert Pattinson, Taylor Lautner]           4
[Jean-Paul Rouve, Isabelle Nanty, Claire Nadeau]              4
                                                           ... 
[Lukas Ionesco, Diane Rouxel, Théo Cholbi]                    1
[Aenne Schwarz, Andreas Döhler, Hans Löw]                     1
[Angus Cameron, Saskia Sassen, Brigitte Alepin]               1
[Manolo Solo, Jose Coronado, Ana Torrent]                     1
[Neta Riskin, Golshifteh Farahani, Yehuda Almagor]            1
Name: count, Length: 7084, dtype: int64

In [9]:
df["jpbox_directors"] = df["jpbox_directors"].fillna(df["imdb_directors"])
df["allocine_writer"] = df["allocine_writer"].fillna(df["imdb_writer"])
df["allocine_distribution"] = df["allocine_distribution"].fillna(df["imdb_distribution"])
df["jpbox_budget"] = df["jpbox_budget"].fillna(df["budget_dollar"])
# df["jpbox_budget"] = df["jpbox_budget"].str.replace(r"[^\d]", "", regex=True)

df["jpbox_actors"] = df["jpbox_actors"].fillna(df["imdb_actors"])
df["imdb_actors"] = df.apply(lambda row: row["imdb_actors"] if len(row["imdb_actors"]) > len(row["jpbox_actors"]) else row["jpbox_actors"],axis=1)

df.isna().sum()


allocine_fr_title             0
jpbox_released_year           0
jpbox_actors                  0
jpbox_directors               5
allocine_writer              20
allocine_distribution        14
jpbox_country                 0
jpbox_budget               1927
jpbox_category                0
jpbox_released_date           0
allocine_classification       0
jpbox_duration                0
jpbox_weekly_entrances        0
duration_minutes              0
imdb_url                      0
imdb_title                    0
imdb_released_year            0
imdb_directors                8
imdb_writer                  24
imdb_actors                   0
imdb_distribution            34
imdb_budget                3225
devise                     3225
budget_dollar              3225
dtype: int64

In [10]:
df = df.drop([
    "imdb_url", 
    "imdb_title", 
    "imdb_released_year", 
    "imdb_directors", 
    "imdb_writer", 
    "jpbox_actors", 
    "imdb_distribution", 
    "imdb_budget", 
],axis = 1)

In [11]:
df.isna().sum()

allocine_fr_title             0
jpbox_released_year           0
jpbox_directors               5
allocine_writer              20
allocine_distribution        14
jpbox_country                 0
jpbox_budget               1927
jpbox_category                0
jpbox_released_date           0
allocine_classification       0
jpbox_duration                0
jpbox_weekly_entrances        0
duration_minutes              0
imdb_actors                   0
devise                     3225
budget_dollar              3225
dtype: int64

In [12]:
df = df.dropna(subset="jpbox_directors")
df = df.dropna(subset="allocine_writer")
df = df.dropna(subset="allocine_distribution")

# df["jpbox_budget"] = pd.to_numeric(df["jpbox_budget"], errors="coerce")

# country_budget_mean = df.groupby('jpbox_country')['jpbox_budget'].transform('mean')
# df['jpbox_budget'] = df['jpbox_budget'].fillna(round(country_budget_mean))

print(df.shape)
df.isna().sum()

(7251, 16)


allocine_fr_title             0
jpbox_released_year           0
jpbox_directors               0
allocine_writer               0
allocine_distribution         0
jpbox_country                 0
jpbox_budget               1902
jpbox_category                0
jpbox_released_date           0
allocine_classification       0
jpbox_duration                0
jpbox_weekly_entrances        0
duration_minutes              0
imdb_actors                   0
devise                     3196
budget_dollar              3196
dtype: int64

In [13]:
df.dtypes

allocine_fr_title          object
jpbox_released_year         int64
jpbox_directors            object
allocine_writer            object
allocine_distribution      object
jpbox_country              object
jpbox_budget               object
jpbox_category             object
jpbox_released_date        object
allocine_classification    object
jpbox_duration             object
jpbox_weekly_entrances      int64
duration_minutes            int64
imdb_actors                object
devise                     object
budget_dollar              object
dtype: object

In [14]:
# liste des pays ou il nous manque le budget

df[df["jpbox_budget"].isna()]["jpbox_country"].value_counts()

jpbox_country
France             1044
Etats-Unis          218
Grande-Bretagne     109
Japon                59
Allemagne            44
                   ... 
Pays-Bas              1
Bulgarie              1
Afrique du Sud        1
Bosnie                1
Georgie               1
Name: count, Length: 66, dtype: int64

In [15]:
df.rename(columns={
    "jpbox_released_year" : "released_year",
    "imdb_actors" : "actors",
    "jpbox_directors" : "directors",
    "jpbox_country" : "country",
    "jpbox_budget" : "budget",
    "jpbox_category" : "category",
    "jpbox_released_date" : "released_date",
    "jpbox_duration" : "duration",
    "jpbox_weekly_entrances" : "weekly_entrances",
    "allocine_writer" : "writer",
    "allocine_fr_title" : "fr_title",
    "allocine_classification" : "classification",
    "allocine_distribution" : "distribution"
}, inplace=True)

In [16]:
df = df.drop(["budget_dollar", "devise"], axis = 1)

In [17]:
df[df["country"] == "Japon"]
# df.head()

Unnamed: 0,fr_title,released_year,directors,writer,distribution,country,budget,category,released_date,classification,duration,weekly_entrances,duration_minutes,actors
1123,"Aniki, mon frere",2000,Takeshi Kitano,Takeshi Kitano,Bac Films,Japon,10000000,Thriller,13/12/2000,Interdit - 12 ans,1h 54min,120974,114,"[Takeshi Kitano, Omar Epps, Kuroudo Maki]"
1455,Your Name,2016,Makoto Shinkai,Makoto Shinkai,Eurozoom,Japon,4000000,Animation,28/12/2016,Interdit - 10 ans,1h 47min,77345,107,"[Yoann Borg, Ryûnosuke Kamiki, Alice Orsat]"
1582,Battle Royale,2001,Kinji Fukasaku,Kenta Fukasaku,90 933 entrées,Japon,4500000,Drame,21/11/2001,Interdit - 16 ans,1h 53min,61322,113,"[Tatsuya Fujiwara, Aki Maeda, Taro Yamamoto]"
1802,Nobody knows,2004,Hirokazu Kore-eda,Hirokazu Kore-eda,ARP Sélection,Japon,,Drame,10/11/2004,Interdit - 12 ans,2h 21min,45361,141,"[Yûya Yagira, Ayu Kitaura, Hiei Kimura]"
1845,Still the Water,2014,Naomi Kawase,Naomi Kawase,Haut et Court,Japon,,Romance,01/10/2014,Tout public,1h 59min,42247,119,"[Nijirô Murakami, Jun Yoshinaga, Miyuki Matsuda]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,"Pokémon 2, le pouvoir est en toi",2000,Kunihiko Yuyama,Takeshi Shudo,Warner Bros. France,Japon,30000000.0,Animation,20/12/2000,Tout public,1h 40min,190699,100,"[Veronica Taylor, Rachael Lillis, Ted Lewis]"
6074,Dragon Ball Super: Super Hero,2022,Tetsuro Kodama,Akira Toriyama,Sony Pictures Releasing France,Japon,,Animation,05/10/2022,Tout public,1h 40min,218070,100,"[Brigitte Lecordier, Masako Nozawa, Mark Lesser]"
6526,"Albator, Corsaire de l'Espace",2013,Shinji Aramaki,Harutoshi Fukui,Océan Films,Japon,30000000.0,Animation,25/12/2013,Interdit - 10 ans,1h 50min,343181,110,"[Shun Oguri, Haruma Miura, Yû Aoi]"
6879,One Piece Film - Red,2022,Gorô Taniguchi,Brooklyn El-Omar,Toei Animation,Japon,,Animation,10/08/2022,Tout public,1h 55min,575182,115,"[Mayumi Tanaka, Kazuya Nakai, Akemi Okamura]"


In [18]:
df["actor_1"] = df["actors"].apply(lambda x: x[0] if len(x) > 0 else None)
df["actor_2"] = df["actors"].apply(lambda x: x[1] if len(x) > 1 else None)
df["actor_3"] = df["actors"].apply(lambda x: x[2] if len(x) > 2 else None)

df.drop(["actors"], inplace = True, axis = 1)


In [19]:
df.isna().sum()

fr_title               0
released_year          0
directors              0
writer                 0
distribution           0
country                0
budget              1902
category               0
released_date          0
classification         0
duration               0
weekly_entrances       0
duration_minutes       0
actor_1               72
actor_2              105
actor_3              128
dtype: int64

In [20]:
df[df["actor_1"].isna()]

df[["actor_1", "actor_2", "actor_3"]] = df[["actor_1", "actor_2", "actor_3"]].fillna("no_actor")

In [21]:
print(df.shape)
df.isna().sum()


(7251, 16)


fr_title               0
released_year          0
directors              0
writer                 0
distribution           0
country                0
budget              1902
category               0
released_date          0
classification         0
duration               0
weekly_entrances       0
duration_minutes       0
actor_1                0
actor_2                0
actor_3                0
dtype: int64

In [22]:
df["budget"].value_counts()

df.isna().sum()



fr_title               0
released_year          0
directors              0
writer                 0
distribution           0
country                0
budget              1902
category               0
released_date          0
classification         0
duration               0
weekly_entrances       0
duration_minutes       0
actor_1                0
actor_2                0
actor_3                0
dtype: int64

In [23]:
import json


df.to_csv("DATASET_FINAL.csv", index = False)

records = df.replace({np.nan: None}).to_dict(orient="records")
with open("DATASET_FINAL.json", "w", encoding="utf-8") as f:
    f.write("[\n")
    for i, record in enumerate(records):
        json_str = json.dumps(record, ensure_ascii=False)
        if i < len(records) - 1:
            f.write(f"    {json_str},\n")
        else:
            f.write(f"    {json_str}\n")
    f.write("]\n")


In [24]:
df["budget"].value_counts()

df["country"].value_counts().head(20)
df.isna().sum()

fr_title               0
released_year          0
directors              0
writer                 0
distribution           0
country                0
budget              1902
category               0
released_date          0
classification         0
duration               0
weekly_entrances       0
duration_minutes       0
actor_1                0
actor_2                0
actor_3                0
dtype: int64

In [25]:
df[df["country"] == "Japon"].head(20)

Unnamed: 0,fr_title,released_year,directors,writer,distribution,country,budget,category,released_date,classification,duration,weekly_entrances,duration_minutes,actor_1,actor_2,actor_3
1123,"Aniki, mon frere",2000,Takeshi Kitano,Takeshi Kitano,Bac Films,Japon,10000000.0,Thriller,13/12/2000,Interdit - 12 ans,1h 54min,120974,114,Takeshi Kitano,Omar Epps,Kuroudo Maki
1455,Your Name,2016,Makoto Shinkai,Makoto Shinkai,Eurozoom,Japon,4000000.0,Animation,28/12/2016,Interdit - 10 ans,1h 47min,77345,107,Yoann Borg,Ryûnosuke Kamiki,Alice Orsat
1582,Battle Royale,2001,Kinji Fukasaku,Kenta Fukasaku,90 933 entrées,Japon,4500000.0,Drame,21/11/2001,Interdit - 16 ans,1h 53min,61322,113,Tatsuya Fujiwara,Aki Maeda,Taro Yamamoto
1802,Nobody knows,2004,Hirokazu Kore-eda,Hirokazu Kore-eda,ARP Sélection,Japon,,Drame,10/11/2004,Interdit - 12 ans,2h 21min,45361,141,Yûya Yagira,Ayu Kitaura,Hiei Kimura
1845,Still the Water,2014,Naomi Kawase,Naomi Kawase,Haut et Court,Japon,,Romance,01/10/2014,Tout public,1h 59min,42247,119,Nijirô Murakami,Jun Yoshinaga,Miyuki Matsuda
1874,Tabou,2000,Nagisa Oshima,Nagisa Oshima,1 nomination,Japon,,Comédie dramatique,17/05/2000,Tout public,1h 40min,42157,100,Takeshi Kitano,Shinji Takeda,Tadanobu Asano
1917,De l'eau tiède sous un pont rouge,2001,Shôhei Imamura,Yo Henmi,BAP Inc.,Japon,,Comédie,28/11/2001,Tout public,1h 59min,37127,119,Kôji Yakusho,Misa Shimizu,Mitsuko Baishô
1918,"Les Enfants Loups, Ame & Yuki",2012,Mamoru Hosoda,Mamoru Hosoda,Eurozoom,Japon,,Animation,29/08/2012,Interdit - 6 ans,1h 57min,36672,117,Kumiko Aso,Aoi Miyazaki,Megumi Hayashibara
1927,A Man,2024,Kei Ishikawa,Kôsuke Mukai,Art House,Japon,40000000.0,Thriller,31/01/2024,Tout public,2h 01min,38635,121,Satoshi Tsumabuki,Sakura Andô,Masataka Kubota
2064,La Saveur des ramen,2018,Eric Khoo,Tan Fong Cheng,Art House / KMBO,Japon,,Drame,03/10/2018,Tout public,1h 30min,29450,90,Takumi Saitoh,Jeanette Aw Ee-Ping,Mark Lee (III)
