In [7]:
import requests
import pandas as pd
from tqdm import tqdm
import time

# Ta clé API TMDb
api_key = '9d6fb86c8e6e0842572265e5c282ca7e'  # Remplace ici !

# URL de base
base_url = 'https://api.themoviedb.org/3/discover/movie'

# Paramètres généraux
params = {
    'api_key': api_key,
    'sort_by': 'popularity.desc',
    'include_adult': 'false',
    'include_video': 'false',
    'language': 'en-US',
    'page': 1
}

# Stocker les résultats
movies = []

# On va récupérer 50 pages (environ 1000 films, 20 par page)
for page in tqdm(range(1, 51)):
    params['page'] = page
    response = requests.get(base_url, params=params)
    data = response.json()

    for movie in data.get('results', []):
        movies.append({
            'id': movie.get('id'),
            'title': movie.get('title'),
            'release_date': movie.get('release_date'),
            'genres': movie.get('genre_ids'),
            'language': movie.get('original_language'),
            'popularity': movie.get('popularity'),
            'vote_count': movie.get('vote_count'),
            'vote_average': movie.get('vote_average'),
            'overview': movie.get('overview')
        })

    time.sleep(0.3)  # petite pause pour éviter d’abuser

# Créer un DataFrame
df = pd.DataFrame(movies)

# Nettoyage : supprimer les films sans note
df = df[df['vote_average'] > 0]

# Sauvegarde
df.to_csv('tmdb_movies_dataset.csv', index=False)

print(f"✅ {len(df)} films collectés et sauvegardés dans tmdb_movies_dataset.csv")


100%|██████████| 50/50 [00:27<00:00,  1.80it/s]

✅ 985 films collectés et sauvegardés dans tmdb_movies_dataset.csv





In [8]:
df

Unnamed: 0,id,title,release_date,genres,language,popularity,vote_count,vote_average,overview
0,1197306,A Working Man,2025-03-26,"[28, 80, 53]",en,633.0389,480,6.385,Levon Cade left behind a decorated military ca...
1,668489,Havoc,2025-04-24,"[28, 80, 53]",en,580.3429,281,6.600,When a drug heist swerves lethally out of cont...
2,950387,A Minecraft Movie,2025-03-31,"[10751, 35, 12, 14]",en,423.6209,752,6.200,Four misfits find themselves struggling with o...
3,1276073,Bullet Train Explosion,2025-04-23,"[28, 53, 80, 18]",ja,349.9755,94,6.691,When panic erupts on a Tokyo-bound bullet trai...
4,1092899,The Siege,2023-03-10,"[28, 53]",en,291.2397,43,5.384,International assassin Walker is compromised d...
...,...,...,...,...,...,...,...,...,...
995,348893,Boyka: Undisputed IV,2016-08-01,"[28, 18, 53]",en,12.7799,1360,7.149,In the fourth installment of the fighting fran...
996,500,Reservoir Dogs,1992-09-02,"[80, 53]",en,13.8161,14592,8.121,A botched robbery indicates a police informant...
997,507241,The Killer's Game,2024-09-12,"[28, 35, 53]",en,13.0216,403,6.500,When top hitman Joe Flood is diagnosed with a ...
998,7446,Tropic Thunder,2008-08-09,"[28, 35, 12, 10752]",en,14.7222,6383,6.686,A group of self-absorbed actors set out to mak...


In [9]:
import ast

# 1. Mapping TMDb des genres
genre_id_map = {
    28: "Action",
    12: "Adventure",
    16: "Animation",
    35: "Comedy",
    80: "Crime",
    99: "Documentary",
    18: "Drama",
    10751: "Family",
    14: "Fantasy",
    36: "History",
    27: "Horror",
    10402: "Music",
    9648: "Mystery",
    10749: "Romance",
    878: "Science Fiction",
    10770: "TV Movie",
    53: "Thriller",
    10752: "War",
    37: "Western"
}

# 2. Fonction pour convertir [ids] → ['names']
def convert_genre_ids(genre_list):
    return [genre_id_map.get(gid) for gid in genre_list if gid in genre_id_map]

# 3. Nettoyer la colonne "genres"
# (au cas où c’est une chaîne de caractères)
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(str(x)))
df['genre_names'] = df['genres'].apply(convert_genre_ids)

# 4. Ajouter une colonne texte lisible
df['genres_str'] = df['genre_names'].apply(lambda x: ', '.join(x))

# 5. (Optionnel) One-hot encoding multilabel
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(df['genre_names']), columns=mlb.classes_)

# Fusionner avec le DataFrame d’origine
df = pd.concat([df.reset_index(drop=True), genre_dummies.reset_index(drop=True)], axis=1)


# Affichage
print(df[['title', 'genres_str'] + list(genre_dummies.columns)].head())


                    title                          genres_str  Action  \
0           A Working Man             Action, Crime, Thriller       1   
1                   Havoc             Action, Crime, Thriller       1   
2       A Minecraft Movie  Family, Comedy, Adventure, Fantasy       0   
3  Bullet Train Explosion      Action, Thriller, Crime, Drama       1   
4               The Siege                    Action, Thriller       1   

   Adventure  Animation  Comedy  Crime  Documentary  Drama  Family  ...  \
0          0          0       0      1            0      0       0  ...   
1          0          0       0      1            0      0       0  ...   
2          1          0       1      0            0      0       1  ...   
3          0          0       0      1            0      1       0  ...   
4          0          0       0      0            0      0       0  ...   

   History  Horror  Music  Mystery  Romance  Science Fiction  TV Movie  \
0        0       0      0        0  

In [10]:
df

Unnamed: 0,id,title,release_date,genres,language,popularity,vote_count,vote_average,overview,genre_names,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1197306,A Working Man,2025-03-26,"[28, 80, 53]",en,633.0389,480,6.385,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]",...,0,0,0,0,0,0,0,1,0,0
1,668489,Havoc,2025-04-24,"[28, 80, 53]",en,580.3429,281,6.600,When a drug heist swerves lethally out of cont...,"[Action, Crime, Thriller]",...,0,0,0,0,0,0,0,1,0,0
2,950387,A Minecraft Movie,2025-03-31,"[10751, 35, 12, 14]",en,423.6209,752,6.200,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]",...,0,0,0,0,0,0,0,0,0,0
3,1276073,Bullet Train Explosion,2025-04-23,"[28, 53, 80, 18]",ja,349.9755,94,6.691,When panic erupts on a Tokyo-bound bullet trai...,"[Action, Thriller, Crime, Drama]",...,0,0,0,0,0,0,0,1,0,0
4,1092899,The Siege,2023-03-10,"[28, 53]",en,291.2397,43,5.384,International assassin Walker is compromised d...,"[Action, Thriller]",...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,348893,Boyka: Undisputed IV,2016-08-01,"[28, 18, 53]",en,12.7799,1360,7.149,In the fourth installment of the fighting fran...,"[Action, Drama, Thriller]",...,0,0,0,0,0,0,0,1,0,0
981,500,Reservoir Dogs,1992-09-02,"[80, 53]",en,13.8161,14592,8.121,A botched robbery indicates a police informant...,"[Crime, Thriller]",...,0,0,0,0,0,0,0,1,0,0
982,507241,The Killer's Game,2024-09-12,"[28, 35, 53]",en,13.0216,403,6.500,When top hitman Joe Flood is diagnosed with a ...,"[Action, Comedy, Thriller]",...,0,0,0,0,0,0,0,1,0,0
983,7446,Tropic Thunder,2008-08-09,"[28, 35, 12, 10752]",en,14.7222,6383,6.686,A group of self-absorbed actors set out to mak...,"[Action, Comedy, Adventure, War]",...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Normaliser popularity entre 0 et 10
min_pop = df['popularity'].min()
max_pop = df['popularity'].max()

df['popularity_score'] = (df['popularity'] - min_pop) / (max_pop - min_pop) * 10


In [12]:
df

Unnamed: 0,id,title,release_date,genres,language,popularity,vote_count,vote_average,overview,genre_names,...,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,popularity_score
0,1197306,A Working Man,2025-03-26,"[28, 80, 53]",en,633.0389,480,6.385,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]",...,0,0,0,0,0,0,1,0,0,10.000000
1,668489,Havoc,2025-04-24,"[28, 80, 53]",en,580.3429,281,6.600,When a drug heist swerves lethally out of cont...,"[Action, Crime, Thriller]",...,0,0,0,0,0,0,1,0,0,9.156751
2,950387,A Minecraft Movie,2025-03-31,"[10751, 35, 12, 14]",en,423.6209,752,6.200,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]",...,0,0,0,0,0,0,0,0,0,6.648863
3,1276073,Bullet Train Explosion,2025-04-23,"[28, 53, 80, 18]",ja,349.9755,94,6.691,When panic erupts on a Tokyo-bound bullet trai...,"[Action, Thriller, Crime, Drama]",...,0,0,0,0,0,0,1,0,0,5.470378
4,1092899,The Siege,2023-03-10,"[28, 53]",en,291.2397,43,5.384,International assassin Walker is compromised d...,"[Action, Thriller]",...,0,0,0,0,0,0,1,0,0,4.530479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,348893,Boyka: Undisputed IV,2016-08-01,"[28, 18, 53]",en,12.7799,1360,7.149,In the fourth installment of the fighting fran...,"[Action, Drama, Thriller]",...,0,0,0,0,0,0,1,0,0,0.074525
981,500,Reservoir Dogs,1992-09-02,"[80, 53]",en,13.8161,14592,8.121,A botched robbery indicates a police informant...,"[Crime, Thriller]",...,0,0,0,0,0,0,1,0,0,0.091107
982,507241,The Killer's Game,2024-09-12,"[28, 35, 53]",en,13.0216,403,6.500,When top hitman Joe Flood is diagnosed with a ...,"[Action, Comedy, Thriller]",...,0,0,0,0,0,0,1,0,0,0.078393
983,7446,Tropic Thunder,2008-08-09,"[28, 35, 12, 10752]",en,14.7222,6383,6.686,A group of self-absorbed actors set out to mak...,"[Action, Comedy, Adventure, War]",...,0,0,0,0,0,0,0,1,0,0.105606


In [24]:
df.to_csv('tmdb_movies_dataset_.csv', index=False)
print("✅ Fichier 'tmdb_movies_dataset_.csv' sauvegardé.")


✅ Fichier 'tmdb_movies_dataset_.csv' sauvegardé.


In [28]:
import csv
import pandas as pd

# Lecture manuelle ligne par ligne, skip si problème
rows = []
with open('tmdb_movies_dataset_part2.csv', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        try:
            rows.append(row)
        except:
            continue

# Convertir en DataFrame
df2 = pd.DataFrame(rows)
df2


Unnamed: 0,id,title,release_date,genres,language,popularity,vote_count,vote_average,overview
0,276907,Legend,2015-09-09,"[80, 53]",en,14.3952,3966,7.054,"Suave, charming and volatile, Reggie Kray and ..."
1,1893,Star Wars: Episode I - The Phantom Menace,1999-05-19,"[12, 28, 878]",en,12.7219,14941,6.6,"Anakin Skywalker, a young slave strong with th..."
2,841,Dune,1984-12-14,"[28, 878, 12]",en,14.0893,3167,6.183,"In the year 10,191, the most precious substanc..."
3,14443,The Garbage Pail Kids Movie,1987-08-22,"[12, 878, 35, 10751]",en,11.6129,106,3.5,Seven disgusting kids but nevertheless of inte...
4,68735,Warcraft,2016-05-25,"[28, 12, 14]",en,14.0983,6896,6.384,The peaceful realm of Azeroth stands on the br...
...,...,...,...,...,...,...,...,...,...
1950,9415,Murder at 1600,1997-04-18,"[28, 18, 9648, 53, 80]",en,8.6223,542,6.1,A secretary is found dead in a White House bat...
1951,9884,Collateral Damage,2002-02-06,"[28, 53, 18]",en,7.9877,1396,5.76,Firefighter Gordon Brewer is plunged into the ...
1952,9359,Maverick,1994-05-20,"[28, 12, 35, 18, 37]",en,8.167,1636,6.881,Bret Maverick is a gambler who would rather co...
1953,8965,Atlantis: Milo's Return,2003-02-25,"[14, 16, 878, 10751, 28]",en,8.6966,3839,6.3,Milo and Kida reunite with their friends to in...


In [29]:
#df1 = pd.read_csv('tmdb_movies_dataset_.csv')
df_final = pd.concat([df, df2], ignore_index=True)
df_final = df_final.drop_duplicates(subset='id')
df_final.to_csv('tmdb_movies_full_dataset.csv', index=False)
print(f"✅ {len(df_final)} films uniques dans le dataset combiné")


✅ 2800 films uniques dans le dataset combiné


In [30]:
df_final

Unnamed: 0,id,title,release_date,genres,language,popularity,vote_count,vote_average,overview,genre_names,...,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,popularity_score
0,1197306,A Working Man,2025-03-26,"[28, 80, 53]",en,633.0389,480,6.385,Levon Cade left behind a decorated military ca...,"[Action, Crime, Thriller]",...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.000000
1,668489,Havoc,2025-04-24,"[28, 80, 53]",en,580.3429,281,6.6,When a drug heist swerves lethally out of cont...,"[Action, Crime, Thriller]",...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.156751
2,950387,A Minecraft Movie,2025-03-31,"[10751, 35, 12, 14]",en,423.6209,752,6.2,Four misfits find themselves struggling with o...,"[Family, Comedy, Adventure, Fantasy]",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.648863
3,1276073,Bullet Train Explosion,2025-04-23,"[28, 53, 80, 18]",ja,349.9755,94,6.691,When panic erupts on a Tokyo-bound bullet trai...,"[Action, Thriller, Crime, Drama]",...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.470378
4,1092899,The Siege,2023-03-10,"[28, 53]",en,291.2397,43,5.384,International assassin Walker is compromised d...,"[Action, Thriller]",...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.530479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935,9415,Murder at 1600,1997-04-18,"[28, 18, 9648, 53, 80]",en,8.6223,542,6.1,A secretary is found dead in a White House bat...,,...,,,,,,,,,,
2936,9884,Collateral Damage,2002-02-06,"[28, 53, 18]",en,7.9877,1396,5.76,Firefighter Gordon Brewer is plunged into the ...,,...,,,,,,,,,,
2937,9359,Maverick,1994-05-20,"[28, 12, 35, 18, 37]",en,8.167,1636,6.881,Bret Maverick is a gambler who would rather co...,,...,,,,,,,,,,
2938,8965,Atlantis: Milo's Return,2003-02-25,"[14, 16, 878, 10751, 28]",en,8.6966,3839,6.3,Milo and Kida reunite with their friends to in...,,...,,,,,,,,,,
