In [None]:
import csv
import time
import requests
import json
import os
import sys
from tqdm import tqdm
from datetime import datetime, timedelta

# --- File locations ---------------------------------------------------------
input_file = 'ml-latest-small/links.csv'  # CSV whose 'imdbId' column supplies the ids to fetch
output_file = 'movie_info2.csv'           # CSV that receives one row per fetched movie
log_file = 'request_log.txt'              # append-only log of OMDb requests
state_file = 'process_state.json'         # JSON checkpoint used to resume a run

# --- OMDb / TMDb access -----------------------------------------------------
# NOTE(review): API keys are embedded in source; consider loading them from
# the environment instead.
# NOTE(review): the last two entries are identical, so that key is tried twice
# per rotation. Left unchanged because a saved 'current_api_key_index' may
# refer to any position in this list.
API_KEYS = [
    "f8e988f2",
    "555f1493",
    "b0d221b4",
    "c86493c7",
    "7b84cc60",
    "4e952303",
    "be7cc8cc",
    "b3e8365b",
    "37bb452e",
    "a30f4ad5",
    "a30f4ad5"
]
current_key_index = 0  # position in API_KEYS of the key currently in use
BASE_URL = "http://www.omdbapi.com/?"
TMDB_API_KEY = "tu_tmdb_api_key_aquí"
TMDB_BASE_URL = "https://api.themoviedb.org/3/movie/"

# --- Failure accounting -----------------------------------------------------
# Per-key failure counter; get_movie_info_omdb resets a key's entry to 0 on
# success, so it effectively tracks consecutive failures.
api_failures = {key: 0 for key in API_KEYS}
# Running total of failed OMDb requests. check_api_health() reads it and
# get_movie_info_omdb() increments it via ``global``, so it must exist before
# the first request is made; without this line both raise NameError.
total_api_failures = 0
max_consecutive_failures = 3
max_total_failures = 10

def save_state(current_index, total_ids, processed_ids=None):
    """Write a resume checkpoint to ``state_file`` as indented JSON.

    Args:
        current_index: Index stored under ``last_processed_index``.
        total_ids: Total number of ids in the run.
        processed_ids: Ids handled so far. When empty or ``None`` an empty
            list is stored and ``processed_count`` falls back to
            ``current_index``.
    """
    if processed_ids:
        stored_ids = processed_ids
        done_count = len(processed_ids)
    else:
        stored_ids = []
        done_count = current_index

    # Key order is kept stable so the checkpoint file layout does not change.
    snapshot = {}
    snapshot["last_processed_index"] = current_index
    snapshot["total_ids"] = total_ids
    snapshot["processed_count"] = done_count
    snapshot["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    snapshot["current_api_key_index"] = current_key_index
    snapshot["processed_ids"] = stored_ids

    with open(state_file, 'w', encoding='utf-8') as checkpoint:
        json.dump(snapshot, checkpoint, ensure_ascii=False, indent=2)
    

def load_state():
    """Load the checkpoint written by ``save_state``.

    Side effect: when the checkpoint carries an integer
    ``current_api_key_index``, the module-level ``current_key_index`` is
    restored from it (wrapped into the valid range of ``API_KEYS``).

    Returns:
        The checkpoint dict, or ``None`` when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON object.
    """
    global current_key_index

    if not os.path.exists(state_file):
        return None

    try:
        with open(state_file, 'r', encoding='utf-8') as f:
            state = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error al cargar el estado desde {state_file}: {e}")
        return None

    if not isinstance(state, dict):
        # Callers use state.get(...); anything but an object is unusable.
        print(f"Error al cargar el estado desde {state_file}: formato inesperado")
        return None

    saved_index = state.get('current_api_key_index')
    if isinstance(saved_index, int) and API_KEYS:
        # The key list may have changed since the checkpoint was written;
        # wrapping avoids an IndexError on API_KEYS[current_key_index].
        current_key_index = saved_index % len(API_KEYS)

    return state

def log_request(imdb_id, api_key, successful):
    """Append one ``timestamp,imdb_id,api_key,result`` line to ``log_file``.

    ``result`` is written as "éxito" when ``successful`` is truthy and
    "fallo" otherwise.
    """
    with open(log_file, 'a', encoding='utf-8') as log_handle:
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if successful:
            outcome = "éxito"
        else:
            outcome = "fallo"
        log_handle.write(f"{stamp},{imdb_id},{api_key},{outcome}\n")

def get_next_api_key():
    """Advance ``current_key_index`` by one (wrapping) and return that key."""
    global current_key_index
    advanced = (current_key_index + 1) % len(API_KEYS)
    current_key_index = advanced
    return API_KEYS[advanced]

def check_api_health():
    """Report whether OMDb requests should continue.

    Returns:
        ``False`` when every key in ``api_failures`` has reached
        ``max_consecutive_failures`` or when the module-level
        ``total_api_failures`` has reached ``max_total_failures``;
        ``True`` otherwise.
    """
    every_key_failing = True
    for failure_count in api_failures.values():
        if failure_count < max_consecutive_failures:
            every_key_failing = False
            break

    over_total_budget = total_api_failures >= max_total_failures

    if every_key_failing or over_total_budget:
        return False
    return True

def get_movie_info_omdb(imdb_id, api_key, timeout=10):
    """Fetch one movie record from OMDb with the given API key.

    Args:
        imdb_id: IMDb identifier as a string, with or without the ``tt``
            prefix (the prefix is added when missing).
        api_key: OMDb key sent with the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON dict when OMDb answers with ``Response == 'True'``,
        otherwise ``None``.

    Side effects:
        Resets ``api_failures[api_key]`` on success; on any failure increments
        it together with the module-level ``total_api_failures``. Every
        request is written to the log via ``log_request`` except a response
        whose error text mentions "exceeded" or "limit".
    """
    global total_api_failures

    clean_id = imdb_id if imdb_id.startswith('tt') else f"tt{imdb_id}"
    url = f"{BASE_URL}apikey={api_key}&i={clean_id}"

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=timeout)
        data = response.json()

        if data.get('Response') == 'True':
            api_failures[api_key] = 0
            log_request(imdb_id, api_key, True)
            return data

        error = data.get('Error', 'Desconocido')
        # .get() keeps the bookkeeping working for a key that is not in the
        # api_failures table.
        api_failures[api_key] = api_failures.get(api_key, 0) + 1
        total_api_failures += 1

        lowered = error.lower()
        if "exceeded" in lowered or "limit" in lowered:
            # Quota responses are not logged; the caller rotates keys.
            return None

        log_request(imdb_id, api_key, False)
        print(f"Error de OMDb para ID {imdb_id}: {error}")
        return None
    except Exception as e:
        # Best-effort boundary: network errors, non-JSON bodies and unexpected
        # payload shapes all count as one failed request.
        api_failures[api_key] = api_failures.get(api_key, 0) + 1
        total_api_failures += 1
        log_request(imdb_id, api_key, False)
        print(f"Error en solicitud para ID {imdb_id}: {str(e)}")
        return None

def get_movie_info_tmdb(imdb_id, timeout=10):
    """Look a movie up on TMDb by IMDb id and map it to OMDb-style keys.

    Two requests are made: a "find by external id" call to resolve the TMDb
    movie id, then a details call with credits and keywords appended.

    Args:
        imdb_id: IMDb identifier as a string, with or without the ``tt``
            prefix (the prefix is added when missing).
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A dict with the keys ``imdbID``, ``Title``, ``Year``, ``Plot``,
        ``imdbRating``, ``Runtime``, ``Genre``, ``Director``, ``Actors``,
        ``Language``, ``Country`` and ``Source`` (always ``'TMDb'``), or
        ``None`` when the movie is not found or any request fails.
    """
    clean_id = imdb_id if imdb_id.startswith('tt') else f"tt{imdb_id}"

    # TMDB_BASE_URL points at ".../3/movie/", but the find endpoint lives at
    # ".../3/find/{external_id}", so strip the trailing "movie/" segment.
    api_root = TMDB_BASE_URL.rsplit('movie/', 1)[0]
    url = f"{api_root}find/{clean_id}?api_key={TMDB_API_KEY}&external_source=imdb_id"

    try:
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            print(f"Error de TMDb (HTTP {response.status_code}) al buscar ID {clean_id}")
            return None
        find_data = response.json()

        movie_results = find_data.get('movie_results', [])
        if not movie_results:
            return None
        movie_id = movie_results[0]['id']

        details_url = f"{TMDB_BASE_URL}{movie_id}?api_key={TMDB_API_KEY}&append_to_response=credits,keywords"
        details_response = requests.get(details_url, timeout=timeout)
        if not details_response.ok:
            # An error payload would otherwise be turned into an empty row.
            print(f"Error de TMDb (HTTP {details_response.status_code}) para ID {clean_id}")
            return None
        movie_data = details_response.json()

        credits = movie_data.get('credits', {})
        release_date = movie_data.get('release_date')
        runtime = movie_data.get('runtime')

        return {
            'imdbID': clean_id,
            'Title': movie_data.get('title', ''),
            'Year': release_date[:4] if release_date else '',
            'Plot': movie_data.get('overview', ''),
            'imdbRating': str(movie_data.get('vote_average', '')),
            # Avoid emitting "None min" / " min" when runtime is missing.
            'Runtime': f"{runtime} min" if runtime else '',
            'Genre': ', '.join(g['name'] for g in movie_data.get('genres', [])),
            'Director': ', '.join(
                c['name'] for c in credits.get('crew', []) if c.get('job') == 'Director'
            ),
            'Actors': ', '.join(c['name'] for c in credits.get('cast', [])[:5]),
            'Language': movie_data.get('original_language', '').upper(),
            'Country': ', '.join(c['name'] for c in movie_data.get('production_countries', [])),
            'Source': 'TMDb',
        }
    except Exception as e:
        # Best-effort fallback: never let a TMDb problem abort the run.
        print(f"Error en solicitud TMDb para ID {imdb_id}: {e}")
        return None

def get_movie_info(imdb_id):
    """Fetch movie data for ``imdb_id``, rotating OMDb keys then trying TMDb.

    Args:
        imdb_id: IMDb identifier as a string.

    Returns:
        The sentinel string ``"API_FAILURE"`` when ``check_api_health()``
        reports the API as unusable; otherwise the movie dict, or ``None``
        when neither OMDb (with any key) nor TMDb produced a result.
    """
    if not check_api_health():
        return "API_FAILURE"

    api_key = API_KEYS[current_key_index]
    data = get_movie_info_omdb(imdb_id, api_key)

    # One attempt has already been made with the current key, so only the
    # remaining len(API_KEYS) - 1 positions need to be tried. Looping
    # len(API_KEYS) times would wrap around and retry the starting key.
    attempts = 0
    while data is None and attempts < len(API_KEYS) - 1:
        api_key = get_next_api_key()
        data = get_movie_info_omdb(imdb_id, api_key)
        attempts += 1

    if data is None and TMDB_API_KEY:
        data = get_movie_info_tmdb(imdb_id)

    return data

def init_csv_file(fieldnames):
    """Create (or truncate) ``output_file`` and write only the header row."""
    with open(output_file, 'w', newline='', encoding='utf-8') as out_handle:
        csv.DictWriter(out_handle, fieldnames=fieldnames).writeheader()

def append_to_csv(movie_data, fieldnames):
    """Append one movie as a row of ``output_file``.

    Keys of ``movie_data`` that are not listed in ``fieldnames`` are dropped
    before writing; columns missing from ``movie_data`` are left empty by the
    writer.
    """
    row = {}
    for column, value in movie_data.items():
        if column in fieldnames:
            row[column] = value

    with open(output_file, 'a', newline='', encoding='utf-8') as out_handle:
        csv.DictWriter(out_handle, fieldnames=fieldnames).writerow(row)

def get_all_fields(movies_data):
    """Return the union of keys across the dict entries of ``movies_data``.

    Non-dict entries are ignored. The result is sorted alphabetically, except
    that ``'imdbID'`` (when present) is always placed first.
    """
    seen_keys = {
        key
        for entry in movies_data
        if isinstance(entry, dict)
        for key in entry
    }

    ordered = sorted(seen_keys - {'imdbID'})
    if 'imdbID' in seen_keys:
        ordered.insert(0, 'imdbID')
    return ordered

def load_processed_ids():
    """Collect the ``imdbID`` column of ``output_file`` into a list.

    Returns an empty list when the file does not exist. If reading fails part
    way through, "Error" is printed and the ids gathered so far are returned.
    """
    collected = []
    if not os.path.exists(output_file):
        return collected

    try:
        with open(output_file, 'r', encoding='utf-8') as src:
            for record in csv.DictReader(src):
                if 'imdbID' not in record:
                    continue
                collected.append(record['imdbID'])
    except Exception:
        print("Error")
    return collected

def _read_existing_header():
    """Return the header row of ``output_file``, or ``[]`` if unavailable."""
    if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
        return []
    try:
        with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
            return next(csv.reader(csvfile), [])
    except (OSError, csv.Error, UnicodeDecodeError):
        return []


def process_in_batches(imdb_ids, start_index=0, processed_ids=None):
    """Fetch every pending movie and append it to ``output_file`` in batches.

    Args:
        imdb_ids: Full list of ids for the run.
        start_index: Position in ``imdb_ids`` at which to start.
        processed_ids: Ids already handled; loaded from the output CSV via
            ``load_processed_ids()`` when ``None``. The list is extended in
            place as movies are written.

    Behaviour:
        * Ids from ``imdb_ids[start_index:]`` already in ``processed_ids`` are
          skipped.
        * The CSV columns come from the existing file header when there is
          one, so resumed runs stay aligned with rows already on disk.
          Otherwise up to 10 movies are fetched to discover the columns and
          the file is initialised; those results are reused rather than
          fetched a second time.
        * A checkpoint is saved after every batch. If ``get_movie_info``
          returns ``"API_FAILURE"`` a checkpoint is saved and the function
          returns early.
    """
    batch_size = 500
    pause_minutes = 0
    sample_limit = 10

    if processed_ids is None:
        processed_ids = load_processed_ids()

    total_ids = len(imdb_ids)

    # Set lookup keeps the filter linear instead of O(n^2) on a list.
    already_done = set(processed_ids)
    remaining_ids = [
        imdb_id for imdb_id in imdb_ids[start_index:] if imdb_id not in already_done
    ]
    total_batches = (len(remaining_ids) + batch_size - 1) // batch_size

    prefetched = {}  # imdb_id -> movie dict obtained while sampling columns
    all_fields = _read_existing_header()
    if not all_fields:
        for imdb_id in remaining_ids[:sample_limit]:
            movie_data = get_movie_info(imdb_id)
            if movie_data and isinstance(movie_data, dict):
                prefetched[imdb_id] = movie_data

        all_fields = get_all_fields(list(prefetched.values()))

        if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
            init_csv_file(all_fields)

    current_batch = 0
    for i in range(0, len(remaining_ids), batch_size):
        current_batch += 1
        batch = remaining_ids[i:i + batch_size]
        current_global_index = start_index + i

        for idx, imdb_id in enumerate(tqdm(batch, desc=f"Lote {current_batch}")):
            # Reuse a sampled result when available to save an API request.
            movie_data = prefetched.pop(imdb_id, None)
            if movie_data is None:
                movie_data = get_movie_info(imdb_id)

            if movie_data == "API_FAILURE":
                save_state(current_global_index + idx, total_ids, processed_ids)
                return

            if movie_data and isinstance(movie_data, dict):
                append_to_csv(movie_data, all_fields)
                processed_ids.append(imdb_id)

        save_state(current_global_index + len(batch), total_ids, processed_ids)

        if current_batch < total_batches:
            time.sleep(pause_minutes * 60)
    

# Read the ids to process. Each row contributes its 'imdbId' value, falling
# back to 'tmdbId' when the file has no 'imdbId' column.
imdb_ids = []
try:
    with open(input_file, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if 'imdbId' in row:
                imdb_ids.append(row['imdbId'])
            elif 'tmdbId' in row:
                imdb_ids.append(row['tmdbId'])
except Exception as e:
    # Say why the run is aborting instead of exiting silently.
    print(f"Error al leer {input_file}: {e}", file=sys.stderr)
    sys.exit(1)

# Resume from the last checkpoint when one exists.
previous_state = load_state()
start_index = 0
processed_ids = []

if previous_state:
    start_index = previous_state.get('last_processed_index', 0)
    if 'processed_ids' in previous_state:
        processed_ids = previous_state.get('processed_ids', [])

process_in_batches(imdb_ids, start_index, processed_ids)



In [None]:
import pandas as pd
import requests
import time
import os
import json
import logging

class MovieInfoScraper:
    """Download TMDb details for the ``tmdbId`` column of a links CSV.

    Progress is checkpointed to ``movie_info_progress.json`` after every
    stored movie so an interrupted run can be resumed, and events are logged
    to ``movie_scraper.log``.
    """

    def __init__(self, input_file, output_file, tmdb_api_key):
        """Store paths and the API key, and configure file logging.

        Args:
            input_file: CSV read with pandas; must contain a ``tmdbId`` column.
            output_file: CSV that accumulates the fetched movie rows.
            tmdb_api_key: TMDb API key sent with every request.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.tmdb_api_key = tmdb_api_key
        self.progress_file = 'movie_info_progress.json'

        logging.basicConfig(
            filename='movie_scraper.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def load_progress(self):
        """Return the saved progress dict, or a fresh one if none exists.

        The fresh value is ``{'processed_ids': [], 'last_processed_index': -1}``.
        """
        if os.path.exists(self.progress_file):
            with open(self.progress_file, 'r') as f:
                return json.load(f)
        return {'processed_ids': [], 'last_processed_index': -1}

    def save_progress(self, processed_ids, last_processed_index):
        """Persist the processed ids and last row index to the progress file."""
        progress = {
            'processed_ids': processed_ids,
            'last_processed_index': last_processed_index
        }
        with open(self.progress_file, 'w') as f:
            json.dump(progress, f)

    def fetch_movie_details(self, tmdb_id, timeout=10):
        """Fetch one movie from TMDb and flatten it into a row dict.

        Args:
            tmdb_id: TMDb movie id.
            timeout: Seconds to wait for the HTTP request.

        Returns:
            A dict with the keys ``tmdbId``, ``title``, ``rating``,
            ``release_date``, ``original_language``, ``origin_country``,
            ``votes``, ``budget``, ``revenue`` and ``runtime`` (missing values
            become ``'N/A'``), or ``None`` when the request fails or TMDb
            answers with a non-success HTTP status.
        """
        try:
            movie_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}?api_key={self.tmdb_api_key}"
            movie_response = requests.get(movie_url, timeout=timeout)

            if not movie_response.ok:
                # An error payload (bad key, unknown id, rate limit) has none
                # of the expected fields and would otherwise be stored as a
                # row of 'N/A'. The status is checked by hand rather than via
                # raise_for_status() so the URL, which carries the API key,
                # stays out of the log.
                self.logger.error(
                    f"TMDb returned HTTP {movie_response.status_code} for movie {tmdb_id}"
                )
                return None

            movie_data = movie_response.json()

            countries = movie_data.get('production_countries')
            movie_info = {
                'tmdbId': tmdb_id,
                'title': movie_data.get('title', 'N/A'),
                'rating': movie_data.get('vote_average', 'N/A'),
                'release_date': movie_data.get('release_date', 'N/A'),
                'original_language': movie_data.get('original_language', 'N/A'),
                # NOTE(review): this stores the first list element as-is; it
                # looks like TMDb returns objects here, so the CSV cell may
                # hold a dict repr. Confirm whether a name/code is intended.
                'origin_country': countries[0] if countries else 'N/A',
                'votes': movie_data.get('vote_count', 'N/A'),
                'budget': movie_data.get('budget', 'N/A'),
                'revenue': movie_data.get('revenue', 'N/A'),
                'runtime': movie_data.get('runtime', 'N/A')
            }

            return movie_info

        except Exception as e:
            self.logger.error(f"Error fetching details for movie {tmdb_id}: {e}")
            return None

    def scrape_movies(self, max_retries=3):
        """Fetch every not-yet-processed movie and write it to the output CSV.

        Resumes after the last saved row index. Each stored movie rewrites the
        output CSV and the progress file. Rows that fail are logged and
        skipped.

        Args:
            max_retries: Maximum fetch attempts per movie.
        """
        links_df = pd.read_csv(self.input_file)

        progress = self.load_progress()
        processed_ids = progress['processed_ids']
        start_index = progress['last_processed_index'] + 1
        # Set mirror of processed_ids for O(1) membership checks; the list is
        # kept because it is what gets serialised to the progress file.
        processed_lookup = set(processed_ids)

        if os.path.exists(self.output_file):
            output_df = pd.read_csv(self.output_file)
        else:
            output_df = pd.DataFrame(columns=[
                'tmdbId', 'title', 'rating', 'release_date',
                'original_language', 'origin_country', 'votes',
                'budget', 'revenue', 'runtime'
            ])

        for index in range(start_index, len(links_df)):
            try:
                raw_id = links_df.iloc[index]['tmdbId']
                if pd.isna(raw_id):
                    # int(NaN) would raise; a missing id is an expected gap.
                    self.logger.warning(f"Índice {index} sin tmdbId; se omite")
                    continue
                tmdb_id = int(raw_id)

                if tmdb_id in processed_lookup:
                    continue

                movie_info = None
                for attempt in range(max_retries):
                    movie_info = self.fetch_movie_details(tmdb_id)
                    if movie_info:
                        break

                if movie_info:
                    new_row = pd.DataFrame([movie_info])
                    output_df = pd.concat([output_df, new_row], ignore_index=True)
                    output_df.to_csv(self.output_file, index=False)

                    processed_ids.append(tmdb_id)
                    processed_lookup.add(tmdb_id)
                    self.save_progress(processed_ids, index)

            except Exception as e:
                self.logger.error(f"Error general procesando índice {index}: {e}")

        self.logger.info("Scraping de películas completado")

# Build the scraper for the links CSV and run it to completion.
# Positional arguments are, in order: input_file, output_file, tmdb_api_key.
# NOTE(review): the TMDb API key is embedded in source; consider reading it
# from the environment before sharing this notebook.
scraper = MovieInfoScraper(
    'ml-latest-small/links.csv',
    'movie_info.csv',
    'c768da44d48e2d1706a2451e62972bad',
)
scraper.scrape_movies()

In [None]:
import pandas as pd

# Remove repeated movies from the fetched data, keeping the first row seen for
# each imdbID, and write the result to a separate file.
df = pd.read_csv("movie_info2.csv")
df_cleaned = df.drop_duplicates(
    subset=["imdbID"],
    keep="first",
)
df_cleaned.to_csv(
    "movie_info_cleaned.csv",
    index=False,
)



Archivo limpio guardado como movie_info_cleaned.csv
