In [1]:
import os
import re
import requests
import pandas as pd

In [2]:
def extract_movie_titles(root_path):
    """Collect the base names of all video files found under ``root_path``.

    The directory tree is walked recursively with ``os.walk``. A file counts
    as a video when its name ends with one of the supported extensions; the
    comparison ignores case, so ``MOVIE.MKV`` is picked up as well as
    ``movie.mkv``.

    Args:
        root_path: Directory to scan.

    Returns:
        list[str]: File names with the extension stripped, in the order
        ``os.walk`` yields them. Empty when no file matches.
    """
    # Supported video file extensions, lower-case because file names are
    # lowered before matching. A tuple so it can be handed to str.endswith.
    video_extensions = ('.mp4', '.mkv', '.avi', '.mov', '.flv')

    movie_titles = []
    for _root, _dirs, files in os.walk(root_path):
        for file in files:
            if file.lower().endswith(video_extensions):
                # Keep the original casing of the name; drop only the extension.
                movie_titles.append(os.path.splitext(file)[0])

    return movie_titles

In [3]:
def clean_title(title):
    """Strip release/uploader noise from a video file name.

    Each pattern below is removed case-insensitively, in list order. Then
    underscores become spaces, runs of whitespace left behind by the removals
    are collapsed to a single space, and the result is trimmed.

    Args:
        title: File name without extension.

    Returns:
        str: The cleaned title. A year that is not inside brackets is kept.
    """
    # A series of regular expressions to clean up the title
    patterns = [
        r'\[.*?\]',  # Remove anything in square brackets
        r'\(.*?\)',  # Remove anything in round brackets
        r'1080p|720p|480p',  # Remove common resolutions
        # Release types. 'HD' must not follow a letter, otherwise the
        # case-insensitive match eats the 'hd' inside words like "Birthday".
        r'BluRay|DVD|HDRip|(?<![a-zA-Z])HD',
        r'film completo',  # Specific phrases
        r'Full Movie by Film&Clips',
        # 'by' and everything that follows. Only when 'by' starts a word, so
        # "Baby Driver" is not truncated to "Ba".
        r'(?<![a-zA-Z])by .*$',
        r'Versione Restaurata',
        r'Subtitle',
        # NOTE(review): the bare words below still match inside longer words
        # (e.g. 'Full' in "Fully"); left unchanged here.
        r'Film',
        r'Full',
        r'Pelicula Completa',
        r'IN ITALIANO',
        r'Movie',
        r'(?<![a-zA-Z])-(?![a-zA-Z])',  # Remove dashes not between words
        # r'\d{4}.*$',  # Remove any four digits (like 1985) and everything that follows
    ]

    for pattern in patterns:
        # re.IGNORECASE makes the regex pattern case insensitive
        title = re.sub(pattern, '', title, flags=re.IGNORECASE)

    # Replacing underscores with space
    title = title.replace('_', ' ')

    # Removing a token from the middle of a name leaves two adjacent spaces;
    # squeeze them so "A - B" becomes "A B" rather than "A  B".
    title = re.sub(r'\s+', ' ', title)

    return title.strip()  # Remove any leading/trailing whitespace



In [4]:
# Root folder of the local movie collection. Defaults to the original
# location; set the FILMS_DIR environment variable to scan a different folder
# without editing the notebook.
FILMS_DIR = os.environ.get('FILMS_DIR', 'D:/Films')
movie_list = extract_movie_titles(FILMS_DIR)

In [5]:
# Run every scanned file name through clean_title. The result is stored back
# under the same name, so re-running this cell cleans the already-cleaned titles.
movie_list = list(map(clean_title, movie_list))

In [8]:
# Inspect the cleaned titles: a bare name as the last expression of a cell is
# rendered as that cell's output.
movie_list

['Frankenweenie',
 'Il Castello Errante di Howl',
 'La sposa cadavere',
 'The nightmare before Christmas 1993',
 'The Polar Express 2004',
 'Il Grande Nord',
 'Jason and The Argonauts',
 '. Distretto 13',
 'Agente 007 Licenza di uccidere, 1962',
 'Clint Eastwood  Dove Osano Le Aquile 1969',
 'Dalla Russia Con Amore',
 'Diabolik',
 'Distretto 13',
 'I guerreri della notte',
 "I tre dell'operazione Drago",
 'Il Braccio Violento della Legge',
 'Il Furore Della Cina Colpisce Ancora',
 "Indiana Jones e i Predatori Dell'arca Perduta",
 'Indiana Jones e il Tempio Maledetto',
 'La ghigliottina volante',
 "Ocean's eleven",
 'Quel maledetto treno blindato',
 "Un colpo all'italiana",
 'Fist Of Fury 1972',
 'Il furore della cina colpisce ancora',
 "L'ultimo combattimento di Chen",
 'The Way of The Dragon 1972',
 'Banditi a Milano1968',
 'Cani arrabbiati',
 'Il Caso Venere Privata 1970',
 "Il cinico l'infame il violento",
 'Il Grande Racket',
 'La morte risale a ieri sera',
 'La Polizia Chiede Aiut

In [29]:
# TMDB credentials and endpoint.
# The key is taken from the TMDB_API_KEY environment variable when it is set.
# NOTE(review): the literal fallback below is an API key stored in plain text
# in this notebook. Rotate it and remove the fallback before sharing the file.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY', 'fc134c5992c565508575ff65e3ce04de')
# Every request path used in this notebook is appended to this prefix.
BASE_URL = 'https://api.themoviedb.org/3/'

In [30]:
# for movie in movie_list:
#     get_movie_details(movie)

In [41]:
def append_no_result_data(data_dict, title):
    """
    Add a placeholder row for a title that could not be matched.

    The given title goes into the 'title' column and an empty string into
    every other metadata column, including the manually filled 'Seen' and
    'Subtitles' columns. The 'file_title' column is not touched here.
    The lists in data_dict are modified in place; nothing is returned.
    """
    data_dict['title'].append(title)

    # Columns that stay blank for an unmatched title, in append order.
    blank_columns = (
        'release_year',
        'director',
        'genre',
        'duration_minutes',
        'country_of_production',
        'language',
        'main_actors',
        'rating',
        'synopsis',
        'Seen',       # Placeholder for manual input
        'Subtitles',  # Placeholder for manual input
    )
    for column in blank_columns:
        data_dict[column].append('')

In [2]:
def _tmdb_get(endpoint, timeout=10, **params):
    """GET ``BASE_URL + endpoint`` with the API key and return the decoded JSON body."""
    # Handing the query to ``params`` lets requests URL-encode it, so titles
    # containing '&', '#', '+' or non-ASCII characters reach the API intact.
    response = requests.get(
        f"{BASE_URL}{endpoint}",
        params={'api_key': TMDB_API_KEY, **params},
        timeout=timeout,  # do not hang the whole run on one stalled request
    )
    # Report HTTP failures (rejected key, rate limit, ...) as such instead of
    # as a KeyError on a missing field further down.
    response.raise_for_status()
    return response.json()


def get_movie_details(titles):
    """Look up each title on TMDB and collect the metadata column by column.

    For every title the first four-digit run is treated as the release year
    and removed from the search text, unless the whole title is exactly four
    digits. The first search hit is used. When a year was found, only hits
    whose release date starts with that year are considered. Titles without a
    usable hit get a blank row via ``append_no_result_data``.

    Args:
        titles: Iterable of cleaned title strings.

    Returns:
        dict[str, list]: One list per column ('file_title', 'title',
        'release_year', 'director', 'genre', 'duration_minutes',
        'country_of_production', 'language', 'main_actors', 'rating',
        'synopsis', 'Seen', 'Subtitles'), each with one entry per input title.

    Raises:
        requests.HTTPError: If TMDB answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Create an empty dictionary to hold the details of all movies
    movies_data = {
        'file_title': [],
        'title': [],
        'release_year': [],
        'director': [],
        'genre': [],
        'duration_minutes': [],
        'country_of_production': [],
        'language': [],
        'main_actors': [],
        'rating': [],
        'synopsis': [],
        'Seen': [],
        'Subtitles': []
    }

    for title in titles:
        # Format the file title
        movies_data['file_title'].append(title.lower().capitalize())

        # Attempt to extract the year from the title
        year_match = re.search(r'(\d{4})', title)

        if title.isdigit() and len(title) == 4:
            # A title that is nothing but four digits is the name itself,
            # not a release year.
            year = ''
            query_title = title
        else:
            year = year_match.group(1) if year_match else ''
            query_title = title.replace(year, '').strip()

        results = _tmdb_get('search/movie', query=query_title).get('results') or []
        if year:
            # Release dates are 'YYYY-MM-DD'. Hits with a missing or empty
            # date can never match a requested year.
            results = [
                result for result in results
                if (result.get('release_date') or '').startswith(year)
            ]
        movie_id = results[0].get('id') if results else None

        if not movie_id:
            # Nothing found, or nothing released in the requested year.
            append_no_result_data(movies_data, title)
            continue

        # General movie details, then the credits for director and cast.
        movie_data = _tmdb_get(f'movie/{movie_id}')
        credits_data = _tmdb_get(f'movie/{movie_id}/credits')

        # First crew member whose job is exactly 'Director'.
        director = next(
            (member['name'] for member in credits_data.get('crew', [])
             if member.get('job') == 'Director'),
            None,
        )

        # Adding details to the movies_data dictionary
        movies_data['title'].append(movie_data.get('original_title', 'N/A'))
        movies_data['release_year'].append(movie_data.get('release_date', 'N/A').split('-')[0])
        movies_data['director'].append(director if director else 'N/A')
        movies_data['genre'].append(', '.join([g['name'] for g in movie_data['genres']]) if movie_data.get('genres') else 'N/A')
        movies_data['duration_minutes'].append(movie_data.get('runtime', 'N/A'))
        movies_data['country_of_production'].append(movie_data['production_countries'][0]['name'] if movie_data.get('production_countries') else 'N/A')
        movies_data['language'].append(movie_data.get('original_language', 'N/A'))
        # Only the first three cast entries are kept.
        movies_data['main_actors'].append(', '.join([c['name'] for c in credits_data['cast'][:3]]) if credits_data.get('cast') else 'N/A')
        movies_data['rating'].append(movie_data.get('vote_average', 'N/A'))
        movies_data['synopsis'].append(movie_data.get('overview', 'N/A'))
        movies_data['Seen'].append('')  # Placeholder for manual input
        movies_data['Subtitles'].append('')  # Placeholder for manual input

    return movies_data


In [43]:
# Query TMDB for every cleaned title. get_movie_details sends one search
# request per title plus two follow-up requests (details, credits) per match.
movies = get_movie_details(movie_list)

In [44]:
def save_to_csv(movies_data, csv_path):
    """
    Save the movies data to a CSV file, merging with what is already there.

    If the CSV file exists its rows are kept and the new rows are appended.
    Rows that repeat an earlier ('title', 'release_year') pair are dropped,
    so rows already in the file win over new ones.

    Args:
        movies_data: Column dict (or anything ``pd.DataFrame`` accepts) that
            contains at least the 'title' and 'release_year' columns.
        csv_path: Path of the CSV file to create or update.
    """
    frames = []

    # If the CSV file exists, read it
    if os.path.exists(csv_path):
        # Read every cell back as text and keep blanks as ''. With type
        # inference the stored years come back as numbers/NaN, never equal the
        # string years of the new rows, and each run would duplicate the file.
        frames.append(pd.read_csv(csv_path, dtype=str, keep_default_na=False))

    # Convert the movies_data dict to a DataFrame
    frames.append(pd.DataFrame(movies_data))

    combined_df = pd.concat(frames, ignore_index=True)

    # Compare the key columns by their text form, so a numeric year in the new
    # data still matches the text read from the file. Keep first occurrences.
    dedup_key = combined_df[['title', 'release_year']].astype(str)
    combined_df = combined_df.loc[~dedup_key.duplicated()].reset_index(drop=True)

    # Save the combined data back to the CSV file
    combined_df.to_csv(csv_path, index=False)



In [45]:
# Usage:
# `movies` holds the column dict returned by get_movie_details; write it to "movies.csv"
save_to_csv(movies, "movies.csv")