In [2]:
import pandas as pd
import csv


In [None]:
# Load the TMDB export into `movies`, skipping malformed rows.
#
# Strategy:
#   1. Parse the whole file in a single call with the Python parser engine.
#   2. If that raises, fall back to parsing fixed-size chunks and concatenating.
# If both attempts fail, the errors are printed and `movies` is left unassigned.
#
# A third fallback (reading the alternative export "movie_data_tmbd.csv") was
# sketched here previously but was disabled, so it is not attempted.


def read_movies_csv(path, chunk_size=None):
    """Read a movies CSV with the Python parser engine, skipping malformed rows.

    Only options that the Python engine accepts are passed: ``low_memory`` is a
    C-engine option, and the legacy ``error_bad_lines`` flag must not be
    combined with ``on_bad_lines`` (it no longer exists in pandas 2.x), so
    neither is used here.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    chunk_size : int or None, optional
        When ``None`` (default) the file is parsed in one call. Otherwise the
        file is parsed ``chunk_size`` rows at a time, a running row count is
        printed after each chunk, and the chunks are concatenated.

    Returns
    -------
    pandas.DataFrame
        The parsed rows, with rows the parser flags as bad left out.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` / ``pd.concat`` raise (missing file, decoding
        errors, no chunks to concatenate, ...); callers decide how to react.
    """
    read_kwargs = dict(
        on_bad_lines='skip',
        encoding='utf-8',
        quoting=csv.QUOTE_MINIMAL,
        engine='python',  # Python engine is more forgiving than the C engine
        sep=',',
    )

    if chunk_size is None:
        return pd.read_csv(path, **read_kwargs)

    chunk_list = []
    rows_so_far = 0
    for chunk in pd.read_csv(path, chunksize=chunk_size, **read_kwargs):
        chunk_list.append(chunk)
        # Count the rows actually parsed: the final chunk may be short and
        # skipped bad lines shrink a chunk, so chunk count * size would overstate.
        rows_so_far += len(chunk)
        print(f"Processed chunk, total rows so far: {rows_so_far}")

    return pd.concat(chunk_list, ignore_index=True)


# Method 1: read the whole file at once
try:
    print("Attempting to read CSV with robust parameters...")
    movies = read_movies_csv("TMDB_all_movies.csv")
    print(f"Successfully loaded {len(movies)} rows")
    print("Dataset shape:", movies.shape)
    print("\nFirst few rows:")
    print(movies.head())

except Exception as e:
    # Deliberately broad: any failure of the one-shot read triggers the fallback.
    print(f"Method 1 failed: {e}")

    # Method 2: read in chunks
    try:
        print("\nTrying to read in chunks...")
        movies = read_movies_csv("TMDB_all_movies.csv", chunk_size=10000)
        print(f"Successfully loaded {len(movies)} rows using chunking method")
        print("Dataset shape:", movies.shape)
        print("\nFirst few rows:")
        print(movies.head())

    except Exception as e2:
        # Best effort: report the failure and leave `movies` unassigned.
        print(f"Method 2 also failed: {e2}, stopping here.")

In [7]:
# Load the full movie table from the local CSV, then preview its first rows.
movies = pd.read_csv(filepath_or_buffer='all_movies.csv')
movies.head(n=5)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path
0,2,Ariel,7.1,346.0,Released,1988-10-21,0.0,73.0,0.0,tt0094675,...,suomi,"Merja Pulkkinen, Eetu Hilkamo, Turo Pajala, Es...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Aki Kaurismäki,,7.4,9221.0,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
1,3,Shadows in Paradise,7.293,409.0,Released,1986-10-17,0.0,74.0,0.0,tt0092149,...,"suomi, English, svenska","Esko Nikkari, Mari Rantasila, Marina Martinoff...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Mika Kaurismäki,,7.4,8048.0,/nj01hspawPof0mJmlgfjuLyJuRN.jpg
2,5,Four Rooms,5.862,2694.0,Released,1995-12-09,4257354.0,98.0,4000000.0,tt0113101,...,English,"Lili Taylor, Madonna, Tamlyn Tomita, Marc Lawr...","Quentin Tarantino, Robert Rodriguez, Allison A...","Phil Parmet, Guillermo Navarro, Andrzej Sekula...","Quentin Tarantino, Robert Rodriguez, Allison A...","Lawrence Bender, Quentin Tarantino, Alexandre ...",Combustible Edison,6.7,114364.0,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg
3,6,Judgment Night,6.5,351.0,Released,1993-10-15,12136938.0,109.0,21000000.0,tt0107286,...,English,"Deirdre Kelly, Peter Greene, Will Zahrn, Jerem...",Stephen Hopkins,Peter Levy,"Jere Cunningham, Lewis Colick","Marilyn Vance, Lloyd Segan, Gene Levy",Alan Silvestri,6.6,20035.0,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg
4,8,Life in Loops (A Megacities RMX),7.5,27.0,Released,2006-01-01,0.0,80.0,42000.0,tt0825671,...,"English, हिन्दी, 日本語, Pусский, Español",,Timo Novotny,Wolfgang Thaler,"Michael Glawogger, Timo Novotny","Ulrich Gehmacher, Timo Novotny",,8.1,285.0,/7ln81BRnPR2wqxuITZxEciCe1lc.jpg


In [4]:
# Release-year window (both ends inclusive) that defines "recent" movies.
start_year = 2016
end_year = 2020

# Parse release_date on `movies` itself; values that cannot be parsed become NaT.
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

# Derive the release year as its own column (missing wherever release_date is NaT).
movies['year'] = movies['release_date'].dt.year

# Boolean mask for the window; rows with a missing year compare False and drop out.
in_window = movies['year'].between(start_year, end_year)

# Select the window, then discard any row that still lacks a release_date.
recent_movies = movies[in_window].dropna(subset=['release_date'])

# Summary of how much of the dataset falls inside the window.
total_count = len(movies)
recent_count = len(recent_movies)
print(f"Total movies in dataset: {total_count}")
print(f"Movies from {start_year}-{end_year} (inclusive): {recent_count}")
print(f"Percentage of movies from {start_year}-{end_year}: {recent_count/total_count*100:.2f}%")

recent_movies.head(n=5)


Total movies in dataset: 1108291
Movies from 2016-2020 (inclusive): 170695
Percentage of movies from 2016-2020: 15.40%


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path,year
1530,2203,Für ein paar Filme mehr...,0.0,0.0,Released,2017-01-01,0.0,30.0,0.0,,...,,,,,,,,,,2017.0
3637,6636,The Mugger,5.4,5.0,Released,2017-04-01,0.0,67.0,0.0,tt1006823,...,"Guillermo Arengo, Maya Lesca, Arturo Goetz, Bá...",Pablo Fendrik,Cobi Migliora,Pablo Fendrik,Juan Pablo Gugliotta,,6.7,326.0,/nI7fpYioLcumBOIH0PMPDKxeqZc.jpg,2017.0
5266,10148,Krystal,5.2,56.0,Released,2018-04-13,0.0,90.0,0.0,tt0835802,...,"T.I., Kathy Bates, Rick Fox, Jacob Latimore, W...",William H. Macy,Adam Silver,Will Aldis,"Rachel Rothman, Robert Halmi Jr., Dan Keston, ...",Dan Romer,5.5,1806.0,/5Do7HKvKPgjiBVJieMAOt8aZXAB.jpg,2018.0
8633,14564,Rings,4.995,2520.0,Released,2017-02-01,83080890.0,102.0,25000000.0,tt0498381,...,"Aimee Teegarden, Chuck David Willis, Adam Fris...",F. Javier Gutiérrez,Sharone Meir,"Kōji Suzuki, David Loucka, Jacob Aaron Estes, ...","Walter F. Parkes, Neal Edelstein, Roy Lee, J.C...",Matthew Margeson,4.5,44666.0,/yp4CDOVpVmNwiPoZKQeFCpW8CFo.jpg,2017.0
11691,19295,Power Raiders,0.0,0.0,Released,2016-10-24,0.0,0.0,0.0,,...,,,,,,,,,,2016.0


In [5]:
# Write the filtered frame to disk; index=False leaves the row index out of the file.
recent_movies.to_csv(path_or_buf='recent_movies.csv', index=False)


In [13]:
# Save the current in-memory `movies` frame to all_movies.csv (row index omitted).
# NOTE(review): another cell in this notebook loads `movies` from this same file
# name, so running this cell replaces that file with whatever `movies` holds
# now — confirm the overwrite is intended.
movies.to_csv(path_or_buf='all_movies.csv', index=False)


In [6]:
# Read the exported subset back in and preview it to check what was written.
test = pd.read_csv(filepath_or_buffer='recent_movies.csv')
test.head(n=5)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path,year
0,2203,Für ein paar Filme mehr...,0.0,0.0,Released,2017-01-01,0.0,30.0,0.0,,...,,,,,,,,,,2017.0
1,6636,The Mugger,5.4,5.0,Released,2017-04-01,0.0,67.0,0.0,tt1006823,...,"Guillermo Arengo, Maya Lesca, Arturo Goetz, Bá...",Pablo Fendrik,Cobi Migliora,Pablo Fendrik,Juan Pablo Gugliotta,,6.7,326.0,/nI7fpYioLcumBOIH0PMPDKxeqZc.jpg,2017.0
2,10148,Krystal,5.2,56.0,Released,2018-04-13,0.0,90.0,0.0,tt0835802,...,"T.I., Kathy Bates, Rick Fox, Jacob Latimore, W...",William H. Macy,Adam Silver,Will Aldis,"Rachel Rothman, Robert Halmi Jr., Dan Keston, ...",Dan Romer,5.5,1806.0,/5Do7HKvKPgjiBVJieMAOt8aZXAB.jpg,2018.0
3,14564,Rings,4.995,2520.0,Released,2017-02-01,83080890.0,102.0,25000000.0,tt0498381,...,"Aimee Teegarden, Chuck David Willis, Adam Fris...",F. Javier Gutiérrez,Sharone Meir,"Kōji Suzuki, David Loucka, Jacob Aaron Estes, ...","Walter F. Parkes, Neal Edelstein, Roy Lee, J.C...",Matthew Margeson,4.5,44666.0,/yp4CDOVpVmNwiPoZKQeFCpW8CFo.jpg,2017.0
4,19295,Power Raiders,0.0,0.0,Released,2016-10-24,0.0,0.0,0.0,,...,,,,,,,,,,2016.0
