In [1]:
import os # for file/directory manipulation
import re # for regex split
import pandas as pd # for dataframes

# Make a new directory to store the new files

In [2]:
OLD_FILES_DIRECTORY = './data_files/old_files/'
NEW_FILES_DIRECTORY = './data_files/new_files/'

if not os.path.exists(OLD_FILES_DIRECTORY): # after files are downloaded, show move them to this directory
    os.mkdir(OLD_FILES_DIRECTORY)
    
if not os.path.exists(NEW_FILES_DIRECTORY):
    os.mkdir(NEW_FILES_DIRECTORY)

# Read old movie file and write new one

In [3]:
# LATIN ENCODING SEEMS TO WORK OVER UTF-8; UTF-8/ASCII is not able to decode all the lines
with open(OLD_FILES_DIRECTORY + 'movies.list', 'rb') as movie_file: # read as binary file
    movie_data_original = [line.decode('latin-1').strip() for line in movie_file.readlines()]

In [4]:
# Sample of file 
for i in range(30):
    print(movie_data_original[i])

CRC: 0x8E8DBD74  File: movies.list  Date: Fri Dec 22 00:00:00 2017

Copyright 1991-2017 The Internet Movie Database Ltd. All rights reserved.

http://www.imdb.com

movies.list

20 Dec 2017

-----------------------------------------------------------------------------

MOVIES LIST

"!Next?" (1994)						1994-1995
"#1 Single" (2006)					2006-????
"#1 Single" (2006) {Cats and Dogs (#1.4)}		2006
"#1 Single" (2006) {Finishing a Chapter (#1.5)}		2006
"#1 Single" (2006) {Is the Grass Greener? (#1.1)}	2006
"#1 Single" (2006) {Stay (#1.8)}			2006
"#1 Single" (2006) {The Rules of Dating (#1.3)}		2006
"#1 Single" (2006) {Timing Is Everything (#1.7)}	2006
"#1 Single" (2006) {Window Shopping (#1.2)}		2006
"#1 Single" (2006) {Wingman (#1.6)}			2006
"#15SecondScare" (2015)					2015-????
"#15SecondScare" (2015) {Beauty Wrap}			2016
"#15SecondScare" (2015) {Because We Don't Want You to Fall Asleep (#1.3)}	????
"#15SecondScare" (2015) {Bubbles of Blood (#1.17)}	2016
"#15SecondScare" (2015) {Coming and Go

In [5]:
# ACTUAL DATA STARTS ON LINE 16, or index 15

In [6]:
def write_new_move_file(movie_data_original): # list of lines 
    movie_file = open(NEW_FILES_DIRECTORY + 'movies.csv', 'w')
    movie_file.write('movie,year\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    
    for i in range(15, len(movie_data_original)): # start from 15 cuz before is filler
        line = movie_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, year = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
            if '????' in year: # movie should have a defined year
                continue
            if ',' in name: # surround in quote if name already has comma
                movie_file.write(f'"{name}",{year}\n') 
            else:
                movie_file.write(f'{name},{year}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
    
    movie_file.close()


In [7]:
write_new_move_file(movie_data_original)

-------------------------------------------------------------------------------- not enough values to unpack (expected 2, got 1)


In [8]:
movie_data = pd.read_csv(NEW_FILES_DIRECTORY + 'movies.csv')

In [9]:
rows, _ = movie_data.shape
f'New Data has {rows} Titles'

'New Data has 1024456 Titles'

In [10]:
movie_data.sample(20)

Unnamed: 0,movie,year
404402,It's Okay (2014),2014
616594,One Small Step (2016),2016
228704,Domino (2005),2005
187076,D'éfficience (2014),2014
312029,Gaz Bar Blues (2003),2003
487664,Lene oder Lena (1918),1918
957922,Unspoken (2013/III),2013
231846,Dosukebe sanmai: Haha musume kui (1996),1996
402164,Islamophobia (2018),2018
632392,Papá se enreda otra vez (1942),1942


# Read old certificates file and write new one

In [11]:
with open(OLD_FILES_DIRECTORY + 'certificates.list', 'rb') as certificate_file: # read as binary file
    certificate_data_original = [line.decode('latin-1').strip() for line in certificate_file.readlines()]

In [12]:
# Sample of ratings file
for i in range(30):
    print(certificate_data_original[i])

CRC: 0x1697FD9F  File: certificates.list  Date: Fri Dec 22 00:00:00 2017

Copyright 1991-2017 The Internet Movie Database Ltd. All rights reserved.

http://www.imdb.com

certificates.list

2017-12-19

-----------------------------------------------------------------------------

CERTIFICATES LIST
"#1 Single" (2006)					USA:TV-PG
"#BlackLove" (2015) {Bringing Sexy Back (#1.3)}		USA:TV-14
"#BlackLove" (2015) {Crash the Party (#1.9)}		USA:TV-14
"#BlackLove" (2015) {Feeling Some Kinda Way (#1.7)}	USA:TV-14
"#BlackLove" (2015) {Like a Virgin (#1.4)}		USA:TV-14
"#BlackLove" (2015) {Making Lemonade Out of Lemons (#1.2)}	USA:TV-14
"#BlackLove" (2015) {Maybe Baby (#1.8)}			USA:TV-14
"#BlackLove" (2015) {Miss Independent (#1.5)}		USA:TV-14
"#BlackLove" (2015) {Pack Your Bags (#1.6)}		USA:TV-14
"#BlackLove" (2015) {Sealing the Deal (#1.10)}		USA:TV-14
"#BlackLove" (2015) {Sexy in the City (#1.1)}		USA:TV-14
"#Hashtag Travel" (2017)				USA:Unrated
"#LoveMonkeyChocolate" (2014)				UK:PG
"#LoveMonke

In [13]:
# ACTUAL DATA STARTS ON LINE 15 or index 14

In [14]:
def write_new_certificate_file(certificate_data_original): # list of lines 
    certificate_file = open(NEW_FILES_DIRECTORY + 'certificates.csv', 'w')
    certificate_file.write('movie,rating\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    vg_ratings = 'E, E10+, T, C, M, Not Rated'.split(', ')
    
    for i in range(14, len(certificate_data_original)): # start from 14 cuz before is filler
        line = certificate_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, rating = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
            if 'USA' not in rating: # skip non us ratings
                continue
            if 'USA:X' in rating or 'USA:NC-17' in rating or 'TV' in rating: # per rubric, skip these ratings
                continue
            usa, vgr = rating.split(':') 
            if any(vg_rating in vgr for vg_rating in vg_ratings):  # check if video game
                continue
            if ',' in name: # surround in quote if name already has comma
                certificate_file.write(f'"{name}",{rating}\n') 
            else:
                certificate_file.write(f'{name},{rating}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
    
    certificate_file.close()


In [15]:
write_new_certificate_file(certificate_data_original)

-------------------------------------------------------------------------------- not enough values to unpack (expected 2, got 1)


In [16]:
certificate_data = pd.read_csv(NEW_FILES_DIRECTORY + "certificates.csv")

In [17]:
rows, _ = certificate_data.shape
f'New data has {rows} Titles'

'New data has 55567 Titles'

In [18]:
certificate_data.sample(20)

Unnamed: 0,movie,rating
7714,Chain of Evidence (1957),USA:Approved
42537,The Detective (1968),USA:Approved
1603,Action Jackson (1988),USA:R
42362,The Dark Mirror (1946),USA:Approved
27865,My Flesh and Blood (2003),USA:Unrated
47045,The Perfect Location (2004),USA:Unrated
38642,Straight Out of Brooklyn (1991),USA:R
2208,All the World's a Stooge (1941),USA:Approved
45173,The Last Starfighter (1984),USA:PG
47003,The Pebble and the Penguin (1995),USA:G


# Read old genre file and write new one

In [19]:
with open(OLD_FILES_DIRECTORY + 'genres.list', 'rb') as genre_file: # read as binary file
    genre_data_original = [line.decode('latin-1').strip() for line in genre_file.readlines()]

In [20]:
# Sample 1 of genre file 
for i in range(42, 78): # counts from index 42 to 77
    print(genre_data_original[i])

Short    	 680840
Drama    	 411259
Comedy    	 297238
Documentary    	 263981
Adult    	 82287
Thriller    	 79908
Action    	 79269
Romance    	 79009
Music    	 69599
Animation    	 65035
Horror    	 62687
Family    	 62126
Crime    	 55886
Adventure    	 48554
Fantasy    	 45128
Sci-Fi    	 41402
Mystery    	 37910
Biography    	 31414
History    	 27850
Sport    	 26610
Musical    	 21293
War    	 18570
Reality-TV    	 17921
News    	 16250
Western    	 16111
Talk-Show    	 13732
Game-Show    	 6113
Film-Noir    	 906
Reality-tv    	 45
Sci-fi    	 2
Sex    	 1
Lifestyle    	 1
Hardcore    	 1
Experimental    	 1
Erotica    	 1
Commercial    	 1


In [21]:
genre_counts = dict()
for i in range(42, 78): # counts for each genre start from line 43 to 78 or index 42 to 77
    line = genre_data_original[i]
    movie, count = [x.strip() for x in re.split('\\t+', line)]
    genre_counts[movie] = int(count)
genre_counts

{'Short': 680840,
 'Drama': 411259,
 'Comedy': 297238,
 'Documentary': 263981,
 'Adult': 82287,
 'Thriller': 79908,
 'Action': 79269,
 'Romance': 79009,
 'Music': 69599,
 'Animation': 65035,
 'Horror': 62687,
 'Family': 62126,
 'Crime': 55886,
 'Adventure': 48554,
 'Fantasy': 45128,
 'Sci-Fi': 41402,
 'Mystery': 37910,
 'Biography': 31414,
 'History': 27850,
 'Sport': 26610,
 'Musical': 21293,
 'War': 18570,
 'Reality-TV': 17921,
 'News': 16250,
 'Western': 16111,
 'Talk-Show': 13732,
 'Game-Show': 6113,
 'Film-Noir': 906,
 'Reality-tv': 45,
 'Sci-fi': 2,
 'Sex': 1,
 'Lifestyle': 1,
 'Hardcore': 1,
 'Experimental': 1,
 'Erotica': 1,
 'Commercial': 1}

In [22]:
# Sample 2 of genre file 
for i in range(380, 400):
    print(genre_data_original[i])


8: THE GENRES LIST

"!Next?" (1994)						Documentary
"#1 Single" (2006)					Reality-TV
"#15SecondScare" (2015)					Horror
"#15SecondScare" (2015)					Short
"#15SecondScare" (2015)					Thriller
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Drama
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Horror
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Short
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Thriller
"#1MinuteNightmare" (2014)				Horror
"#2G1S" (2016)						Crime
"#2WheelzNHeelz" (2017)					Reality-TV
"#30Nods Trailer" (2016)				Drama
"#30Nods" (2016)					Drama
"#4Hire" (2017)						Comedy
"#7DaysLater" (2013)					Comedy


In [23]:
# ACTUAL DATA START ON LINE 385 or index 384

In [24]:
def write_new_genre_file(genre_data_original): # list of lines 
    genre_file = open(NEW_FILES_DIRECTORY + 'genres.csv', 'w')
    genre_file.write('movie,genre\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    skip_genres = 'Short, Adult, Reality-TV, Talk-Show, Game-Show, News'.split(', ')
    
    for i in range(384, len(genre_data_original)): # start from 384 cuz before is filler
        line = genre_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, genre = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
            if any(g in genre for g in skip_genres): # check if should skip this genre by value
                continue
            count = genre_counts.get(genre, None)
            if count is None or count < 100: # if genre not found or less than 100, skip it 
                continue
            if ',' in name: # surround in quote if name already has comma
                genre_file.write(f'"{name}",{genre}\n') 
            else:
                genre_file.write(f'{name},{genre}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
    
    genre_file.close()

In [25]:
write_new_genre_file(genre_data_original)

In [26]:
genre_data = pd.read_csv(NEW_FILES_DIRECTORY + 'genres.csv')
genre_data.sample(20)

Unnamed: 0,movie,genre
320995,Eggs Don't Bounce (1944),Family
1300451,Who Is Vermin Supreme? An Outsider Odyssey (2014),Drama
1310511,Women ceng jing de wuchanzhe (2009),Documentary
1041390,The Autograph Hound (1939),Animation
271031,Delivery (2013/III),Crime
905404,Rû=garû (2010),Animation
870777,Real Pretty Song (????),Comedy
1265269,Verloren (????),Thriller
1060381,The Code-Genesis (2011),Fantasy
1332529,Zlocin v dívcí skole (1966),Crime


In [27]:
rows, _ = genre_data.shape 
f'New data has {rows} titles'

'New data has 1340555 titles'

In [28]:
genre_data.genre.unique()

array(['Comedy', 'Animation', 'Drama', 'Documentary', 'History', 'War',
       'Horror', 'Sci-Fi', 'Adventure', 'Biography', 'Family', 'Action',
       'Romance', 'Musical', 'Sport', 'Fantasy', 'Mystery', 'Crime',
       'Thriller', 'Music', 'Western', 'Film-Noir'], dtype=object)

# Read old keywords file and write new one

In [29]:
with open(OLD_FILES_DIRECTORY + 'keywords.list', 'rb') as keywords_file: # read as binary file
    keywords_data_original = [line.decode('latin-1').strip() for line in keywords_file.readlines()]

In [30]:
len(keywords_data_original)

7582062

In [31]:
for i in range(60, 100): # Keywords + counts start on line 62 or index 61
    print(keywords_data_original[i])

keywords in use:
$100-bill (3)	$10000 (3)	$1000000-prize (2)
$1500-bottle-of-wine (1)	$20-bill (3)	$200000000 (1)
$5-bill (1)	$5-day (1)	$5000 (3)	$700-bank-check (1)
$98000-gift-card (1)		'50s-music (11)	'81-subaru (1)
'n-sync (1)	-184-degrees-fahrenheit (2)	-320-degrees-fahrenheit (1)
-40-degrees-fahrenheit (1)	.22-caliber-beretta (2)
.22-caliber-bullet (1)		.22-caliber-gun (7)
.22-caliber-pistol (3)		.22-caliber-revolver (3)
.22-caliber-rifle (1)		.22-caliber-semiautomatic-rifle (1)
.22-magnum-revolver (2)		.22-magnum-rifle (1)
.25-caliber-pistol (1)		.30-caliber-bullet (2)
.32-caliber-pistol (1)		.357-magnum (19)
.38-caliber-gun (5)		.38-calibre-pistol (3)
.38-silenced-revolver (1)	.38-snubnose-revolver (16)
.38-special (5)	.40-caliber-pistol (1)		.44-calibre-bullet (1)
.44-magnum (19)	.44-magnum-pistol (2)		.44-magnum-rifle (1)
.45-automatic (6)		.45-calibre-pistol (11)
.451-batting-average (1)	.50-caliber-pistol (2)
.50-calibre-bullet (6)		.50-calibre-gun (1)
.50-calibre-sniper-r

In [32]:
keyword_count_start = 61

In [33]:
keyword_count_end = 101649 # got this after a little trial and error

In [34]:
for i in range(keyword_count_end, keyword_count_end + 10):
    print(keywords_data_original[i])

zypora (2)	zz-top (3)	zz-top-impression (2)

5: Submission Rules

How To Add Keywords
-------------------

Send email to   adds@imdb.com   with the subject "ADD".  Above
the data you write the word "KEYWORD" on a line by itself.


In [35]:
keywords_data_original[61]
# format of a line. split by tabs, for each part get the keyword and count

'$100-bill (3)\t$10000 (3)\t$1000000-prize (2)'

In [36]:
keyword_count = dict()

for i in range(keyword_count_start, keyword_count_end + 1): # +1 cuz end index is not included
    line = keywords_data_original[i]
    parts = [x.strip() for x in re.split('\\t+', line)]
    for part in parts:
        try:
            keyword, count = [x.strip() for x in part.split(' ')]
            count = int(count[1:-1]) # don't need the parenthesis
            keyword_count[keyword] = count
        except Exception as e:
            print(i) # after inspection, these are just line empty lines 26 of them
            # there's a newline after each keyword group with the same letter
# keyword_count # it's a lot so I won't show

721
5378
11470
19628
24031
26742
32073
35091
38998
41285
42342
43654
46702
52598
54629
56167
62338
62576
79316
89967
95121
96377
97571
100917
100961
101465


In [37]:
movie_keyword_start = 101937 # got this after a little trial and error
for i in range(movie_keyword_start, movie_keyword_start + 20):
    print(keywords_data_original[i])

"#1 Single" (2006)					number-in-title
"#1MinuteNightmare" (2014)				web-series
"#30Nods" (2016)					friend
"#30Nods" (2016)					heroin
"#30Nods" (2016)					vlog
"#4Hire" (2017)						tv-mini-series
"#ATown" (2014)						austin-texas
"#ATown" (2014)						beer
"#ATown" (2014)						drugs
"#ATown" (2014)						friendship
"#ATown" (2014)						love
"#ATown" (2014)						texas
"#ATown" (2014)						web-series
"#Adulting" (2016/I)					millennial
"#Adulting" (2016/I)					web-series
"#AuVolant" (2016)					tv-mini-series
"#Bandcamp" (2014)					tv-mini-series
"#ByMySide" (2012)					twitter-hashtag-in-title
"#DailyPipTalk" (2016)					forex
"#DayOff" (2013)					tv-mini-series


In [38]:
def write_new_keywords_file(keywords_data_original): # list of lines 
    keywords_file = open(NEW_FILES_DIRECTORY + 'keywords.csv', 'w')
    keywords_file.write('movie,keyword\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    
    for i in range(movie_keyword_start, len(keywords_data_original)):
        line = keywords_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, keyword = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
                
            count = keyword_count.get(keyword, None)
            if count is None or count < 20 :
                continue
            if ',' in name: # surround in quote if name already has comma
                keywords_file.write(f'"{name}",{keyword}\n') 
            else:
                keywords_file.write(f'{name},{keyword}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
    
    keywords_file.close()

In [39]:
write_new_keywords_file(keywords_data_original)

In [40]:
keywords_data = pd.read_csv(NEW_FILES_DIRECTORY + "keywords.csv")
rows, _ = keywords_data.shape 
f'New file has {rows} titles'

'New file has 3763306 titles'

In [41]:
keywords_data.sample(20)

Unnamed: 0,movie,keyword
3667781,Winning (1969),teenage-boy
2179041,Only You (1994),canal
983767,Fantastic Beasts and Where to Find Them (2016),abusive-mother
2016109,Murderball (2005),cripple
24237,2047: Sights of Death (2014),soldier
2541642,Screen Test (1985),filmmaking
3664515,Wilsonov (2015),racist-comment
2149121,Office Space (1999),disgruntled-worker
1121859,Ghost of Mae Nak (2005),mysterious-stranger
2406401,Reflections of Youth (1975),youth


In [61]:
len(keywords_data.keyword.unique())/len(keyword_count) # he says it should be closer to 5%
# filter logic should be revised

0.1367987321711569

# Read old running times file and write new one 

In [43]:
with open(OLD_FILES_DIRECTORY + 'running-times.list', 'rb') as runtime_file: # read as binary file
    runtime_data_original = [line.decode('latin-1').strip() for line in runtime_file.readlines()]

In [44]:
for i in range(50): 
    print(runtime_data_original[i])

CRC: 0x4449AFEC  File: running-times.list  Date: Fri Dec 22 00:00:00 2017

Copyright 1991-2017 The Internet Movie Database Ltd. All rights reserved.

http://www.imdb.com

running-times.list

2017-12-19

-----------------------------------------------------------------------------

RUNNING TIMES LIST
"#1 Single" (2006)					30
"#1 Single" (2006) {Cats and Dogs (#1.4)}		20
"#1 Single" (2006) {Finishing a Chapter (#1.5)}		20
"#1 Single" (2006) {Is the Grass Greener? (#1.1)}	21
"#1 Single" (2006) {Stay (#1.8)}			20
"#1 Single" (2006) {The Rules of Dating (#1.3)}		20
"#1 Single" (2006) {Timing Is Everything (#1.7)}	20
"#1 Single" (2006) {Window Shopping (#1.2)}		20
"#1 Single" (2006) {Wingman (#1.6)}			20
"#15SecondScare" (2015)					1
"#15SecondScare" (2015) {Because We Don't Want You to Fall Asleep (#1.3)}	1
"#15SecondScare" (2015) {Coming and Going (#1.11)}	1
"#15SecondScare" (2015) {Doll Factory (#1.10)}		1
"#15SecondScare" (2015) {Don't Look (#1.6)}		1
"#15SecondScare" (2015) {Don't Take

In [45]:
runtime_data_original[14] # Actual data starts at line 15 or index 14

'"#1 Single" (2006)\t\t\t\t\t30'

In [46]:
# After inspection, many runtimes are in the format 'Country:runtime' and some have diff formats

def get_runtime(runtime_string):
    result = None
    try:
        if float(runtime_string) >= 60: # try to see if it's just a regular numeric
            result = runtime_string
    except ValueError: # it's probably in the format 'Country:runtime'
        parts = runtime_string.split(':') 
        if len(parts) == 2: # should give ['country', 'runtime'], but check
            _, runtime = parts
            try:
                if float(runtime) >= 60: # try to see if it's just a regular numeric
                    result = runtime # just the number portion
            except ValueError:
                # there are a lot of formats; for ease just check leading digits and make sure it's >= 60
                non_digit_index = get_first_non_digit_index(runtime) 
                if float(runtime[:non_digit_index]) >= 60:
                    result = runtime[:non_digit_index] # we can ignore the seconds; not important
    return result
    
    
def get_first_non_digit_index(string):
    for i, char in enumerate(string):
        if not char.isdigit():
            return i

In [47]:
def write_new_runtime_file(runtime_data_original): # list of lines 
    runtime_file = open(NEW_FILES_DIRECTORY + 'runtimes.csv', 'w')
    runtime_file.write('movie,runtime\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    
    for i in range(14, len(runtime_data_original)):
        line = runtime_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, runtime_string = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
                
            
            runtime = get_runtime(runtime_string)
            
            if runtime is None:
                continue
                
            if ',' in name: # surround in quote if name already has comma
                runtime_file.write(f'"{name}",{runtime}\n') 
            else:
                runtime_file.write(f'{name},{runtime}\n') 
        except Exception as e: # tried to process this line, but failed    
            print(line, e)
    
    runtime_file.close()

In [48]:
write_new_runtime_file(runtime_data_original)

-------------------------------------------------------------------------------- not enough values to unpack (expected 2, got 1)


In [49]:
runtime_data = pd.read_csv(NEW_FILES_DIRECTORY + "runtimes.csv")
rows, _ = runtime_data.shape 
f'New file has {rows} titles'

'New file has 283156 titles'

In [50]:
runtime_data.sample(20)

Unnamed: 0,movie,runtime
209420,Shizumanu taiyô (2009),202.0
125631,L'apetta Giulia e la signora Vita (2003),76.0
32386,Boys on Film 2: In Too Deep (2009),147.0
82583,From Behind the Sunflower (2005),81.0
177899,Out in the Desert (2013),80.0
83260,Fukushû no uta ga kikoeru (1968),90.0
251776,Thong Dee Fun Khao (2017),122.0
196674,Road Hard (2015),98.0
59417,Die große Liebe (1942),102.0
168560,No Intenso Agora (2017),127.0


# Read old release dates file and write new one

In [51]:
with open(OLD_FILES_DIRECTORY + 'release-dates.list', 'rb') as release_file: # read as binary file
    release_data_original = [line.decode('latin-1').strip() for line in release_file.readlines()]

In [52]:
for i in range(50):
    print(release_data_original[i])

CRC: 0x3F1CFDEF  File: release-dates.list  Date: Fri Dec 22 00:00:00 2017

Copyright 1991-2017 The Internet Movie Database Ltd. All rights reserved.

http://www.imdb.com

release-dates.list

2017-12-20

-----------------------------------------------------------------------------

RELEASE DATES LIST
"!Next?" (1994)						Italy:1 January 1994
"#1 Single" (2006)					USA:22 January 2006
"#1 Single" (2006) {Cats and Dogs (#1.4)}		USA:12 February 2006
"#1 Single" (2006) {Finishing a Chapter (#1.5)}		USA:19 February 2006
"#1 Single" (2006) {Is the Grass Greener? (#1.1)}	USA:22 January 2006
"#1 Single" (2006) {Stay (#1.8)}			USA:19 March 2006
"#1 Single" (2006) {The Rules of Dating (#1.3)}		USA:5 February 2006
"#1 Single" (2006) {Timing Is Everything (#1.7)}	USA:12 March 2006
"#1 Single" (2006) {Window Shopping (#1.2)}		USA:29 January 2006
"#1 Single" (2006) {Wingman (#1.6)}			USA:26 February 2006
"#15SecondScare" (2015) {Beauty Wrap}			USA:1 August 2016
"#15SecondScare" (2015) {Bubbles of Blo

In [53]:
# Actual data starts on line 15 or index 14
release_data_original[14] # format is 'Movie <tabs> Country:Date'
# for ease of access/subsets etc, I'll make 2 columns month, year

'"!Next?" (1994)\t\t\t\t\t\tItaly:1 January 1994'

In [54]:
# After inspection, some dates only have either month and year, year, or month year and day
# given a date string, return a list of just month and year

def get_date_parts(date_string):
    result = ['NA', 'NA']
    parts = re.split('\\s+', date_string) 
    
    if len(parts) == 1: # only year found
        result[1] = parts[0]
    elif len(parts) == 2: # month and year found
        result = parts 
    elif len(parts) == 3: # day month year found
        result = parts[1:]
    return result

In [55]:
def write_new_release_file(release_data_original): # list of lines 
    release_file = open(NEW_FILES_DIRECTORY + 'releases.csv', 'w')
    release_file.write('movie,month,year\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    
    for i in range(14, len(release_data_original)):
        line = release_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, release_string = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
            
            _, date_string = release_string.split(':')
            
            month, year = get_date_parts(date_string)
            
            
            if ',' in name: # surround in quote if name already has comma
                release_file.write(f'"{name}",{month},{year}\n') 
            else:
                release_file.write(f'{name},{month},{year}\n') 
        except Exception as e: # tried to process this line, but failed    
            print(line, e)
    
    release_file.close()

In [56]:
write_new_release_file(release_data_original)

-------------------------------------------------------------------------------- not enough values to unpack (expected 2, got 1)


In [57]:
release_data = pd.read_csv(NEW_FILES_DIRECTORY + "releases.csv")
rows, _ = release_data.shape 
f'New file has {rows} titles'

'New file has 1724266 titles'

In [58]:
release_data.sample(20)

Unnamed: 0,movie,month,year
414024,El incidente (2014),October,2015
67413,Aikansa edellä (1990),,1990
257398,Chipman (2004),February,2004
627301,Ich denke oft an Piroschka (1955),June,1957
623909,"I zhizn, i slyozy, i lyubov... (1984)",January,1985
178720,Black Eyes (1915),October,1915
72702,Aleksandr Nevskiy (1938),August,2002
1385548,The Devil's Rejects (2005),October,2005
1086035,Playgirls of Munich (1977),May,1988
910544,Men's Room (2015),February,2016
