In [77]:
import os # for file/directory manipulation
import re # for regex split
import pandas as pd # for dataframes
from collections import defaultdict

# Make a new directory to store the new files

In [3]:
OLD_FILES_DIRECTORY = './data_files/old_files/'
NEW_FILES_DIRECTORY = './data_files/new_files/'

if not os.path.exists(OLD_FILES_DIRECTORY): # after files are downloaded, show move them to this directory
    os.mkdir(OLD_FILES_DIRECTORY)
    
if not os.path.exists(NEW_FILES_DIRECTORY):
    os.mkdir(NEW_FILES_DIRECTORY)

# Read old movie file and write new one

In [7]:
# LATIN ENCODING SEEMS TO WORK OVER UTF-8; UTF-8/ASCII is not able to decode all the lines
with open(OLD_FILES_DIRECTORY + 'movies.list', 'rb') as movie_file: # read as binary file
    movie_data_original = [line.decode('latin-1').strip() for line in movie_file.readlines()]

'Number of Original Titles: 4697742'

In [9]:
# Sample of file 
for i in range(30):
    print(movie_data_original[i])

CRC: 0x8E8DBD74  File: movies.list  Date: Fri Dec 22 00:00:00 2017

Copyright 1991-2017 The Internet Movie Database Ltd. All rights reserved.

http://www.imdb.com

movies.list

20 Dec 2017

-----------------------------------------------------------------------------

MOVIES LIST

"!Next?" (1994)						1994-1995
"#1 Single" (2006)					2006-????
"#1 Single" (2006) {Cats and Dogs (#1.4)}		2006
"#1 Single" (2006) {Finishing a Chapter (#1.5)}		2006
"#1 Single" (2006) {Is the Grass Greener? (#1.1)}	2006
"#1 Single" (2006) {Stay (#1.8)}			2006
"#1 Single" (2006) {The Rules of Dating (#1.3)}		2006
"#1 Single" (2006) {Timing Is Everything (#1.7)}	2006
"#1 Single" (2006) {Window Shopping (#1.2)}		2006
"#1 Single" (2006) {Wingman (#1.6)}			2006
"#15SecondScare" (2015)					2015-????
"#15SecondScare" (2015) {Beauty Wrap}			2016
"#15SecondScare" (2015) {Because We Don't Want You to Fall Asleep (#1.3)}	????
"#15SecondScare" (2015) {Bubbles of Blood (#1.17)}	2016
"#15SecondScare" (2015) {Coming and Go

In [None]:
# ACTUAL DATA STARTS ON LINE 16, or index 15

In [32]:
def write_new_move_file(movie_data_original): # list of lines 
    movie_file = open(NEW_FILES_DIRECTORY + 'movies.csv', 'w')
    movie_file.write('movie,year\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    
    for i in range(15, len(movie_data_original)): # start from 15 cuz before is filler
        line = movie_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, year = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
            if '????' in year: # movie should have a defined year
                continue
            if ',' in name: # surround in quote if name already has comma
                movie_file.write(f'"{name}",{year}\n') 
            else:
                movie_file.write(f'{name},{year}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
            continue
    
    movie_file.close()


In [33]:
write_new_move_file(movie_data_original)

-------------------------------------------------------------------------------- not enough values to unpack (expected 2, got 1)


In [115]:
movie_data = pd.read_csv(NEW_FILES_DIRECTORY + 'movies.csv')

In [116]:
rows, _ = movie_data.shape
f'New Data has {rows} Titles'

'New Data has 1024456 Titles'

In [117]:
movie_data.head()

Unnamed: 0,movie,year
0,# (2012/I),2012
1,# (2012/II),2012
2,# (2014),2014
3,#1 (2005),2005
4,#1 (2009),2009


# Read old certificates file and write new one

In [12]:
with open(OLD_FILES_DIRECTORY + 'certificates.list', 'rb') as certificate_file: # read as binary file
    certificate_data_original = [line.decode('latin-1').strip() for line in certificate_file.readlines()]

In [13]:
# Sample of ratings file
for i in range(30):
    print(certificate_data_original[i])

CRC: 0x1697FD9F  File: certificates.list  Date: Fri Dec 22 00:00:00 2017

Copyright 1991-2017 The Internet Movie Database Ltd. All rights reserved.

http://www.imdb.com

certificates.list

2017-12-19

-----------------------------------------------------------------------------

CERTIFICATES LIST
"#1 Single" (2006)					USA:TV-PG
"#BlackLove" (2015) {Bringing Sexy Back (#1.3)}		USA:TV-14
"#BlackLove" (2015) {Crash the Party (#1.9)}		USA:TV-14
"#BlackLove" (2015) {Feeling Some Kinda Way (#1.7)}	USA:TV-14
"#BlackLove" (2015) {Like a Virgin (#1.4)}		USA:TV-14
"#BlackLove" (2015) {Making Lemonade Out of Lemons (#1.2)}	USA:TV-14
"#BlackLove" (2015) {Maybe Baby (#1.8)}			USA:TV-14
"#BlackLove" (2015) {Miss Independent (#1.5)}		USA:TV-14
"#BlackLove" (2015) {Pack Your Bags (#1.6)}		USA:TV-14
"#BlackLove" (2015) {Sealing the Deal (#1.10)}		USA:TV-14
"#BlackLove" (2015) {Sexy in the City (#1.1)}		USA:TV-14
"#Hashtag Travel" (2017)				USA:Unrated
"#LoveMonkeyChocolate" (2014)				UK:PG
"#LoveMonke

In [14]:
# ACTUAL DATA STARTS ON LINE 15 or index 14

In [50]:
def write_new_certificate_file(certificate_data_original): # list of lines 
    certificate_file = open(NEW_FILES_DIRECTORY + 'certificates.csv', 'w')
    certificate_file.write('movie,rating\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    vg_ratings = 'E, E10+, T, C, M, Not Rated'.split(', ')
    
    for i in range(14, len(certificate_data_original)): # start from 14 cuz before is filler
        line = certificate_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, rating = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
            if 'USA' not in rating: # skip non us ratings
                continue
            if 'USA:X' in rating or 'USA:NC-17' in rating or 'TV' in rating: # per rubric, skip these ratings
                continue
            usa, vgr = rating.split(':') 
            if any(vg_rating in vgf for vg_rating in vg_ratings):  # check if video game
                continue
            if ',' in name: # surround in quote if name already has comma
                certificate_file.write(f'"{name}",{rating}\n') 
            else:
                certificate_file.write(f'{name},{rating}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
            continue
    
    certificate_file.close()


In [51]:
write_new_certificate_file(certificate_data_original)

-------------------------------------------------------------------------------- not enough values to unpack (expected 2, got 1)


In [52]:
certificate_data = pd.read_csv(NEW_FILES_DIRECTORY + "certificates.csv")

In [53]:
rows, _ = certificate_data.shape
f'New data has {rows} Titles'

'New data has 55567 Titles'

In [54]:
certificate_data.head()

Unnamed: 0,movie,rating
0,#FollowFriday (2016),USA:Unrated
1,#OpenSeason (2017),USA:Unrated
2,#murderchallenge (2017),USA:Unrated
3,$ (1971),USA:R
4,$10 Raise (1935),USA:Approved


# Read old genre file and write new one

In [55]:
with open(OLD_FILES_DIRECTORY + 'genres.list', 'rb') as genre_file: # read as binary file
    genre_data_original = [line.decode('latin-1').strip() for line in genre_file.readlines()]

In [76]:
# Sample 1 of genre file 
for i in range(42, 78): # counts from index 42 to 77
    print(genre_data_original[i])

Short    	 680840
Drama    	 411259
Comedy    	 297238
Documentary    	 263981
Adult    	 82287
Thriller    	 79908
Action    	 79269
Romance    	 79009
Music    	 69599
Animation    	 65035
Horror    	 62687
Family    	 62126
Crime    	 55886
Adventure    	 48554
Fantasy    	 45128
Sci-Fi    	 41402
Mystery    	 37910
Biography    	 31414
History    	 27850
Sport    	 26610
Musical    	 21293
War    	 18570
Reality-TV    	 17921
News    	 16250
Western    	 16111
Talk-Show    	 13732
Game-Show    	 6113
Film-Noir    	 906
Reality-tv    	 45
Sci-fi    	 2
Sex    	 1
Lifestyle    	 1
Hardcore    	 1
Experimental    	 1
Erotica    	 1
Commercial    	 1


In [79]:
genre_counts = dict()
for i in range(42, 78): # counts for each genre start from line 43 to 78 or index 42 to 77
    line = genre_data_original[i]
    movie, count = [x.strip() for x in re.split('\\t+', line)]
    genre_counts[movie] = int(count)
genre_counts

{'Short': 680840,
 'Drama': 411259,
 'Comedy': 297238,
 'Documentary': 263981,
 'Adult': 82287,
 'Thriller': 79908,
 'Action': 79269,
 'Romance': 79009,
 'Music': 69599,
 'Animation': 65035,
 'Horror': 62687,
 'Family': 62126,
 'Crime': 55886,
 'Adventure': 48554,
 'Fantasy': 45128,
 'Sci-Fi': 41402,
 'Mystery': 37910,
 'Biography': 31414,
 'History': 27850,
 'Sport': 26610,
 'Musical': 21293,
 'War': 18570,
 'Reality-TV': 17921,
 'News': 16250,
 'Western': 16111,
 'Talk-Show': 13732,
 'Game-Show': 6113,
 'Film-Noir': 906,
 'Reality-tv': 45,
 'Sci-fi': 2,
 'Sex': 1,
 'Lifestyle': 1,
 'Hardcore': 1,
 'Experimental': 1,
 'Erotica': 1,
 'Commercial': 1}

In [64]:
# Sample 2 of genre file 
for i in range(380, 400):
    print(genre_data_original[i])


8: THE GENRES LIST

"!Next?" (1994)						Documentary
"#1 Single" (2006)					Reality-TV
"#15SecondScare" (2015)					Horror
"#15SecondScare" (2015)					Short
"#15SecondScare" (2015)					Thriller
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Drama
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Horror
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Short
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Thriller
"#1MinuteNightmare" (2014)				Horror
"#2G1S" (2016)						Crime
"#2WheelzNHeelz" (2017)					Reality-TV
"#30Nods Trailer" (2016)				Drama
"#30Nods" (2016)					Drama
"#4Hire" (2017)						Comedy
"#7DaysLater" (2013)					Comedy


In [80]:
# ACTUAL DATA START ON LINE 385 or index 384

In [83]:
def write_new_genre_file(genre_data_original): # list of lines 
    genre_file = open(NEW_FILES_DIRECTORY + 'genres.csv', 'w')
    genre_file.write('movie,genre\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    skip_genres = 'Short, Adult, Reality-TV, Talk-Show, Game-Show, News'.split(', ')
    
    for i in range(384, len(genre_data_original)): # start from 384 cuz before is filler
        line = genre_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, genre = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
            if any(g in genre for g in skip_genres): # check if should skip this genre by value
                continue
            count = genre_counts.get(genre, None)
            if count is None or count < 100: # if genre not found or less than 100, skip it 
                continue
            if ',' in name: # surround in quote if name already has comma
                genre_file.write(f'"{name}",{genre}\n') 
            else:
                genre_file.write(f'{name},{genre}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
            continue
    
    genre_file.close()

In [84]:
write_new_genre_file(genre_data_original)

In [86]:
genre_data = pd.read_csv(NEW_FILES_DIRECTORY + 'genres.csv')
genre_data.head(10)

Unnamed: 0,movie,genre
0,# (2012/I),Comedy
1,# (2012/II),Animation
2,# (2014),Comedy
3,#1 (2005),Drama
4,#1 (2009),Animation
5,#1 (2010),Comedy
6,#1 (2018),Documentary
7,#1 Beauty Nail Salon (2014),Drama
8,#1 Beauty Nail Salon (2014),History
9,#1 Beauty Nail Salon (2014),War


In [87]:
rows, _ = genre_data.shape 
f'New data has {rows} titles'

'New data has 1340555 titles'

In [88]:
genre_data.genre.unique()

array(['Comedy', 'Animation', 'Drama', 'Documentary', 'History', 'War',
       'Horror', 'Sci-Fi', 'Adventure', 'Biography', 'Family', 'Action',
       'Romance', 'Musical', 'Sport', 'Fantasy', 'Mystery', 'Crime',
       'Thriller', 'Music', 'Western', 'Film-Noir'], dtype=object)

# Read old keywords file and write new one

In [89]:
with open(OLD_FILES_DIRECTORY + 'keywords.list', 'rb') as keywords_file: # read as binary file
    keywords_data_original = [line.decode('latin-1').strip() for line in keywords_file.readlines()]

In [97]:
len(keywords_data_original)

7582062

In [102]:
for i in range(60, 100): # Keywords + counts start on line 62 or index 61
    print(keywords_data_original[i])

keywords in use:
$100-bill (3)	$10000 (3)	$1000000-prize (2)
$1500-bottle-of-wine (1)	$20-bill (3)	$200000000 (1)
$5-bill (1)	$5-day (1)	$5000 (3)	$700-bank-check (1)
$98000-gift-card (1)		'50s-music (11)	'81-subaru (1)
'n-sync (1)	-184-degrees-fahrenheit (2)	-320-degrees-fahrenheit (1)
-40-degrees-fahrenheit (1)	.22-caliber-beretta (2)
.22-caliber-bullet (1)		.22-caliber-gun (7)
.22-caliber-pistol (3)		.22-caliber-revolver (3)
.22-caliber-rifle (1)		.22-caliber-semiautomatic-rifle (1)
.22-magnum-revolver (2)		.22-magnum-rifle (1)
.25-caliber-pistol (1)		.30-caliber-bullet (2)
.32-caliber-pistol (1)		.357-magnum (19)
.38-caliber-gun (5)		.38-calibre-pistol (3)
.38-silenced-revolver (1)	.38-snubnose-revolver (16)
.38-special (5)	.40-caliber-pistol (1)		.44-calibre-bullet (1)
.44-magnum (19)	.44-magnum-pistol (2)		.44-magnum-rifle (1)
.45-automatic (6)		.45-calibre-pistol (11)
.451-batting-average (1)	.50-caliber-pistol (2)
.50-calibre-bullet (6)		.50-calibre-gun (1)
.50-calibre-sniper-r

In [163]:
keyword_count_start = 61

In [164]:
keyword_count_end = 101649 # got this after a little trial and error

In [165]:
for i in range(keyword_count_end, keyword_count_end + 10):
    print(keywords_data_original[i])

zypora (2)	zz-top (3)	zz-top-impression (2)

5: Submission Rules

How To Add Keywords
-------------------

Send email to   adds@imdb.com   with the subject "ADD".  Above
the data you write the word "KEYWORD" on a line by itself.


In [166]:
keywords_data_original[61]
# format of a line. split by tabs, for each part get the keyword and count

'$100-bill (3)\t$10000 (3)\t$1000000-prize (2)'

In [185]:
keyword_count = dict()

for i in range(keyword_count_start, keyword_count_end + 1): # +1 cuz end index is not included
    line = keywords_data_original[i]
    parts = [x.strip() for x in re.split('\\t+', line)]
    for part in parts:
        try:
            keyword, count = [x.strip() for x in part.split(' ')]
            count = int(count[1:-1]) # don't need the parenthesis
            keyword_count[keyword] = count
        except Exception as e:
            print(i) # after inspection, these are just line empty lines 26 of them
            # there's a newline after each keyword group with the same letter
# keyword_count # it's a lot so I won't show

721
5378
11470
19628
24031
26742
32073
35091
38998
41285
42342
43654
46702
52598
54629
56167
62338
62576
79316
89967
95121
96377
97571
100917
100961
101465


In [159]:
movie_keyword_start = 101937 # got this after a little trial and error
for i in range(movie_keyword_start, movie_keyword_start + 20):
    print(keywords_data_original[i])

"#1 Single" (2006)					number-in-title
"#1MinuteNightmare" (2014)				web-series
"#30Nods" (2016)					friend
"#30Nods" (2016)					heroin
"#30Nods" (2016)					vlog
"#4Hire" (2017)						tv-mini-series
"#ATown" (2014)						austin-texas
"#ATown" (2014)						beer
"#ATown" (2014)						drugs
"#ATown" (2014)						friendship
"#ATown" (2014)						love
"#ATown" (2014)						texas
"#ATown" (2014)						web-series
"#Adulting" (2016/I)					millennial
"#Adulting" (2016/I)					web-series
"#AuVolant" (2016)					tv-mini-series
"#Bandcamp" (2014)					tv-mini-series
"#ByMySide" (2012)					twitter-hashtag-in-title
"#DailyPipTalk" (2016)					forex
"#DayOff" (2013)					tv-mini-series


In [186]:
def write_new_keywords_file(keywords_data_original): # list of lines 
    keywords_file = open(NEW_FILES_DIRECTORY + 'keywords.csv', 'w')
    keywords_file.write('movie,keyword\n') # write a header
    
    # internet doesn't seem to have anything, not reliable
    skips = ('(tv)', '(v)', '(vg)', 're-release', 'blu-ray')
    
    for i in range(movie_keyword_start, len(keywords_data_original)):
        line = keywords_data_original[i]
        
        if line.startswith('"'):
            continue
        try: # some lines may not be in correct format
            parts = [x.strip() for x in re.split('\\t+', line)] # split by 1 or more tabs
            name, keyword = parts[:2]
            if any(skip in name.lower() for skip in skips): # check is any of the skippables are in the name
                continue
                
            count = keyword_count.get(keyword, None)
            if count is None or count < 20 :
                continue
            if ',' in name: # surround in quote if name already has comma
                keywords_file.write(f'"{name}",{keyword}\n') 
            else:
                keywords_file.write(f'{name},{keyword}\n') 
        except Exception as e: # tried to process this line, but failed
            print(line, e)
            continue
    
    keywords_file.close()

In [187]:
write_new_keywords_file(keywords_data_original)

In [188]:
keywords_data = pd.read_csv(NEW_FILES_DIRECTORY + "keywords.csv")
rows, _ = keywords_data.shape 
f'New file has {rows} titles'

'New file has 3763306 titles'

In [189]:
keywords_data.head(10)

Unnamed: 0,movie,keyword
0,# (2012/II),stop-motion
1,#1 (2010),janitor
2,#1 (2010),magic
3,#1 (2010),pencil
4,#1 (2010),school
5,#1 (2010),surprise
6,#1 (2018),based-on-book
7,#1 at the Apocalypse Box Office (2015),australian-apocalyptic
8,#1 at the Apocalypse Box Office (2015),australian-science-fiction
9,#137 (2011),australian-future-earth


In [191]:
len(keywords_data.keyword.unique())/len(keyword_count) # he says it should be closer to 5%
# filter logic should be revised

0.1367987321711569

# Read old running times file and write new one 