## Part 1: Data Acquisition

### Step 1: Downloading the 'Kaggle 5000 Movie Dataset'

In [38]:
import os
import urllib.request # instead of 'import urllib in Python 2'
from itertools import islice

# Create the local data directory; exist_ok makes re-running this cell a no-op.
os.makedirs('./data', exist_ok=True)

kaggle_url = "https://users.encs.concordia.ca/~gregb/home/comp499data/movie_metadata.csv"

# Download only when no local copy exists yet. The guard has to test the very
# same path that urlretrieve writes to ('./data/...'); otherwise the check never
# succeeds and the file is fetched again on every run.
if not os.path.exists('./data/kaggle_dataset.csv'):
    response = urllib.request.urlretrieve(kaggle_url, './data/kaggle_dataset.csv')

# Python 3 note: Python 2's 'urllib.urlretrieve' became 'urllib.request.urlretrieve'.

### Step 2: Downloading the 'IMDB Plain Text Data'

In [45]:
import gzip

# Obtaining IMDB's text files: download each gzip archive into ./data and
# extract it next to the archive as a plain-text '.list' file.
imdb_url_prefix = 'https://users.encs.concordia.ca/~gregb/home/comp499data/'
imdb_files_list = ['genres.list.gz', 'ratings.list.gz']
for name in imdb_files_list:
    archive_path = './data/' + name
    list_path = archive_path[:-3]   # same path without the trailing '.gz'

    # Step 1: fetch the archive unless it is already cached locally.
    if not os.path.exists(archive_path):
        response = urllib.request.urlretrieve(imdb_url_prefix + name, archive_path)
        # Workaround kept from the original notebook: downloading two files in a
        # row from an FTP source failed unless urlretrieve's temporary state was
        # cleaned up in between.
        urllib.request.urlcleanup()

    # Step 2: extract independently of step 1, so an archive that is already
    # present but has no extracted '.list' file still gets unpacked.
    if not os.path.exists(list_path):
        # The archive content is ISO-8859-1. It is written back out explicitly as
        # UTF-8 because the later cells read these files with encoding="utf8";
        # relying on the platform default encoding would corrupt or drop
        # non-ASCII characters on systems whose default is not UTF-8.
        with gzip.open(archive_path) as comp_file, open(list_path, 'w', encoding='utf8') as reg_file:
            file_content = comp_file.read()
            reg_file.write(file_content.decode("iso-8859-1"))

# Python 3 notes: every Python 2 'urllib' call became 'urllib.request', and the
# bytes returned by gzip need an explicit decode("iso-8859-1") before writing as text.


### Step 3: Downloading the 'IMDB Prepared Data'

In [40]:
# Fetch the prepared IMDB dataset, skipping the download when a local copy is already present.
imdb_url = "https://users.encs.concordia.ca/~gregb/home/comp499data/imdb_dataset.csv"
imdb_dataset_missing = not os.path.exists('./data/imdb_dataset.csv')
if imdb_dataset_missing:
    response = urllib.request.urlretrieve(imdb_url, './data/imdb_dataset.csv')

## Part 2: Data Extraction

#### Content of the "ratings.list" data file

In [41]:

path = "./data/ratings.list"

# Read at most the first 38 lines. islice simply stops early on a short or
# truncated file, whereas calling next() 38 times would raise StopIteration.
with open(path, encoding="utf8", errors='ignore') as myfile:
    head = list(islice(myfile, 38))

# Skip the first 28 lines (descriptive headers) and show the 10 lines that follow.
print(''.join(head[28:38]))

# Python 3 note: had to add 'encoding="utf8"' when opening the file.

      0000000125  1888533   9.2  The Shawshank Redemption (1994)
      0000000125  1289428   9.2  The Godfather (1972)
      0000000124  889607   9.0  The Godfather: Part II (1974)
      0000000124  1864164   9.0  The Dark Knight (2008)
      0000000133  518449   8.9  12 Angry Men (1957)
      0000000133  971107   8.9  Schindler's List (1993)
      0000000123  1477112   8.9  Pulp Fiction (1994)
      0000000124  1349449   8.9  The Lord of the Rings: The Return of the King (2003)
      0000000123  559468   8.8  Il buono, il brutto, il cattivo (1966)
      0000000133  1513600   8.8  Fight Club (1999)



#### Content of the "genres.list" data file

In [42]:
path = "./data/genres.list"

# Read at most the first 392 lines. islice simply stops early on a short or
# truncated file, whereas calling next() 392 times would raise StopIteration.
with open(path, encoding="utf8", errors='ignore') as myfile:
    head = list(islice(myfile, 392))

# Skip the first 382 lines (descriptive header) and show the 10 lines that follow.
print(''.join(head[382:392]))

# Python 3 note: had to add 'encoding="utf8"' when opening the file.



"!Next?" (1994)						Documentary
"#1 Single" (2006)					Reality-TV
"#15SecondScare" (2015)					Horror
"#15SecondScare" (2015)					Short
"#15SecondScare" (2015)					Thriller
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Drama
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Horror
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Short



### Step 1: Extracting the information from "genres.list"

In [44]:
import re
import pandas as pd

# Pattern for one data line of genres.list. It captures three groups:
#   1. the title, with an optional surrounding double quote left out of the group,
#   2. the four-character year made of digits and/or '?', optionally followed by '/suffix',
#   3. the last whitespace-separated token of the line (word characters and '-'), i.e. the genre.
genre_line_pattern = re.compile(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)')

with open("./data/genres.list", encoding="utf8", errors='ignore') as genres_file:
    raw_content = genres_file.readlines()

# The first 382 lines of the file are descriptive header, not data.
content = raw_content[382:]

# Collect one [movie, year, genre] row per line that matches; other lines are skipped.
genres_list = []
for line in content:
    m = genre_line_pattern.match(line.strip())
    if m is not None:
        genres_list.append(list(m.groups()))

genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])

genres_data
    

Unnamed: 0,movie,year,genre
0,!Next?,1994,Documentary
1,#1 Single,2006,Reality-TV
2,#15SecondScare,2015,Horror
3,#15SecondScare,2015,Short
4,#15SecondScare,2015,Thriller
5,#15SecondScare,2015,Drama
6,#15SecondScare,2015,Horror
7,#15SecondScare,2015,Short
8,#15SecondScare,2015,Thriller
9,#1MinuteNightmare,2014,Horror
