## Part 1 : Data Acquisition

### Step 1: Downloading the 'Kaggle 5000 Movie Dataset'

In [1]:
import urllib.request # instead of 'import urllib in Python 2'
import os

# Make sure the local cache directory exists before anything is downloaded into it.
os.makedirs('./data', exist_ok=True)

# Download the Kaggle movie metadata only when no local copy is cached yet.
# The guard must test the same path the download writes to ('./data/...');
# the previous '.data/...' spelling never matched, so the file was fetched on every run.
kaggle_url = "https://users.encs.concordia.ca/~gregb/home/comp499data/movie_metadata.csv"
if not os.path.exists('./data/kaggle_dataset.csv'):
    response = urllib.request.urlretrieve(kaggle_url, './data/kaggle_dataset.csv')

    # had to change 'urllib.urlretrive' for 'urllib.request.urlretrieve'

### Step 2: Downloading the 'IMDB Plain Text Data'

In [2]:
import gzip

# Obtaining IMDB's text files.
# Download and extraction are guarded separately: a cached archive whose extracted
# '.list' file is missing (e.g. after an interrupted run) is still decompressed,
# and a freshly downloaded archive is always re-extracted.
imdb_url_prefix = 'https://users.encs.concordia.ca/~gregb/home/comp499data/'
imdb_files_list = ['genres.list.gz', 'ratings.list.gz']
for name in imdb_files_list:
    archive_path = './data/' + name
    extracted_path = './data/' + name[:-3]   # same name without the '.gz' suffix

    freshly_downloaded = not os.path.exists(archive_path)
    if freshly_downloaded:
        response = urllib.request.urlretrieve(imdb_url_prefix + name, archive_path)
        urllib.request.urlcleanup()   # urllib fails to download two files from a ftp source. This fixes the bug!

    if freshly_downloaded or not os.path.exists(extracted_path):
        with gzip.open(archive_path) as comp_file, open(extracted_path, 'w') as reg_file:
            file_content = comp_file.read()
            # The archive content is decoded as iso-8859-1 before being written as text.
            reg_file.write(file_content.decode("iso-8859-1"))


# had to change all calls 'urllib' in Python 2 to Python 3's 'urllib.request'
# also had to add 'decode(iso-8859-1")'


### Step 3: Downloading the 'IMDB Prepared Data'

In [3]:
# Fetch the prepared IMDB dataset unless a local copy is already present.
imdb_url = "https://users.encs.concordia.ca/~gregb/home/comp499data/imdb_dataset.csv"
imdb_csv_is_cached = os.path.exists('./data/imdb_dataset.csv')
if not imdb_csv_is_cached:
    response = urllib.request.urlretrieve(imdb_url, './data/imdb_dataset.csv')

## Part 2: Data Extraction

#### Content of the "ratings.list" data file

In [4]:
# Read the first 38 lines of ratings.list and print the last ten of them.
path = "./data/ratings.list"
with open(path) as myfile:
    head = [next(myfile) for line_no in range(38)]
preview_rows = head[28:38]   # skipping the first 28 lines as they are descriptive headers
print(''.join(preview_rows))


      0000000125  1888533   9.2  The Shawshank Redemption (1994)
      0000000125  1289428   9.2  The Godfather (1972)
      0000000124  889607   9.0  The Godfather: Part II (1974)
      0000000124  1864164   9.0  The Dark Knight (2008)
      0000000133  518449   8.9  12 Angry Men (1957)
      0000000133  971107   8.9  Schindler's List (1993)
      0000000123  1477112   8.9  Pulp Fiction (1994)
      0000000124  1349449   8.9  The Lord of the Rings: The Return of the King (2003)
      0000000123  559468   8.8  Il buono, il brutto, il cattivo (1966)
      0000000133  1513600   8.8  Fight Club (1999)



#### Content of the "genres.list" data file

In [5]:
# Read the first 392 lines of genres.list and print the last ten of them.
path = "./data/genres.list"
with open(path) as myfile:
    head = [next(myfile) for line_no in range(392)]
preview_rows = head[382:392]   # skipping the first 382 lines as they are descriptive header
print(''.join(preview_rows))



"!Next?" (1994)						Documentary
"#1 Single" (2006)					Reality-TV
"#15SecondScare" (2015)					Horror
"#15SecondScare" (2015)					Short
"#15SecondScare" (2015)					Thriller
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Drama
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Horror
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Short



### Step 1: Extracting the information from "genres.list"

In [6]:
import re
import pandas as pd

# Captures, in order: the title (surrounding double quotes dropped), the 4-character
# year (digits or '?'), and the trailing genre token of one genres.list line.
genre_line_pattern = re.compile(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)')

with open("./data/genres.list") as genres_file:
    raw_content = genres_file.readlines()

content = raw_content[382:]   # the first 382 lines are descriptive header
genre_matches = (genre_line_pattern.match(raw_line.strip()) for raw_line in content)
# Lines the pattern does not match are skipped silently.
genres_list = [
    [hit.group(1), hit.group(2), hit.group(3)]
    for hit in genre_matches
    if hit is not None
]
genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])

genres_data
    

Unnamed: 0,movie,year,genre
0,!Next?,1994,Documentary
1,#1 Single,2006,Reality-TV
2,#15SecondScare,2015,Horror
3,#15SecondScare,2015,Short
4,#15SecondScare,2015,Thriller
5,#15SecondScare,2015,Drama
6,#15SecondScare,2015,Horror
7,#15SecondScare,2015,Short
8,#15SecondScare,2015,Thriller
9,#1MinuteNightmare,2014,Horror


In [7]:
# Captures, in order: the rating, the title (surrounding double quotes dropped)
# and the 4-character year (digits or '?') of one ratings.list line.
rating_line_pattern = re.compile(
    r'(?:\d|\.|\*){10}\s+\d+\s+(1?\d\.\d)\s"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\)')

with open("./data/ratings.list") as ratings_file:
    raw_content = ratings_file.readlines()

content = raw_content[28:]   # the first 28 lines are descriptive header
rating_matches = (rating_line_pattern.match(raw_line.strip()) for raw_line in content)
# Lines the pattern does not match are skipped silently; columns are reordered
# to (movie, year, rating).
ratings_list = [
    [hit.group(2), hit.group(3), hit.group(1)]
    for hit in rating_matches
    if hit is not None
]
ratings_data = pd.DataFrame(ratings_list, columns=['movie', 'year', 'rating'])

ratings_data
    

Unnamed: 0,movie,year,rating
0,The Shawshank Redemption,1994,9.2
1,The Godfather,1972,9.2
2,The Godfather: Part II,1974,9.0
3,The Dark Knight,2008,9.0
4,12 Angry Men,1957,8.9
5,Schindler's List,1993,8.9
6,Pulp Fiction,1994,8.9
7,The Lord of the Rings: The Return of the King,2003,8.9
8,"Il buono, il brutto, il cattivo",1966,8.8
9,Fight Club,1999,8.8


## Part 3: Data Profiling and Cleaning

### Step 1: Loading the “Kaggle 5000 Movie Dataset”

In [8]:
import pandas as pd

# Read the Kaggle CSV downloaded in Part 1 into a dataframe and display it.
kaggle_csv_path = './data/kaggle_dataset.csv'
kaggle_data = pd.read_csv(kaggle_csv_path)
kaggle_data

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


### Step 2: Calculating Some Basic Statistics (Profiling)

In [9]:
# Print the number of rows held by each of the three dataframes.
row_count_reports = [
    ('Number of movies in kaggle_data: {}', kaggle_data),
    ('Number of movies in genres_data: {}', genres_data),
    ('Number of movies in ratings_data: {}', ratings_data),
]
for report_template, frame in row_count_reports:
    print(report_template.format(frame.shape[0]))

Number of movies in kaggle_data: 5043
Number of movies in genres_data: 2658941
Number of movies in ratings_data: 789415


### Step 3: Dealing with duplicates (cleaning)

In [10]:
# Count the rows that share their (title, year) key with at least one other row.
# keep=False flags every member of a duplicate group, not only the repeats.
duplicate_reports = [
    ('Number of duplicates in kaggle_data: {}', kaggle_data, ['movie_title', 'title_year']),
    ('Number of duplicates in genres_data: {}', genres_data, ['movie', 'year']),
    ('Number of duplicates in ratings_data: {}', ratings_data, ['movie', 'year']),
]
for report_template, frame, key_columns in duplicate_reports:
    duplicate_flags = frame.duplicated(subset=key_columns, keep=False)
    print(report_template.format(sum(duplicate_flags)))

Number of duplicates in kaggle_data: 241
Number of duplicates in genres_data: 2031322
Number of duplicates in ratings_data: 342815


In [11]:
# kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False)

# help(pd.DataFrame.duplicated)

# print all the duplicated elements indexes   
# dupli = kaggle_data.duplicated(subset=['movie_title', 'title_year'], keep=False)
# for idx, val in enumerate(dupli):
#    if val == True:
#        print(idx)

In [12]:
# Keep only the first row of every (title, year) group in each dataset.
kaggle_key_columns = ['movie_title', 'title_year']
plain_text_key_columns = ['movie', 'year']

kaggle_data = kaggle_data.drop_duplicates(subset=kaggle_key_columns, keep='first').copy()
genres_data = genres_data.drop_duplicates(subset=plain_text_key_columns, keep='first').copy()
ratings_data = ratings_data.drop_duplicates(subset=plain_text_key_columns, keep='first').copy()


### Step 4: Normalizing the text (cleaning)

The key attribute that we will use to integrate our movie datasets is the movie titles. So it is important to normalize these titles. The following code snippet makes all movie titles lower case, and then removes certain characters such as “‘” and “?”, and replaces some other special characters (e.g., “&” is replaced with “and”)

In [13]:
# Single-character substitutions applied to every lower-cased title:
# ',' becomes a space, "'" and '?' are removed, '&' is spelled out as 'and'.
_TITLE_SUBSTITUTIONS = str.maketrans({',': ' ', "'": None, '&': 'and', '?': None})


def preprocess_title(title):
    """Return a normalized form of a movie title for matching.

    The title is lower-cased, the substitutions above are applied, and
    leading/trailing whitespace is stripped. Whitespace inside the title is
    left as is, so a replaced comma can yield two consecutive spaces.
    """
    # title = title.decode('utf-8', 'ignore')  in Python 3, all str is unicode object
    return title.lower().translate(_TITLE_SUBSTITUTIONS).strip()

# Add a normalized-title column to each dataset: (frame, source column, new column).
title_normalization_plan = [
    (kaggle_data, 'movie_title', 'norm_movie_title'),
    (genres_data, 'movie', 'norm_movie'),
    (ratings_data, 'movie', 'norm_movie'),
]
for frame, raw_title_col, norm_title_col in title_normalization_plan:
    frame[norm_title_col] = frame[raw_title_col].map(preprocess_title)

### Step 5: Looking at a few samples

In [14]:
# Show three randomly drawn rows of the Kaggle data.
# pandas.DataFrame.sample returns a random sample of items from an axis of the object;
# a fixed random_state makes the draw reproducible.
kaggle_data.sample(n=3, random_state=0)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,norm_movie_title
4422,Color,Simeon Rice,6.0,93.0,6.0,56.0,Lisa Brave,393.0,,Action|Horror|Thriller,...,English,USA,R,1500000.0,2014.0,191.0,5.5,2.35,307,unsullied
1022,Color,Doug Liman,214.0,108.0,218.0,405.0,Ty Burrell,6000.0,9528092.0,Biography|Drama|Thriller,...,English,USA,PG-13,22000000.0,2010.0,3000.0,6.8,2.35,9000,fair game
3631,Color,Jonathan Levine,147.0,99.0,129.0,362.0,Aaron Yoo,976.0,2077046.0,Comedy|Drama|Romance,...,English,USA,R,6000000.0,2008.0,617.0,7.0,2.35,0,the wackness


The title_year attribute is stored as floats (i.e., rational numbers).
We can add another cleaning step to transform the title_year into strings and replace the missing title years with symbol “?”.

In [82]:
def preprocess_year(year):
    """Return the year as an integer-formatted string, or '?' when it is missing.

    Missing means whatever ``pd.isnull`` reports as null (e.g. NaN or None).
    Any other value is passed through ``int`` before being converted to ``str``.
    """
    return '?' if pd.isnull(year) else str(int(year))

# Series.map applies preprocess_year to every value of the title_year column.
normalized_years = kaggle_data['title_year'].map(preprocess_year)
kaggle_data['norm_title_year'] = normalized_years
kaggle_data.head()


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,norm_movie_title,norm_title_year,id,mixture
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,237000000.0,2009.0,936.0,7.9,1.78,33000,avatar,2009,0,avatar 2009 237000000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,300000000.0,2007.0,5000.0,7.1,2.35,0,pirates of the caribbean: at worlds end,2007,1,pirates of the caribbean: at worlds end 2007 300000000
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,245000000.0,2015.0,393.0,6.8,2.35,85000,spectre,2015,2,spectre 2015 245000000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,250000000.0,2012.0,23000.0,8.5,2.35,164000,the dark knight rises,2012,3,the dark knight rises 2012 250000000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,12.0,7.1,,0,star wars: episode vii - the force awakens,?,4,


## Part 4: Data Matching & Merging

### Step 1: Integrating the “IMDB Plain Text Data” files

Note that both the ratings_data and genres_data dataframes contain data that come from the same source (i.e., the “IMDB Plain Text Data”). Thus, we assume that there are no inconsistencies between the data stored in these dataframes, and to combine them all we need to do is match the entries that share the same title and production year. This simple “exact match” can be done directly with dataframes.

In [83]:
# Inner join: keep only the (normalized title, year) pairs present in both tables.
brief_imdb_data = ratings_data.merge(genres_data, how='inner', on=['norm_movie', 'year'])
brief_imdb_data.head()

Unnamed: 0,movie_x,year,rating,norm_movie,movie_y,genre
0,The Shawshank Redemption,1994,9.2,the shawshank redemption,The Shawshank Redemption,Crime
1,The Godfather,1972,9.2,the godfather,The Godfather,Crime
2,The Godfather: Part II,1974,9.0,the godfather: part ii,The Godfather: Part II,Crime
3,The Dark Knight,2008,9.0,the dark knight,The Dark Knight,Action
4,12 Angry Men,1957,8.9,12 angry men,12 Angry Men,Crime


We refer to the dataset created above as the brief_imdb_data since it only contains two attributes (namely, genre and rating). Henceforth, we are going to use a richer version of the IMDB dataset which we created by integrating a number of files from the “IMDB Plain Text Data”. If you have completed the first part of this tutorial, then this dataset is already downloaded and stored in “imdb_dataset.csv” under the “data” folder. The following code snippet loads this dataset, does preprocessing on the title and production year of movies, removes the duplicates as before, and prints the size of the dataset.

In [84]:
# Read the richer IMDB dataset, add normalized title/year columns using the
# functions defined in Part 3, then keep the first row of every
# (norm_title, norm_year) group.
imdb_data = (
    pd.read_csv('./data/imdb_dataset.csv')
    .assign(
        norm_title=lambda frame: frame['title'].map(preprocess_title),
        norm_year=lambda frame: frame['year'].map(preprocess_year),
    )
    .drop_duplicates(subset=['norm_title', 'norm_year'], keep='first')
    .copy()
)
imdb_data.shape


(869178, 27)

### Step 2: Integrating the Kaggle and IMDB datasets

A simple approach to integrate the two datasets is to simply join entries that share the same movie title and year of production. 

In [18]:
# Exact-match join of IMDB and Kaggle rows on normalized title and year.
data_attempt1 = imdb_data.merge(
    kaggle_data,
    how='inner',
    left_on=['norm_title', 'norm_year'],
    right_on=['norm_movie_title', 'norm_title_year'],
)
data_attempt1.shape


(4248, 57)

But given that the IMDB and Kaggle datasets are collected from different sources, chances are that the name of a movie would be slightly different in these datasets (e.g. “Wall.E” vs “WallE”). To be able to find such matches, one can look at the similarity of movie titles and consider titles with high similarity to be the same entity. BigGorilla provides a Python package named py_stringsimjoin for doing similarity joins across two datasets. The following code snippet uses py_stringsimjoin to match all the titles that have an edit distance of one or less (i.e., there is at most one character that needs to be changed/added/removed to make both titles identical). Once the similarity join is complete, it only selects the title pairs that are produced in the same year.

In [20]:
import py_stringsimjoin as ssj
    # had to run: pip install py-stringsimjoin
    # consider title with high similarity to be the same entity
import py_stringmatching as sm
    # had to run:  pip install py_entitymatching

# Give both tables a running integer 'id' column; it is passed to the join as the key.
imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])

# Join on the normalized titles with an edit-distance threshold of 1.
similar_titles = ssj.edit_distance_join(
    imdb_data, kaggle_data,
    'id', 'id',
    'norm_title', 'norm_movie_title',
    l_out_attrs=['norm_title', 'norm_year'],
    r_out_attrs=['norm_movie_title', 'norm_title_year'],
    threshold=1)

# selecting the entries that have the same production year
same_production_year = similar_titles.r_norm_title_year == similar_titles.l_norm_year
data_attempt2 = similar_titles[same_production_year]
data_attempt2.shape


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:49


(4679, 8)

We can see that, using the similarity join, 4,679 titles were matched. Let’s look at some of the titles that are matched by the similarity join but are not identical.

In [86]:
# Show the first joined pairs whose normalized titles are not identical.
titles_differ = data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title
data_attempt2[titles_differ].head()

Unnamed: 0,_id,l_id,r_id,l_norm_title,l_norm_year,r_norm_movie_title,r_norm_title_year,_sim_score
145,145,852736,46,world war v,2013,world war z,2013,1.0
175,175,281649,56,grave,2012,brave,2012,1.0
243,243,816188,67,upe,2009,up,2009,1.0
250,250,817366,67,ut,2009,up,2009,1.0
265,265,316745,70,hug,2011,hugo,2011,1.0


### Step 3: Using Magellan for Data Matching

#### Substep A: Finding a candidate set (Blocking)

The goal of this step is to limit the number of pairs that we consider as potential matches using a simple heuristic. For this task, we can create a new column in each dataset that combines the values of important attributes into a single string (which we call the mixture). Then, we can use the string similarity join as before to find a set of entities that have some overlap in the values of the important columns. Before doing that, we need to transform the columns that are part of the mixture to strings. The py_stringsimjoin package allows us to do so easily.

In [122]:
# Build a 'mixture' column in each table: normalized title, normalized year and
# budget joined by single spaces, to facilitate string comparison (matching pairs).
# The budget column is first converted to strings in place.
mixture_plan = [
    (imdb_data, 'norm_title', 'norm_year'),
    (kaggle_data, 'norm_movie_title', 'norm_title_year'),
]
for frame, title_col, year_col in mixture_plan:
    ssj.utils.converter.dataframe_column_to_str(frame, 'budget', inplace=True)
    frame['mixture'] = frame[title_col] + ' ' + frame[year_col] + ' ' + frame['budget']


Now, we can use the mixture columns to create a desired candidate set which we call C.

In [123]:
# Candidate set C: join the two 'mixture' columns on whitespace tokens with an
# overlap-coefficient threshold of 0.65, carrying along the listed attributes.
kaggle_out_attrs = ['norm_movie_title', 'norm_title_year', 'duration', 'budget', 'content_rating']
imdb_out_attrs = ['norm_title', 'norm_year', 'length', 'budget', 'mpaa']
C = ssj.overlap_coefficient_join(
    kaggle_data, imdb_data,
    'id', 'id',
    'mixture', 'mixture',
    sm.WhitespaceTokenizer(),
    l_out_attrs=kaggle_out_attrs,
    r_out_attrs=imdb_out_attrs,
    threshold=0.65)

print(C)

0% [##############################] 100% | ETA: 00:00:00

         _id  l_id    r_id              l_norm_movie_title l_norm_title_year  \
0          0  4841      99            dude  wheres my dog!              2014   
1          1  4337     106                       road hard              2015   
2          2  4342     106           me you and five bucks              2015   
3          3  4351     106                       checkmate              2015   
4          4  4352     106                         #horror              2015   
5          5  4871     183            the image revolution              2014   
6          6  3429     288           house of 1000 corpses              2003   
7          7  4029     368                       compadres              2016   
8          8  2726     450             crocodile dundee ii              1988   
9          9  4775     630                            burn              2012   
10        10  4471     733                         orgazmo              1997   
11        11  3406     838              


Total time elapsed: 00:00:12


We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs.

#### Substep B: Specifying the keys

The next step is to tell the py_entitymatching package which columns correspond to the keys in each dataframe. We also need to specify which columns of the candidate set correspond to the foreign keys of the two dataframes.

In [124]:
# had to run on terminal:
# conda install -c conda-forge py_entitymatching

In [125]:
import py_entitymatching as em
# Register key / foreign-key metadata for the two source tables and the candidate
# set C with py_entitymatching. The statement order is kept exactly as written.
# The value of the last call is what the cell displays (True in the recorded output).
em.set_key(kaggle_data, 'id')   # specifying the key column in the kaggle dataset
em.set_key(imdb_data, 'id')     # specifying the key column in the imdb dataset
em.set_key(C, '_id')            # specifying the key in the candidate set
em.set_ltable(C, kaggle_data)   # specifying the left table (kaggle_data was the left input of the join that built C)
em.set_rtable(C, imdb_data)     # specifying the right table (imdb_data was the right input of that join)
em.set_fk_rtable(C, 'r_id')     # specifying the column that matches the key in the right table
em.set_fk_ltable(C, 'l_id')     # specifying the column that matches the key in the left table


True

#### Substep C: Debugging the blocker

Now, we need to make sure that the candidate set is loose enough to include pairs of movies that are not very close. If this is not the case, there is a chance that we have eliminated pairs that could potentially be matched together. By looking at a few pairs from the candidate set, we can judge whether the blocking step has been too harsh or not.

Note: The py_entitymatching package provides some tools for debugging the blocker as well.

In [126]:
# Put the corresponding left/right attributes next to each other for eyeballing.
side_by_side_columns = [
    'l_norm_movie_title', 'r_norm_title',
    'l_norm_title_year', 'r_norm_year',
    'l_budget', 'r_budget',
    'l_content_rating', 'r_mpaa',
]
C[side_by_side_columns].head()

Unnamed: 0,l_norm_movie_title,r_norm_title,l_norm_title_year,r_norm_year,l_budget,r_budget,l_content_rating,r_mpaa
0,dude wheres my dog!,#hacked,2014,2014,20000,20000,PG,
1,road hard,#horror,2015,2015,1500000,1500000,,
2,me you and five bucks,#horror,2015,2015,1500000,1500000,,
3,checkmate,#horror,2015,2015,1500000,1500000,,
4,#horror,#horror,2015,2015,1500000,1500000,Not Rated,


#### Substep D: Sampling from the candidate set

The goal of this step is to obtain a sample from the candidate set and manually label the sampled candidates; that is, to specify if the candidate pair is a correct match or not.

In [127]:
# Draw 500 candidate pairs (fixed seed) and write them to CSV for manual labelling.
sampled = C.sample(n=500, random_state=0)
sampled.to_csv('./data/sampled.csv', encoding='utf-8')

In order to label the sampled data, we can create a new column in the .csv file (which we call label) and put the value 1 under that column if the pair is a correct match and 0 otherwise. To avoid overwriting the sampled file, let’s save the labeled version under a new name, labeled.csv.

In [128]:
import os
import urllib.request

# Fetch the pre-labeled sample, but only when no local labeled.csv exists yet.
# This matches the guard used by the other download cells and prevents a re-run
# from overwriting a labeled.csv that is already in place.
url = "https://anaconda.org/BigGorilla/datasets/1/download/labeled.csv"
if not os.path.exists('./data/labeled.csv'):
    # A plain urlretrieve call was replaced by an opener that sends a browser-like
    # User-agent header; install_opener makes urlretrieve use it.
    opener = urllib.request.build_opener()
    opener.addheaders = [['User-agent', 'Mozilla/5.0']]
    urllib.request.install_opener(opener)

    response = urllib.request.urlretrieve(url, './data/labeled.csv')



In [129]:
# Load the labeled pairs and attach the table / key metadata in the same call.
labeled = em.read_csv_metadata(
    './data/labeled.csv',
    ltable=kaggle_data,
    rtable=imdb_data,
    fk_ltable='l_id',
    fk_rtable='r_id',
    key='_id',
)
labeled.head()


Metadata file is not present in the given path; proceeding to read the csv file.


Unnamed: 0.1,Unnamed: 0,_id,l_id,r_id,l_norm_movie_title,l_norm_title_year,l_duration,l_budget,l_content_rating,r_norm_title,r_norm_year,r_length,r_budget,r_mpaa,_sim_score,label
0,4771,4771,2639,235925,eye of the beholder,1999,109.0,15000000,R,eye of the beholder,1999,109.0,35000000,R,0.833333,1
1,11478,11478,2001,600301,rocky balboa,2006,139.0,24000000,PG,rocky balboa,2006,139.0,24000000,PG,1.0,1
2,13630,13630,4160,691766,from russia with love,1963,115.0,2000000,Approved,the aeolians: from russia with love,2012,,20000,,0.666667,0
3,1972,1972,1248,101029,sex tape,2014,94.0,40000000,R,blended,2014,117.0,40000000,PG-13,0.666667,0
4,15903,15903,722,758133,the scorch trials,2015,132.0,61000000,PG-13,the scorch trials,2015,132.0,61000000,PG-13,1.0,1


#### Substep E: Training machine learning algorithms

In [130]:
# The labeled sample is split 50/50 (fixed seed) into a training and a test set.
split = em.split_train_test(labeled, train_proportion=0.5, random_state=0)
train_data, test_data = split['train'], split['test']

# Candidate learning techniques for the prediction task; the matchers that are
# given a random_state receive the same fixed seed.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
lg = em.LogRegMatcher(name='LogReg', random_state=0)
nb = em.NBMatcher(name='NaiveBayes')

Before we can apply any machine learning technique, we need to extract a set of features. Fortunately, the py_entitymatching package can automatically extract a set of features once we specify which columns in the two datasets correspond to each other. The following code snippet starts by specifying the correspondence between the column of the two datasets. Then, it uses the py_entitymatching package to determine the type of each column. By considering the types of columns in each dataset (stored in variables l_attr_types and r_attr_types), and using the tokenizers and similarity functions suggested by the package, we can extract a set of instructions for extracting features. Note that variable F is not the set of extracted features, rather it encodes the instructions for computing the features.

In [131]:
# Start from the correspondence object returned by the package, then replace its
# 'corres' entry with the (kaggle column, imdb column) pairs we want compared.
attr_corres = em.get_attr_corres(kaggle_data, imdb_data)
corresponding_columns = [
    ('norm_movie_title', 'norm_title'),
    ('norm_title_year', 'norm_year'),
    ('content_rating', 'mpaa'),
    ('budget', 'budget'),
]
attr_corres['corres'] = corresponding_columns

# Column types as determined by the package for each table.
l_attr_types = em.get_attr_types(kaggle_data)
r_attr_types = em.get_attr_types(imdb_data)

# Tokenizers and similarity functions suggested by the package for matching.
tok = em.get_tokenizers_for_matching()
sim = em.get_sim_funs_for_matching()

# F holds the instructions for computing features, not the feature values themselves.
F = em.get_features(
    kaggle_data, imdb_data,
    l_attr_types, r_attr_types,
    attr_corres, tok, sim)

Given the set of desired features F, we can now calculate the feature values for our training data and also impute the missing values in our data. In this case, we choose to replace the missing values with the mean of the column.

In [132]:
# Compute the feature vectors for the training pairs, then fill missing feature
# values with the column mean (identifier and label columns are left out).
train_features = em.extract_feature_vecs(
    train_data,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)
train_features = em.impute_table(
    train_features,
    exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
    strategy='mean',
)




Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task

In [133]:
# help(em.select_matcher)

In [134]:
# Compare the six matchers with 5-fold cross-validation on the training features,
# displaying F1.
candidate_matchers = [dt, rf, svm, ln, lg, nb]
result = em.select_matcher(
    candidate_matchers,
    table=train_features,
    exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
    k=5,
    target_attr='label',
    metrics_to_display='f1',   # had to change the argument 'metric' to 'metrics_to_display'
    random_state=0,
)
result['cv_stats']



Unnamed: 0,Matcher,Average f1
0,DecisionTree,0.701211
1,RF,0.688889
2,SVM,0.162198
3,LinReg,0.738423
4,LogReg,0.468584
5,NaiveBayes,0.459578


#### Substep F: Evaluating the quality of our matching

In [135]:
# Fit the matcher selected by cross-validation on the training features, then
# measure it on the held-out test split. The summary printed at the end is what
# backs any claim about precision and recall on the test set.
best_model = result['selected_matcher']
best_model.fit(table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], target_attr='label')

# Same feature extraction and mean imputation as for the training data.
test_features = em.extract_feature_vecs(test_data, feature_table=F, attrs_after='label', show_progress=False)
test_features = em.impute_table(test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean')

# Predict on the test data
predictions = best_model.predict(table=test_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'],
                                 append=True, target_attr='predicted', inplace=False)

# Evaluate the predictions against the manual labels and display the result;
# previously eval_result was computed but never shown.
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)





#### Substep G: Using the trained model to match the datasets

In [136]:
# Apply the trained model to the whole candidate set C: extract features,
# mean-impute missing values, predict, and keep the pairs predicted as matches.
candset_features = em.extract_feature_vecs(C, feature_table=F, show_progress=True)
candset_features = em.impute_table(
    candset_features,
    exclude_attrs=['_id', 'l_id', 'r_id'],
    strategy='mean',
)
predictions = best_model.predict(
    table=candset_features,
    exclude_attrs=['_id', 'l_id', 'r_id'],
    append=True,
    target_attr='predicted',
    inplace=False,
)
is_predicted_match = predictions.predicted == 1
matches = predictions[is_predicted_match]



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:38


In [137]:
# Reduce the predicted matches to their identifier columns, re-attach readable
# attributes from both source tables, and drop the helper 'predicted' column.

from py_entitymatching.catalog import catalog_manager as cm

# Keep the ids plus the prediction flag and renumber the rows from zero.
matches = matches[['_id', 'l_id', 'r_id', 'predicted']].reset_index(drop=True)

# Register the slimmed-down table as a candidate set over the two source tables
# before asking for output attributes.
cm.set_candset_properties(matches, '_id', 'l_id', 'r_id', kaggle_data, imdb_data)
matches = em.add_output_attributes(
    matches,
    l_output_attrs=['norm_movie_title', 'norm_title_year', 'budget', 'content_rating'],
    r_output_attrs=['norm_title', 'norm_year', 'budget', 'mpaa'],
    l_output_prefix='l_',
    r_output_prefix='r_',
    delete_from_catalog=False,
)
matches.drop('predicted', axis=1, inplace=True)
matches.head()


Unnamed: 0,_id,l_id,r_id,l_norm_movie_title,l_norm_title_year,l_budget,l_content_rating,r_norm_title,r_norm_year,r_budget,r_mpaa
0,4,4352,106,#horror,2015,1500000,Not Rated,#horror,2015,1500000,
1,8,2726,450,crocodile dundee ii,1988,15800000,PG,crocodile dundee ii,1988,14000000,
2,11,3406,838,500 days of summer,2009,7500000,PG-13,(500) days of summer,2009,7500000,PG-13
3,25,3631,1872,10 cloverfield lane,2016,15000000,PG-13,10 cloverfield lane,2016,15000000,PG-13
4,26,2965,1881,10 days in a madhouse,2015,12000000,R,10 days in delaware,2015,0,


Question durant le tutoriel:

Faire un simple 'merge' sur 2 tables représentant les mêmes données ne donnera pas le résultat escompté. On rencontrera le problème d'incohérence des données : par exemple, si on fait une jointure sur une colonne de noms en espérant qu'un 'Michel', un 'michel' et un 'MICHEL' (etc.) soient fusionnés, cela ne fonctionnera pas, car ces strings ne sont pas équivalentes. On doit donc commencer par nettoyer les données, par exemple en les uniformisant. Il s'agira de s'assurer que les É et les E, ou les majuscules et les minuscules, ne soient pas distingués, si c'est ce que l'on veut. Il faut aussi s'assurer que les unités de mesure (ex. cm vs pouce) soient les mêmes dans les 2 tables.

RÉSUMÉ:

J'ai appris la différence entre faire un simple merge, utiliser la distance d'édition (edit distance) sur des strings, et utiliser l'apprentissage machine pour apparier des entités.
Il va de soi que l'efficacité est progressive (la méthode la plus efficace étant l'apprentissage machine), comparatifs, statistiques et nombres de lignes résultantes à l'appui.