## Movie data matching
- dataset : IMDB, Rotten Tomato

In [1]:
import os
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

### 1. Load data 
- IMDB
- Rotten Tomato

#### 1-1. IMDB

In [2]:
path = './../../data/'

In [3]:
os.listdir(path)

['imdb_30.csv',
 'imdb_30.xlsx',
 'imdb_40(수정).xlsx',
 'imdb_40.csv',
 'imdb_40.xlsx',
 'imdb_50.csv',
 'imdb_top_1000.csv',
 'ml-25m',
 'ml-25m.zip',
 'movieLens',
 'new_imdb_40.xlsx',
 'old_imdb_40.xlsx',
 'rotten_tomato']

In [4]:
imdb_1000 = pd.read_csv(os.path.join(path + 'imdb_top_1000.csv'), encoding='utf-8')
print(imdb_1000.shape)

(1000, 16)


In [5]:
len(imdb_1000.columns)

16

In [6]:
imdb_1000.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [7]:
imdb_df = imdb_1000.copy()

- IMDB Top 40

In [8]:
imdb_40 = imdb_df.iloc[0:40,:]

In [9]:
imdb_40.reset_index(inplace=True)

In [10]:
imdb_40['Overview'][0]

'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'

#### 1-2. Rotten Tomato

In [11]:
path = './../../data/rotten_tomato/'

In [12]:
os.listdir(path)

['rotten_tomatoes_critic_reviews.csv', 'rotten_tomatoes_movies.csv']

In [13]:
rotten_df = pd.read_csv(os.path.join(path + 'rotten_tomatoes_movies.csv'), encoding='utf-8')
print(rotten_df.shape)

(17712, 22)


In [14]:
rotten_df.head(1)

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76


In [15]:
len(rotten_df.columns)

22

### 2. Comparison between Rotten and IMDb

In [18]:
rotten_set = set(rotten_df.movie_title) 
print(len(rotten_set))

17106


In [20]:
imdb_set = set(imdb_40.Series_Title)
print(len(imdb_set))

40


In [24]:
difference_movies = imdb_set - rotten_set
print(len(difference_movies))

13


In [22]:
intersection_movies = imdb_set.intersection(rotten_set)
print(len(intersection_movies))

27


In [23]:
intersection_movies

{'12 Angry Men',
 'Fight Club',
 'Forrest Gump',
 'Gladiator',
 'Inception',
 'Interstellar',
 "It's a Wonderful Life",
 'Joker',
 "One Flew Over the Cuckoo's Nest",
 'Pulp Fiction',
 'Saving Private Ryan',
 "Schindler's List",
 'Star Wars: Episode V - The Empire Strikes Back',
 'The Dark Knight',
 'The Departed',
 'The Godfather',
 'The Green Mile',
 'The Intouchables',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Lord of the Rings: The Return of the King',
 'The Lord of the Rings: The Two Towers',
 'The Matrix',
 'The Pianist',
 'The Prestige',
 'The Shawshank Redemption',
 'The Silence of the Lambs',
 'Whiplash'}

In [25]:
difference_movies

{'Cidade de Deus',
 'Gisaengchung',
 'Goodfellas',
 'Hamilton',
 'Il buono, il brutto, il cattivo',
 'La vita è bella',
 'Se7en',
 'Sen to Chihiro no kamikakushi',
 'Seppuku',
 'Shichinin no samurai',
 'Soorarai Pottru',
 'Star Wars',
 'The Godfather: Part II'}

- 동일한 제목의 imdb 영화

In [26]:
imdb_40[imdb_40['Series_Title'].isin(intersection_movies)]

Unnamed: 0,index,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
4,4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0
5,5,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905.0
6,6,https://m.media-amazon.com/images/M/MV5BNGNhMD...,Pulp Fiction,1994,A,154 min,"Crime, Drama",8.9,"The lives of two mob hitmen, a boxer, a gangst...",94.0,Quentin Tarantino,John Travolta,Uma Thurman,Samuel L. Jackson,Bruce Willis,1826188,107928762.0
7,7,https://m.media-amazon.com/images/M/MV5BNDE4OT...,Schindler's List,1993,A,195 min,"Biography, Drama, History",8.9,"In German-occupied Poland during World War II,...",94.0,Steven Spielberg,Liam Neeson,Ralph Fiennes,Ben Kingsley,Caroline Goodall,1213505,96898818.0
8,8,https://m.media-amazon.com/images/M/MV5BMjAxMz...,Inception,2010,UA,148 min,"Action, Adventure, Sci-Fi",8.8,A thief who steals corporate secrets through t...,74.0,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2067042,292576195.0
9,9,https://m.media-amazon.com/images/M/MV5BMmEzNT...,Fight Club,1999,A,139 min,Drama,8.8,An insomniac office worker and a devil-may-car...,66.0,David Fincher,Brad Pitt,Edward Norton,Meat Loaf,Zach Grenier,1854740,37030102.0
10,10,https://m.media-amazon.com/images/M/MV5BN2EyZj...,The Lord of the Rings: The Fellowship of the Ring,2001,U,178 min,"Action, Adventure, Drama",8.8,A meek Hobbit from the Shire and eight compani...,92.0,Peter Jackson,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean,1661481,315544750.0


In [33]:
rotten_df[rotten_df['movie_title'].isin(intersection_movies)][['movie_title','original_release_date']
]

Unnamed: 0,movie_title,original_release_date
911,Gladiator,
1171,12 Angry Men,1997-08-17
5401,The Departed,2006-10-06
5993,Star Wars: Episode V - The Empire Strikes Back,1980-06-20
6410,Fight Club,1999-10-15
6645,Forrest Gump,1994-07-06
7052,Gladiator,2000-05-05
7091,The Godfather,1972-03-24
7298,The Green Mile,1999-12-10
8324,Inception,2010-07-16


#### 2-3. Analyze the difference

- Title matching에서 missing된 영화

In [34]:
difference_movies

{'Cidade de Deus',
 'Gisaengchung',
 'Goodfellas',
 'Hamilton',
 'Il buono, il brutto, il cattivo',
 'La vita è bella',
 'Se7en',
 'Sen to Chihiro no kamikakushi',
 'Seppuku',
 'Shichinin no samurai',
 'Soorarai Pottru',
 'Star Wars',
 'The Godfather: Part II'}

In [35]:
rotten_df['movie_title'] = rotten_df['movie_title'].str.lower()
imdb_40['Series_Title'] = imdb_40['Series_Title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [58]:
count = 0
for num in range(0, len(rotten_df)): 
#     print(num)
    if('Shichinin no samurai'.lower() in rotten_df['movie_title'][num]):
        count = count + 1
        print(f"{num} | {rotten_df['movie_title'].loc[num]} | {rotten_df['original_release_date'].loc[num]}  | {rotten_df['streaming_release_date'].loc[num]}")
print(count)

13084 | seven samurai (shichinin no samurai) | 1956-11-19  | 2016-08-10
1


### 3. Right Answer

In [62]:
path = './../../data/'

In [63]:
right_df = pd.read_excel(os.path.join(path + 'imdb_40(rotten비교).xlsx'))

In [64]:
right_df.head()

Unnamed: 0,title,imdb_id,rotten_id,year
0,The Shawshank Redemption,0,13183.0,1994
1,The Godfather,1,7052.0,1972
2,The Dark Knight,2,14788.0,2008
3,The Godfather: Part II,3,7092.0,1974
4,12 Angry Men,4,1171.0,1957


In [65]:
right_df.dropna(axis=0, inplace=True)

In [66]:
len(right_df)

35

In [67]:
right_df = right_df.astype({'rotten_id': 'int64'})

In [68]:
right_df.dtypes

title        object
imdb_id       int64
rotten_id     int64
year          int64
dtype: object

### 4. Similarity comparison using features

In [69]:
imdb_40.columns

Index(['index', 'Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [70]:
rotten_df.columns

Index(['rotten_tomatoes_link', 'movie_title', 'movie_info',
       'critics_consensus', 'content_rating', 'genres', 'directors', 'authors',
       'actors', 'original_release_date', 'streaming_release_date', 'runtime',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count'],
      dtype='object')

#### 4-1. IMDB 40

In [72]:
common_imdb = imdb_40.copy()

In [75]:
common_imdb.drop(['Poster_Link','IMDB_Rating','Meta_score','No_of_Votes','Gross'], axis=1, inplace=True)

In [76]:
common_imdb.columns

Index(['index', 'Series_Title', 'Released_Year', 'Certificate', 'Runtime',
       'Genre', 'Overview', 'Director', 'Star1', 'Star2', 'Star3', 'Star4'],
      dtype='object')

In [77]:
common_imdb.columns = ['index', 'title', 'year', 'certificate', 'runtime', 'genre', 'overview', 'director', 'Star1', 'Star2', 'Star3', 'Star4']

In [78]:
common_imdb.columns

Index(['index', 'title', 'year', 'certificate', 'runtime', 'genre', 'overview',
       'director', 'Star1', 'Star2', 'Star3', 'Star4'],
      dtype='object')

In [79]:
cols = ['Star1', 'Star2', 'Star3', 'Star4']
common_imdb['actor'] = common_imdb[cols].apply(lambda row: ','.join(row.values.astype(str)), axis=1)

In [82]:
common_imdb.drop(['Star1', 'Star2', 'Star3', 'Star4'], axis=1, inplace=True)

In [83]:
common_imdb.columns

Index(['index', 'title', 'year', 'certificate', 'runtime', 'genre', 'overview',
       'director', 'actor'],
      dtype='object')

In [84]:
common_imdb.head()

Unnamed: 0,index,title,year,certificate,runtime,genre,overview,director,actor
0,0,the shawshank redemption,1994,A,142 min,Drama,Two imprisoned men bond over a number of years...,Frank Darabont,"Tim Robbins,Morgan Freeman,Bob Gunton,William ..."
1,1,the godfather,1972,A,175 min,"Crime, Drama",An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,"Marlon Brando,Al Pacino,James Caan,Diane Keaton"
2,2,the dark knight,2008,UA,152 min,"Action, Crime, Drama",When the menace known as the Joker wreaks havo...,Christopher Nolan,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich..."
3,3,the godfather: part ii,1974,A,202 min,"Crime, Drama",The early life and career of Vito Corleone in ...,Francis Ford Coppola,"Al Pacino,Robert De Niro,Robert Duvall,Diane K..."
4,4,12 angry men,1957,U,96 min,"Crime, Drama",A jury holdout attempts to prevent a miscarria...,Sidney Lumet,"Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie..."


#### 4-2. Rotten Tomato

In [100]:
common_rotten = rotten_df.copy()

In [105]:
common_rotten.reset_index(inplace=True)

In [106]:
common_rotten.columns

Index(['index', 'rotten_tomatoes_link', 'movie_title', 'movie_info',
       'critics_consensus', 'content_rating', 'genres', 'directors', 'authors',
       'actors', 'original_release_date', 'streaming_release_date', 'runtime',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count'],
      dtype='object')

In [107]:
drop_cols = ['rotten_tomatoes_link', 'critics_consensus','authors', 'streaming_release_date', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'tomatometer_count', 'audience_status', 'audience_rating', 'audience_count', 'tomatometer_top_critics_count','tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
common_rotten.drop(drop_cols, axis=1, inplace=True)

In [108]:
common_rotten.columns

Index(['index', 'movie_title', 'movie_info', 'content_rating', 'genres',
       'directors', 'actors', 'original_release_date', 'runtime'],
      dtype='object')

In [110]:
common_rotten.columns = ['index', 'title', 'overview', 'certificate', 'genre', 'director', 'actor', 'date', 'runtime']

In [111]:
common_rotten.columns

Index(['index', 'title', 'overview', 'certificate', 'genre', 'director',
       'actor', 'date', 'runtime'],
      dtype='object')

In [112]:
# 문자열을 Datetime으로 변환하기
common_rotten['date'] = pd.to_datetime(common_rotten['date'],format='%Y-%m-%d')

In [113]:
common_rotten['year'] = common_rotten['date'].dt.year

In [114]:
common_rotten.columns

Index(['index', 'title', 'overview', 'certificate', 'genre', 'director',
       'actor', 'date', 'runtime', 'year'],
      dtype='object')

In [115]:
common_rotten.drop(['date'], axis=1, inplace=True)

In [116]:
common_rotten.columns

Index(['index', 'title', 'overview', 'certificate', 'genre', 'director',
       'actor', 'runtime', 'year'],
      dtype='object')

In [119]:
common_rotten.head()

Unnamed: 0,index,title,overview,certificate,genre,director,actor,runtime,year
0,0,percy jackson & the olympians: the lightning t...,"Always trouble-prone, the life of teenager Per...",PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",119.0,2010.0
1,1,please give,Kate (Catherine Keener) and her husband Alex (...,R,Comedy,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",90.0,2010.0
2,2,10,"A successful, middle-aged Hollywood songwriter...",R,"Comedy, Romance",Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",122.0,1979.0
3,3,12 angry men (twelve angry men),Following the closing arguments in a murder tr...,NR,"Classics, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",95.0,1957.0
4,4,"20,000 leagues under the sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",127.0,1954.0
