In [None]:
"""
Combine all data from:
1. movie_data_adaptation
2. director_data
3. book_data
4. found_book
5. author_book_data
6. book_history_data

functions used:
director_value(movie) from director.py

Output to pickle: all_data

"""

In [48]:
import pandas as pd
import numpy as np
import pickle

import sys
sys.path.append('/Users/katiehuang/Documents/metis/projects/onl_ds5_project_2/py')
from director import *

## 1. Merge movie data and (book_data, found_book)

In [7]:
# Load in movie_data and book_data pickle files
movie_df = pd.read_pickle('../data/movie_data_adaptation')
book_df = pd.read_pickle('../data/book_data')
found_book_df = pd.read_pickle('../data/found_book')

In [11]:
# Combine book_df and found_book_df to book_all_df
book_all_df = pd.concat([book_df,found_book_df],axis=0).\
drop_duplicates(subset=['book_title','author'])

In [12]:
book_all_df

Unnamed: 0,book_title,author,rating_value,rating_count,review_count,page,year
0,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.48,7292896.0,115556.0,309.0,1997.0
1,Harry Potter and the Chamber of Secrets,J.K. Rowling,4.43,2821010.0,54542.0,341.0,1998.0
2,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,4.57,2908354.0,57262.0,435.0,1999.0
3,Harry Potter and the Goblet of Fire,J.K. Rowling,4.56,2676621.0,47961.0,734.0,2000.0
4,Harry Potter and the Order of the Phoenix,J.K. Rowling,4.50,2585450.0,43737.0,870.0,2003.0
...,...,...,...,...,...,...,...
1392,The Hanging Tree - Sheet Music: (from The Hung...,Hal Leonard Corporation,4.14,7.0,2.0,,2015.0
1393,Fifty Shades Darker,E.L. James,3.84,761148.0,29215.0,532.0,2011.0
1394,Bless the Child,Cathy Cash Spellman,3.80,607.0,54.0,608.0,1993.0
1395,Playing for Keeps,R.L. Mathewson,4.05,106251.0,5587.0,330.0,2011.0


In [10]:
movie_df.

((1800, 7), (1397, 7))

In [28]:
# Merge movie_df and book_all_df
mb_df = pd.merge(movie_df,book_all_df,\
                      left_on="movie_title",right_on="book_title",how='inner')
mb_df.shape

(1169, 29)

## 2. Merge with google search (popularity)

In [22]:
import pandas as pd
# Load pickle file
book_history = pd.read_pickle('../data/book_history_2_data')
book_history.shape

(1166, 9)

In [25]:
book_popularity_df = \
book_history[['title','release_date','book_popularity','author_popularity']].dropna()

In [27]:
book_popularity_df.head(3)

Unnamed: 0,title,release_date,book_popularity,author_popularity
0,The Godfather,1972-03-24,0.26,0.0567
1,Harry Potter and the Sorcerer's Stone,2001-11-16,0.18,0.0121
2,Harry Potter and the Sorcerer's Stone,2001-11-16,0.18,0.0121


In [31]:
# Combine movie,book,popularity (mbp)
mbp_df = pd.merge(mb_df,book_popularity_df,left_on='book_title',\
                 right_on='title',how='inner')
mbp_df.shape

(1602, 33)

In [147]:
# Save the data to pickle file
all_df.to_pickle('../dump/all_correctRT_data')

## 3. Merge with author_book_data
(How many books the author had published before the movie release year)

In [63]:
# Load author_book_data
author_book_df = pd.read_pickle('../data/author_book_data')
mbp_df.shape,author_book_df.shape

((1602, 33), (979, 2))

In [64]:
# Merge movie,book,popularity,author_boook (mbp_ab)
mpb_ab_df = pd.merge(mbp_df,author_book_df,on='author',how='left')
mpb_ab_df.shape

(1785, 34)

In [65]:
mpb_ab_df['release_year'] = mpb_ab_df['release_date_x'].dt.year

In [66]:
# count publications of author before the movie release
def count_pub(movie):
    
    author = movie.author
    movie_release_year = movie.release_year
    
    author_info = author_book_df[author_book_df.author == author]
    
    years = author_info.iloc[0].years
    prior = [x for x in years if x < movie_release_year]
    count = len(prior)
    
    return count

In [67]:
mpb_ab_df['count_a'] = mpb_ab_df.apply(count_pub,axis=1)
mpb_ab_df.head(3)

Unnamed: 0,movie_title,rating,vote,certificate,genre,release_date_x,metascore,keywords,budget,opening_weekend_usa,...,review_count,page,year,title,release_date_y,book_popularity,author_popularity,years,release_year,count_a
0,The Godfather,9.2,1616717,R,"[Crime, Drama]",1972-03-24,100.0,"[crime family, mafia, patriarch, organized cri...",6000000.0,302393.0,...,8930.0,448.0,1969.0,The Godfather,1972-03-24,0.26,0.0567,"[2002, 2000, 1997, 1996, 1996, 1991, 1990, 198...",1972,1
1,Harry Potter and the Sorcerer's Stone,7.6,656400,PG,"[Adventure, Family, Fantasy]",2001-11-16,64.0,"[magic, wizard, school of magic, based on nove...",125000000.0,90294621.0,...,115556.0,309.0,1997.0,Harry Potter and the Sorcerer's Stone,2001-11-16,0.18,0.0121,"[2020, 2018, 2016, 2016, 2016, 2016, 2012, 200...",2001,4
2,Harry Potter and the Sorcerer's Stone,7.6,656400,PG,"[Adventure, Family, Fantasy]",2001-11-16,64.0,"[magic, wizard, school of magic, based on nove...",125000000.0,90294621.0,...,115556.0,309.0,1997.0,Harry Potter and the Sorcerer's Stone,2001-11-16,0.18,0.0121,"[2020, 2018, 2016, 2016, 2016, 2016, 2012, 200...",2001,4


In [68]:
# Clean up columns
mpb_ab_df.drop(columns=['link_d','link_w','link_s','book_title','release_date_y','years'],inplace=True)
mpb_ab_df.rename(columns = {'release_date_x':'release_date',
                           'year':'publish_year'},inplace=True)
mpb_ab_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1785 entries, 0 to 1784
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   movie_title          1785 non-null   object        
 1   rating               1785 non-null   float64       
 2   vote                 1785 non-null   int64         
 3   certificate          1785 non-null   object        
 4   genre                1785 non-null   object        
 5   release_date         1785 non-null   datetime64[ns]
 6   metascore            779 non-null    float64       
 7   keywords             1785 non-null   object        
 8   budget               1631 non-null   float64       
 9   opening_weekend_usa  1143 non-null   float64       
 10  gross_usa            1307 non-null   float64       
 11  gross_world          1373 non-null   float64       
 12  runtime              1774 non-null   float64       
 13  distributor          1777 non-nul

## 4. Merge with director value
(How many movies directed before the movie release, average rating, average gross of those movies)

In [70]:
# Find corresponding value of each director when movie is released
# ['avg_rating','avg_gross'] (per movie directed)
director_df = pd.read_pickle('../data/director_data')

In [72]:
# Create df including columns of director film_count, avg_rating, avg_gross
# Rows correspond to rows in all_df
movie_director_df = mpb_ab_df.apply(director_value,axis=1).apply(pd.Series)
movie_director_df.head(3)

Unnamed: 0,movie_title,director,film_count_d,avg_rating_d,avg_gross_d
0,The Godfather,Francis Ford Coppola,7,5.285714,32634196
1,Harry Potter and the Sorcerer's Stone,Chris Columbus,9,6.622222,106588545
2,Harry Potter and the Sorcerer's Stone,Chris Columbus,9,6.622222,106588545


In [73]:
# Merge mpb_ab_df and director value
all_df = pd.merge(mpb_ab_df, movie_director_df, on=['movie_title','director'] ,how='left')

In [74]:
# Save the data to pickle file
all_df.to_pickle('../data/all_data')

### Practice on fuzzywuzzy package

In [9]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
Str1 = "Los Angeles Lakers"
Str2 = "Lakers"
Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
print(Ratio)
print(Partial_Ratio)

50
100


In [21]:
movie_title_list_s=['After We Collided',
 'The Godfather',
 "Harry Potter and the Sorcerer's Stone",
 'Unknown',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Beastmaster',
 'Little Women']

In [22]:
book_title_list_s = ['The Running Man',
 'Left Behind',"Harry Potter and the Sorcerer's Stone",
 'Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Order of the Phoenix',
 'Twilight']

In [23]:
movie_title_list_s[4]
Ratios = process.extract(movie_title_list_s[4],book_title_list_s)
match = process.extractOne(movie_title_list_s[4],book_title_list_s)
if match[1] > 90:
    print(match[0])
else:
    print('no match')

no match


In [27]:
match=[]
for movie in movie_title_list:
    potential = process.extractOne(movie,book_title_list)
    if potential[1] > 90:
        match.append(potential[0])
    else:
        match.append(np.nan)

In [30]:
c = 0
for x in match:
    if x is np.nan:
        c += 1
c

1367

In [29]:
1800-c

433