In [2]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

import re

In [3]:
movies = pd.read_csv('./data/with_wiki_scrape_complete.csv')
movies.drop(columns=['Unnamed: 0'], inplace=True)

In [4]:
movies.head()

Unnamed: 0,tconst,title,primary_title,original_title,release_year,runtime,genres,directors,writers,rating,votes,cast_crew,wiki_title,scraped_data,tomato_score,metacritic_score,mpaa_rating,wiki_scrape
0,tt0111161,The Shawshank Redemption,The Shawshank Redemption,The Shawshank Redemption,1994,142,['Drama'],nm0001104,"nm0000175,nm0001104",9.3,2540263,"['0290358', '0000209', '0000151', '0348409', '...",The Shawshank Redemption (1994 film),"{""Title"":""The Shawshank Redemption"",""Year"":""19...",91%,80/100,R,"{'Plot': 'In 1947 Portland, Maine, banker Andy..."
1,tt0468569,The Dark Knight,The Dark Knight,The Dark Knight,2008,152,"['Action', 'Crime', 'Drama']",nm0634240,"nm0634300,nm0634240,nm0275286,nm0004170",9.0,2490570,"['0746273', '0000288', '0005132', '0001173', '...",The Dark Knight (2008 film),"{""Title"":""The Dark Knight"",""Year"":""2008"",""Rate...",94%,84/100,PG-13,"{'Plot': ""A gang of criminals rob a Gotham Cit..."
2,tt1375666,Inception,Inception,Inception,2010,148,"['Action', 'Adventure', 'Sci-Fi']",nm0634240,nm0634240,8.8,2231336,"['0245596', '0000138', '0330687', '0680983', '...",Inception (2010 film),"{""Title"":""Inception"",""Year"":""2010"",""Rated"":""PG...",87%,74/100,PG-13,"{'Plot': 'Dominick ""Dom"" Cobb and Arthur are ""..."
3,tt0137523,Fight Club,Fight Club,Fight Club,1999,139,['Drama'],nm0000399,"nm0657333,nm0880243",8.8,1999188,"['0513165', '0000093', '0001570', '0001533', '...",Fight Club (1999 film),"{""Title"":""Fight Club"",""Year"":""1999"",""Rated"":""R...",79%,66/100,R,"{'Plot': 'The Narrator, an automobile recall s..."
4,tt0109830,Forrest Gump,Forrest Gump,Forrest Gump,1994,142,"['Drama', 'Romance']",nm0000709,"nm0343165,nm0744839",8.8,1960064,"['0005494', '0000158', '0000705', '0000641', '...",Forrest Gump (1994 film),"{""Title"":""Forrest Gump"",""Year"":""1994"",""Rated"":...",70%,82/100,PG-13,"{'Plot': 'In 1981, at a bus stop in Savannah, ..."


In [3]:
#creating a list of all of the wiki info converted to actual dictionaries
dict_list = []

for i in range(0, 10000):
    
    dict_list.append(ast.literal_eval(movies.iloc[i, 17]))
    
#creating a list of the plot info, filling in "ERROR" if plot not called plot
plot_list = []

for dictionary in dict_list:
    try: 
        plot_list.append(dictionary['Plot'])
    except:
        try: plot_list.append(dictionary['Summary'])
        except:
            try:
                plot_list.append(dictionary['Synopsis'])
            except:
                plot_list.append('ERROR')
        
movies['plot'] = plot_list

In [4]:
movies[movies['plot'] == "ERROR"].shape

(515, 19)

we have 515 movies that we were unable to scrape a plot for. Luckily we have some back up short plot summaries that we can use. 

In [5]:
#selecting just the plot synopsis from our "scraped_data" column
movies['backup_plot'] = movies['scraped_data'].apply(
    lambda x: x.split("\"Plot\":")[1].split('\"Language\":')[0][:-2])

In [6]:
movies.columns

Index(['tconst', 'title', 'primary_title', 'original_title', 'release_year',
       'runtime', 'genres', 'directors', 'writers', 'rating', 'votes',
       'cast_crew', 'wiki_title', 'scraped_data', 'tomato_score',
       'metacritic_score', 'mpaa_rating', 'wiki_scrape', 'plot',
       'backup_plot'],
      dtype='object')

In [7]:
#replacing the movies with missing plots with the backup plots
#grouping the movies that don't have "ERROR" as their plot
movies_long_plot = movies[movies['plot'] != 'ERROR']
#excluding the blank plots
movies_long_plot = movies_long_plot[movies_long_plot['plot'] != '']
#selecting error plots
movies_plot_error = movies[movies['plot'] == 'ERROR']
movies_plot_error['plot'] = movies_plot_error['backup_plot']
#selecting blank plots
movies_no_plot = movies[movies['plot'] == '']
movies_no_plot['plot'] = movies_no_plot['backup_plot']
#putting it all back together
movies = pd.concat([movies_long_plot, movies_plot_error, movies_no_plot])
movies.sort_values('votes', ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_plot_error['plot'] = movies_plot_error['backup_plot']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_no_plot['plot'] = movies_no_plot['backup_plot']


In [8]:
movies.columns

Index(['tconst', 'title', 'primary_title', 'original_title', 'release_year',
       'runtime', 'genres', 'directors', 'writers', 'rating', 'votes',
       'cast_crew', 'wiki_title', 'scraped_data', 'tomato_score',
       'metacritic_score', 'mpaa_rating', 'wiki_scrape', 'plot',
       'backup_plot'],
      dtype='object')

In [9]:
movies.shape

(10000, 20)

In [10]:
#making the "genres" column back into a list
movies['genres'] = movies['genres'].apply(lambda x: x.replace(' ', '')\
    .replace('\'', '').replace('[', '').replace(']','').split(','))

In [11]:
#separating the genres into individual columns
genre_1 = []
genre_2 = []
genre_3 = []

for genre in movies['genres']:
    genre_1.append(genre[0])

for genre in movies['genres']:
    try:
        genre_2.append(genre[1])
    except:
        genre_2.append(np.nan)

for genre in movies['genres']:
    try:
        genre_3.append(genre[2])
    except:
        genre_3.append(np.nan)
#adding the lists as columns        
movies['genre_1'] = genre_1
movies['genre_2'] = genre_2
movies['genre_3'] = genre_3


In [12]:
#making the directors column into a list
movies['directors'] = movies['directors'].apply(lambda x: x.split(','))

In [13]:
# separating directors into distinct columns
director_1 = []
director_2 = []
director_3 = []

for director in movies['directors']:
    director_1.append(director[0])

for director in movies['directors']:
    try:
        director_2.append(director[1])
    except:
        director_2.append(np.nan)

for director in movies['directors']:
    try:
        director_3.append(director[2])
    except:
        director_3.append(np.nan)
#adding the lists as columns        
movies['director_1'] = genre_1
movies['director_2'] = genre_2
movies['director_3'] = genre_3


In [14]:
# making the writers column into a list
movies['writers'] = movies['writers'].apply(lambda x: x.split(','))

In [15]:
# separating writers into distinct columns
writer_1 = []
writer_2 = []
writer_3 = []

for writer in movies['writers']:
    writer_1.append(writer[0])

for writer in movies['writers']:
    try:
        writer_2.append(writer[1])
    except:
        writer_2.append(np.nan)

for writer in movies['writers']:
    try:
        writer_3.append(writer[2])
    except:
        writer_3.append(np.nan)
#adding the lists as columns        
movies['writer_1'] = writer_1
movies['writer_2'] = writer_2
movies['writer_3'] = writer_3

In [16]:
# making the cast/crew into a list
movies['cast_crew'] = movies['cast_crew'].apply(lambda x: x.replace(' ', '')\
    .replace('\'', '').replace('[', '').replace(']','').split(','))

In [17]:
movies['cast_crew'].head()

0    [0290358, 0000209, 0000151, 0348409, 0006669, ...
1    [0746273, 0000288, 0005132, 0001173, 0000323, ...
2    [0245596, 0000138, 0330687, 0680983, 0913822, ...
3    [0513165, 0000093, 0001570, 0001533, 0340260, ...
4    [0005494, 0000158, 0000705, 0000641, 0000398, ...
Name: cast_crew, dtype: object

In [18]:
# separating cast_crew into distinct columns using a function

def movie_people(position):
    cast_list = []
    for person_list in movies['cast_crew']:
        try:
            cast_list.append(person_list[position])
        except:
            cast_list.append(np.nan)
    return cast_list
        
# calling the function to create the columns        
movies['cast_1'] = movie_people(0)
movies['cast_2'] = movie_people(1)
movies['cast_3'] = movie_people(2)
movies['cast_4'] = movie_people(3)
movies['cast_5'] = movie_people(4)
movies['cast_6'] = movie_people(5)
movies['cast_7'] = movie_people(6)
movies['cast_8'] = movie_people(7)
movies['cast_9'] = movie_people(8)
movies['cast_10'] = movie_people(9)

In [19]:
#creating a decade tag 
movies['decade'] = movies['release_year'].apply(lambda x: round((x-4), -1))

In [20]:
movies.head()

Unnamed: 0,tconst,title,primary_title,original_title,release_year,runtime,genres,directors,writers,rating,...,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7,cast_8,cast_9,cast_10,decade
0,tt0111161,The Shawshank Redemption,The Shawshank Redemption,The Shawshank Redemption,1994,142,[Drama],[nm0001104],"[nm0000175, nm0001104]",9.3,...,209,151,348409,6669,1104,175,555550,2353,5683,1990
1,tt0468569,The Dark Knight,The Dark Knight,The Dark Knight,2008,152,"[Action, Crime, Drama]",[nm0634240],"[nm0634300, nm0634240, nm0275286, nm0004170]",9.0,...,288,5132,1173,323,634240,634300,275286,4170,650038,2000
2,tt1375666,Inception,Inception,Inception,2010,148,"[Action, Adventure, Sci-Fi]",[nm0634240],[nm0634240],8.8,...,138,330687,680983,913822,634240,858799,1877,2892,809059,2010
3,tt0137523,Fight Club,Fight Club,Fight Club,1999,139,[Drama],[nm0000399],"[nm0657333, nm0880243]",8.8,...,93,1570,1533,340260,399,657333,880243,68501,149556,2000
4,tt0109830,Forrest Gump,Forrest Gump,Forrest Gump,1994,142,"[Drama, Romance]",[nm0000709],"[nm0343165, nm0744839]",8.8,...,158,705,641,398,709,343165,744839,277704,823330,1990


In [21]:
movies.columns

Index(['tconst', 'title', 'primary_title', 'original_title', 'release_year',
       'runtime', 'genres', 'directors', 'writers', 'rating', 'votes',
       'cast_crew', 'wiki_title', 'scraped_data', 'tomato_score',
       'metacritic_score', 'mpaa_rating', 'wiki_scrape', 'plot', 'backup_plot',
       'genre_1', 'genre_2', 'genre_3', 'director_1', 'director_2',
       'director_3', 'writer_1', 'writer_2', 'writer_3', 'cast_1', 'cast_2',
       'cast_3', 'cast_4', 'cast_5', 'cast_6', 'cast_7', 'cast_8', 'cast_9',
       'cast_10', 'decade'],
      dtype='object')

In [22]:
movies.describe()

Unnamed: 0,release_year,runtime,rating,votes,decade
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2001.7521,109.6716,6.63504,83388.13,1997.725
std,16.558975,21.005134,1.021927,155199.2,16.90275
min,1941.0,62.0,1.0,8749.0,1940.0
25%,1994.0,95.0,6.1,15091.75,1990.0
50%,2006.0,106.0,6.7,30566.5,2000.0
75%,2014.0,119.0,7.4,81224.5,2010.0
max,2022.0,237.0,9.3,2540263.0,2020.0


In [35]:
#dropping columns that we will not be using further
movies.drop(columns = [
    'wiki_scrape',
    'backup_plot',
    'scraped_data',
    'title',
    'original_title',
    'wiki_title',
    'directors',
    'writers',
    'genres',
    'cast_crew',
    'decade'
], inplace=True)

In [68]:
#sentiment analysis

#removing unnecessary characters
movies['plot'] = movies['plot'].apply(
    lambda x: str(x.replace('\n', '').replace("\\", '')))

movies['plot'] = [re.sub('[\(\[].*?[\)\]]', '', t.replace('\n', '')) for t in movies['plot']]

movies['plot'] = [t.replace('/\\', ' ').replace(':-)', ' ').replace('tdb> ', ' ') 
              for t in movies['plot']]

movies['plot'] = [t.replace('=', ' ').replace('--', ' ').replace('_', ' ')\
              .replace('}', ' ').replace('*', ' ').replace('^', ' ')\
              .replace('~', ' ')
              for t in movies['plot']]

movies['plot'] = [t.replace('- < > -', ' ').replace('|', ' ').replace('*-', ' ') 
              for t in movies['plot']]

movies['plot'] = [t.replace('...', ' ').replace('\t', ' ').lower().strip() 
              for t in movies['plot']]



In [69]:
temp = movies[:50]

In [72]:
#creating a column that indicates the polarity of the synopsis, hopefully to
#reveal if a movie is a feel good movie or not

#setting up sentiment analysis and creating column

nlp = spacy.load('en_core_web_sm')

nlp.add_pipe('spacytextblob')

movies['plot_polarity'] = movies['plot'].apply(lambda x: nlp(x)._.polarity)  

In [73]:
movies.to_csv('./data/data_for_eda.csv')