## Libraries

In [1]:
# Matplotlib forms basis for visualization in Python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import plotly.express as px

import seaborn as sns

sns.set()

# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import requests
from bs4 import BeautifulSoup
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
#!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Parsing description from imdb website

dont touch this code please, you will need to wait apr. 20 hours or reload you laptop!!!

In [4]:
#cin = pd.read_csv('cinemas.csv', index_col='Unnamed: 0')

#def scrape_movie_description(link):
#    try:
#        driver = webdriver.Chrome(executable_path="chromedriver.exe")
#        driver.get(link)

#        forms = driver.find_element(By.XPATH, "//span[@class='sc-5f699a2-2 cxqNYC']")
#        text = forms.get_attribute('innerText')

#        driver.quit()
#        return text
#    except:
#        return 'NA'


#cin['movie_description'] = cin['movie_imdb_link'].apply(scrape_movie_description)

#cin.to_csv("cin_with_description.csv", index=False)

## Take a first look at the data

In [5]:
cin = pd.read_csv('cin_with_description.csv')

In [6]:
cin.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,Unnamed: 28,war_symb_title,point_symb_title,movie_description
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,237000000.0,2009.0,936.0,7.9,1.78,33000,,4,0,A paraplegic Marine dispatched to the moon Pan...
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,300000000.0,2007.0,5000.0,7.1,2.35,0,,8,0,"Captain Barbossa, Will Turner and Elizabeth Sw..."
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,245000000.0,2015.0,393.0,6.8,2.35,85000,,1,0,A cryptic message from James Bond's past sends...
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,250000000.0,2012.0,23000.0,8.5,2.35,164000,,3,0,"Eight years after the Joker's reign of chaos, ..."
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,12.0,7.1,,0,,9,0,


In [7]:
cat_vars = ['color', 'genres', 'language', 'country', 'content_rating']

for var in cat_vars:
    print("Summary statistics for", var)
    print(cin[var].value_counts(normalize=True))

Summary statistics for color
Color               0.958259
Black and White     0.041542
Green and Yellow    0.000199
Name: color, dtype: float64
Summary statistics for genres
Drama                                            0.046733
Comedy                                           0.042376
Comedy|Drama                                     0.037822
Comedy|Drama|Romance                             0.037030
Comedy|Romance                                   0.031287
                                                   ...   
Fantasy|Comedy|Sci-Fi                            0.000198
Biography|Comedy|Drama|Music|Romance             0.000198
Biography|Crime|Drama|History|Music              0.000198
Biography|Comedy|Crime|Drama|Romance|Thriller    0.000198
Comedy|Crime|Horror                              0.000198
Name: genres, Length: 916, dtype: float64
Summary statistics for language
English       0.934696
French        0.014490
Spanish       0.007940
Hindi         0.005558
Mandarin      0.005161

In [8]:
cin.count()

color                        5031
director_name                4946
num_critic_for_reviews       5000
duration                     5035
director_facebook_likes      4946
actor_3_facebook_likes       5027
actor_2_name                 5037
actor_1_facebook_likes       5043
gross                        4161
genres                       5050
actor_1_name                 5043
movie_title                  5050
num_voted_users              5050
cast_total_facebook_likes    5050
actor_3_name                 5027
facenumber_in_poster         5038
plot_keywords                4896
movie_imdb_link              5050
num_user_for_reviews         5029
language                     5038
country                      5045
content_rating               4742
budget                       4557
title_year                   4942
actor_2_facebook_likes       5037
imdb_score                   5050
aspect_ratio                 4721
movie_facebook_likes         5050
Unnamed: 28                     2
war_symb_title

# Data Cleaning

## Drop empty columns and not valid/duplicated rows

In [9]:
cin = cin.drop(['Unnamed: 28'], axis=1)

In [10]:
cin = cin.drop(cin[cin['actor_1_facebook_likes'] < 0].index)

In [11]:
#also we should drop duplicates
cin = cin.drop_duplicates(subset=cin.columns.difference(['movie_title']))

In [12]:
#we should drop raw with USA content rating
cin = cin.drop(cin[cin['content_rating'] == 'USA'].index)

In [13]:
#we should drop raws with movies that not exists
cin = cin.drop(cin[cin['movie_title'] == 'A!O!U!I!E!'].index)

In [14]:
#we should drop raws where imdb_score>10 (Sanctuary, for example with 98,0)
cin = cin.drop(cin[cin['imdb_score'] > 10].index)

In [15]:
#let's drop all rows where language is numerical value etc
cin = cin[pd.to_numeric(cin['language'], errors='coerce').isna()]

## How many missing data points do we have?

In [16]:
# get the number of missing data points per column
missing_values_count = cin.isnull().sum()

# look at the # of missing points in the first ten columns
missing_values_count[0:]

color                         19
director_name                103
num_critic_for_reviews        49
duration                      15
director_facebook_likes      103
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        873
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          12
plot_keywords                152
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               300
budget                       487
title_year                   107
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 327
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

In [17]:
total_cells = np.product(cin.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

1.725551459908203


In [18]:
cin.shape[0]

4997

## let's drop lines where too much missing values

drop rows where > 15% missing values

In [19]:
cin = cin.drop(cin[(cin.isnull().sum(axis = 1)/cin.shape[1])*100 > 15].index)

In [20]:
cin.shape[0]

4854

## let's drop lines where gross is missing, since it's our target value

In [22]:
cin.dropna(subset=['gross'], inplace=True)

## Content rating standardization

In [23]:
cin['content_rating'] = cin['content_rating'].replace('Unrated', 'Not Rated')

In [24]:
cin['content_rating'] = cin['content_rating'].replace('TV-14', 'PG-13')

In [25]:
cin['content_rating'] = cin['content_rating'].replace('TV-G', 'G')

In [26]:
cin['content_rating'] = cin['content_rating'].replace('TV-PG', 'PG')

In [27]:
cin['content_rating'] = cin['content_rating'].replace('X', 'NC-17')

In [28]:
cin['content_rating'] = cin['content_rating'].replace('M', 'R')

In [29]:
cin['content_rating'] = cin['content_rating'].replace('GP', 'PG')

In [30]:
cin['content_rating'] = cin['content_rating'].replace('TV-MA', 'NC-17')

In [31]:
cin['content_rating'] = cin['content_rating'].replace('TV-Y', 'G')

In [32]:
cin['content_rating'] = cin['content_rating'].replace('TV-Y7', 'G')

In [33]:
cin['content_rating'] = cin['content_rating'].replace('Passed', 'Not Rated')

In [34]:
# get the number of missing data points per column
missing_values_count = cin.isnull().sum()

# look at the # of missing points in the first ten columns
missing_values_count[0:]

color                          2
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           7
plot_keywords                 37
movie_imdb_link                0
num_user_for_reviews           0
language                       3
country                        0
content_rating                60
budget                       264
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

## trying to fill in missing data using other data

In [35]:
df = cin

## budget

In [36]:
for index, row in df.iterrows():
    if pd.isnull(row['budget']):
        director_name = row['director_name']
        similar_row = df[(df['director_name'] == director_name) & (~df['budget'].isnull())].head(1)
        if not similar_row.empty:
            budget_value = similar_row['budget'].values[0]
            df.loc[index, 'budget'] = budget_value

In [37]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          2
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           7
plot_keywords                 37
movie_imdb_link                0
num_user_for_reviews           0
language                       3
country                        0
content_rating                60
budget                       129
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

In [38]:
for index, row in df.iterrows():
    if pd.isnull(row['budget']):
        title_year = row['title_year']
        similar_rows = df[(df['title_year'] == title_year) & (~df['budget'].isnull())]
        if not similar_rows.empty:
            average_budget = similar_rows['budget'].mean()
            df.loc[index, 'budget'] = average_budget

In [39]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          2
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           7
plot_keywords                 37
movie_imdb_link                0
num_user_for_reviews           0
language                       3
country                        0
content_rating                60
budget                         3
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

## filling in color

In [40]:
for index, row in df.iterrows():
    if pd.isnull(row['color']):
        title_year = row['title_year']
        similar_row = df[(df['title_year'] == title_year) & (~df['color'].isnull())].head(1)
        if not similar_row.empty:
            color_value = similar_row['color'].values[0]
            df.loc[index, 'color'] = color_value

In [41]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          0
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           7
plot_keywords                 37
movie_imdb_link                0
num_user_for_reviews           0
language                       3
country                        0
content_rating                60
budget                         3
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

## facenumber in poster

In [42]:
for index, row in df.iterrows():
    if pd.isnull(row['facenumber_in_poster']):
        actor_names = [row['actor_1_name'], row['actor_2_name'], row['actor_3_name']]
        face_count = sum(pd.notnull(actor_name) for actor_name in actor_names)
        df.loc[index, 'facenumber_in_poster'] = face_count

In [43]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          0
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           0
plot_keywords                 37
movie_imdb_link                0
num_user_for_reviews           0
language                       3
country                        0
content_rating                60
budget                         3
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

## content_rating

In [44]:
reference_movies = df.dropna(subset=['content_rating', 'budget'])

budget_ranges = [(0, 1_000_000), (1_000_000, 10_000_000), (10_000_000, np.inf)]
for budget_range in budget_ranges:
    min_budget, max_budget = budget_range

    most_common_rating = reference_movies[
        (reference_movies['budget'] >= min_budget) &
        (reference_movies['budget'] < max_budget)
    ]['content_rating'].mode().iloc[0]

    missing_ratings = df[
        (df['budget'] >= min_budget) &
        (df['budget'] < max_budget) &
        df['content_rating'].isnull()
    ]
    df.loc[missing_ratings.index, 'content_rating'] = most_common_rating

In [45]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          0
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           0
plot_keywords                 37
movie_imdb_link                0
num_user_for_reviews           0
language                       3
country                        0
content_rating                 0
budget                         3
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

## plot_keywords

In [46]:
genre_keywords = {
    'Action': ['action', 'adventure', 'fight', 'explosion'],
    'Comedy': ['comedy', 'funny', 'humor', 'laugh'],
    'Drama': ['drama', 'emotional', 'relationship', 'character'],
    'Thriller': ['thriller', 'suspense', 'mystery', 'twist'],
    'Romance': ['romance', 'love', 'heartbreak', 'passion'],
    'Sci-Fi': ['sci-fi', 'technology', 'future', 'aliens'],
    'Horror': ['horror', 'scary', 'fear', 'supernatural'],
    'Fantasy': ['fantasy', 'magic', 'mythical', 'adventure'],
    'Mystery': ['mystery', 'investigation', 'detective', 'clue'],
    'Animation': ['animation', 'cartoon', 'fun', 'family']
}

In [47]:
for index, row in df.iterrows():
    if pd.isnull(row['plot_keywords']):
        genre = row['genres']
        if genre in genre_keywords:
            keywords = genre_keywords[genre]
            df.loc[index, 'plot_keywords'] = ', '.join(keywords)

In [48]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          0
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           0
plot_keywords                 27
movie_imdb_link                0
num_user_for_reviews           0
language                       3
country                        0
content_rating                 0
budget                         3
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

## language

In [49]:
for index, row in df.iterrows():
    if pd.isnull(row['language']):
        country = row['country']
        matching_country = df[df['country'] == country]
        non_null_languages = matching_country['language'].dropna()
        if non_null_languages.empty:
            continue
        language = non_null_languages.iloc[0]
        df.loc[index, 'language'] = language

In [50]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          0
director_name                  3
num_critic_for_reviews         2
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           0
plot_keywords                 27
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating                 0
budget                         3
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

## num_critic_for_reviews

(do we really need now regression here?)

In [51]:
for index, row in df.iterrows():
    if pd.isnull(row['num_critic_for_reviews']):
        movie_likes = row['movie_facebook_likes']
        df_subset = df[df['movie_facebook_likes'] == movie_likes]
        non_null_reviews = df_subset['num_critic_for_reviews'].dropna()
        if non_null_reviews.empty:
            continue
        num_reviews = non_null_reviews.mean()
        df.loc[index, 'num_critic_for_reviews'] = num_reviews

In [52]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                          0
director_name                  3
num_critic_for_reviews         1
duration                       2
director_facebook_likes        3
actor_3_facebook_likes         7
actor_2_name                   1
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           0
plot_keywords                 27
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating                 0
budget                         3
title_year                     3
actor_2_facebook_likes         1
imdb_score                     0
aspect_ratio                 100
movie_facebook_likes           0
war_symb_title                 0
point_symb_title               0
movie_desc

# Filling in missing data

## Lets try to fill out missing numeric values in data using median

In [53]:
numeric_cols = df.select_dtypes(include=['float64', 'int64'])
df[numeric_cols.columns] = numeric_cols.fillna(numeric_cols.median())

In [54]:
missing_values_count = df.isnull().sum()
missing_values_count[0:]

color                         0
director_name                 3
num_critic_for_reviews        0
duration                      0
director_facebook_likes       0
actor_3_facebook_likes        0
actor_2_name                  1
actor_1_facebook_likes        0
gross                         0
genres                        0
actor_1_name                  0
movie_title                   0
num_voted_users               0
cast_total_facebook_likes     0
actor_3_name                  7
facenumber_in_poster          0
plot_keywords                27
movie_imdb_link               0
num_user_for_reviews          0
language                      0
country                       0
content_rating                0
budget                        0
title_year                    0
actor_2_facebook_likes        0
imdb_score                    0
aspect_ratio                  0
movie_facebook_likes          0
war_symb_title                0
point_symb_title              0
movie_description             0
dtype: i

In [55]:
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()

percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

0.029774264066380937


## lets try to fill in cat val using mode

In [56]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)

In [57]:
# get the number of missing data points per column
missing_values_count = df.isnull().sum()

# look at the # of missing points in the first ten columns
missing_values_count[0:]

color                        0
director_name                0
num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
gross                        0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
movie_imdb_link              0
num_user_for_reviews         0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
war_symb_title               0
point_symb_title             0
movie_description            0
dtype: int64

In [58]:
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()

percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

0.0


## splitting genres

In [59]:
df['genres_split'] = df['genres'].apply(lambda x: x.split('|'))

In [60]:
df.to_csv("cin_prep.csv", index=False)