In [None]:
<h3  style = "text-align:center;font-size:40px;font-family:courier">Movie Visualization &<br>Recommendation System</h3>

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings("ignore")

<h3 style = "text-align:center;font-size:30px;font-family:courier">Exploratory Data Analysis and Data Cleaning</h3>

In [None]:
# Load the movies CSV from the Kaggle input directory into the `movie` DataFrame
movie = pd.read_csv('/kaggle/input/movies-dataset-for-feature-extracion-prediction/movies.csv')
# Preview the first rows to check how the file was parsed
movie.head()

In [None]:
import pandasql as psql # using sql for some exploration

# Top 20 titles whose rating AND vote count are both at or above the dataset average.
# VOTES is text containing "," separators, so it is converted to an integer once in the
# CTE and the outer query reads only from the CTE. Filtering on the raw `movie.VOTES`
# column instead would compare text against avg(), i.e. a string comparison in SQLite.
Q1 = """
    with temp as (
        select MOVIES, YEAR, GENRE, RATING,
               cast(replace(VOTES, ',', '') as integer) as VOTES
        from movie
    )

    select MOVIES, YEAR, GENRE, RATING, VOTES
    from temp
    where RATING >= (select avg(RATING) from temp)
    and VOTES >= (select avg(VOTES) from temp)
    order by RATING desc, VOTES desc
    limit 20
    """
psql.sqldf(Q1)

In [None]:
# Highest rating reached within each GENRE value, keeping the ten best-rated genres.
# GENRE is grouped as the full string stored in the column.
Q2 = """
     select GENRE, max(RATING) as highest_rate
     from movie  
     group by GENRE
     order by highest_rate desc
     limit 10
     """
# Run the query through pandasql and display the resulting DataFrame
psql.sqldf(Q2)

### Context for each column
* MOVIES: Name of the movie or TV show
* YEAR: Year (or range of years) in which the movie or TV show was released or aired
* GENRE: Genre(s) of the movie or TV show
* RATING: Audience rating of the movie or TV show (out of 10)
* ONE-LINE: Short one-line description that gives the audience a first impression
* STARS: The cast (and the director, when listed)
* VOTES: Number of votes cast by the audience
* RunTime: Running time in minutes
* Gross: Global earnings of the movie or TV show (in millions of US dollars)

In [None]:
# Checking for Missing Values: number of NaN entries in each column
movie.isna().sum()

In [None]:
# Report, for every column, the percentage and absolute number of missing values
print("Missing Values:\n")
n_rows = movie.shape[0]
for column in movie.columns:
    n_missing = movie[column].isna().sum()
    pct_missing = n_missing / n_rows * 100
    print("%s: %.2f%% (%d)" % (column, pct_missing, n_missing))

In [None]:
# Column names, non-null counts and dtypes of the raw data
movie.info()

### Cleaning some of the features

In [None]:
# Strip embedded newlines and surrounding whitespace from the free-text columns
text_columns = ['GENRE', 'ONE-LINE', 'STARS']
for name in text_columns:
    without_newlines = movie[name].str.replace("\n", "")
    movie[name] = without_newlines.str.strip()

movie.head()

In [None]:
# Creating new columns Director and Stars by extracting the director(s) and stars from the original STARS column
def extract_director(direc):
    """Return the director name(s) contained in a cleaned STARS entry.

    Parameters
    ----------
    direc : str or NaN
        One value of the STARS column, e.g. "Director:<name>| Stars:<names>".

    Returns
    -------
    str
        The text between the first ":" and the next ":" of the part before "|",
        stripped of surrounding whitespace. An empty string is returned when the
        entry is missing (non-string), mentions no director, or has no ":".
    """
    # Missing entries arrive as NaN (a float); `in` on a float would raise TypeError
    if not isinstance(direc, str) or 'Director' not in direc:
        return ''

    credit = direc.strip().split("|")[0]  # the part after "|" holds the stars
    parts = credit.split(":")
    # Guard against a label without ":" instead of raising IndexError
    return parts[1].strip() if len(parts) > 1 else ''

def extract_stars(stars):
    """Return the cast list contained in a cleaned STARS entry.

    Parameters
    ----------
    stars : str or NaN
        One value of the STARS column, e.g. "Director:<name>| Stars:<names>".

    Returns
    -------
    str
        The text after the last ":" (stripped) when the entry carries a
        "Star:" or "Stars:" label, otherwise an empty string. Missing
        (non-string) entries also give an empty string.
    """
    # Missing entries arrive as NaN (a float); `in` on a float would raise TypeError
    if not isinstance(stars, str):
        return ''
    # Accept both the singular and the plural label. Requiring BOTH substrings
    # (the previous `or` condition) discarded every single-star entry.
    if 'Star:' not in stars and 'Stars:' not in stars:
        return ''
    return stars.split(":")[-1].strip()  # last segment holds the star names

# Derive the two new columns from the cleaned STARS text
movie['Director'] = movie['STARS'].apply(extract_director)
movie['Stars'] = movie['STARS'].apply(extract_stars)

# Side-by-side look at the source column and the extracted values
movie[['STARS', 'Director', 'Stars']].head()

<h3 style = "text-align:center;font-size:30px;font-family:courier">Data Visualization</h3>

### Years

In [None]:
# Extracting the year (or year range) from the original YEAR column.
# The pattern keeps either "<4 digits>–<rest of the text>" or a bare 4-digit year.
movie['Year'] = movie['YEAR'].str.extract(r'([0-9]{4}–.*|[0-9]{4})')
# Only whitespace is trimmed here. The former `.replace(")", "")` was a Series.replace,
# which substitutes cells that are exactly ")" and therefore never removed a trailing
# parenthesis; that clean-up is done by extract_year.
movie['Year'] = movie['Year'].str.strip()

def extract_year(year):
    """Remove the closing parenthesis left over from the raw YEAR text.

    A value ending in '– )' (open-ended range) has that suffix pattern replaced
    by '–'; for any other value every ')' is removed.
    """
    is_open_range = year.endswith('– )')
    return year.replace('– )', "–") if is_open_range else year.replace(')', "")

# Rows without a recognisable year are labelled 'Unknown' so extract_year always gets a string
movie['Year'] = movie['Year'].fillna('Unknown')
movie['Year'] = movie['Year'].apply(lambda y: extract_year(y))

# Frequency of each year / year range, ignoring the 'Unknown' placeholder.
# rename_axis + reset_index(name=...) yields the columns ['Year', 'Count'] on every pandas
# version. Renaming the default 'index' / 'Year' columns only works before pandas 2.0,
# because value_counts() now names its result 'count' and keeps the column name on the index.
year_count = (movie.loc[movie['Year'] != 'Unknown', 'Year']
              .value_counts()
              .rename_axis('Year')
              .reset_index(name = 'Count'))
year_count.head()

In [None]:
# Bar colours for the ten bars: positions 0, 2, 4 and 9 are drawn in the darker shade
highlighted = {0, 2, 4, 9}
colors = ['darkcyan' if pos in highlighted else 'paleturquoise' for pos in range(10)]

fig = px.bar(year_count.head(10), x = 'Year', y = 'Count')
fig.update_traces(marker_color = colors)
fig.update_layout(title = 'Year(s) Distribution')
fig.show()

### Rating

In [None]:
# Summary statistics of the RATING column
rating = movie['RATING']

print("Statistical value of [{}]".format('Rating'))
print("Mean:", round(rating.mean(), 2))  # average, rounded to two decimals
print("Median:", rating.median())
print("Max:", rating.max())

In [None]:
# Ten most frequent rating values.
# The count table is given explicit column names ('Rating', 'Count'): the default names
# produced by value_counts().reset_index() changed in pandas 2.0, so referring to
# 'index' / 'RATING' fails there.
rating_count = (movie['RATING']
                .value_counts()
                .rename_axis('Rating')
                .reset_index(name = 'Count')
                .head(10))

fig = px.bar(data_frame = rating_count,
             x = 'Rating', y = 'Count',
             title = 'Rating Distribution')

fig.update_yaxes(title = 'Count')

# Treat the ratings as categories so the bars keep the frequency order
fig.update_xaxes(type ='category',
                 title = 'Rating (out of 10)')

fig.show()

### RunTime

In [None]:
# Ten most frequent running times.
# The count table is given explicit column names ('Runtime', 'Count'): the default names
# produced by value_counts().reset_index() changed in pandas 2.0, so referring to
# 'index' / 'RunTime' fails there.
runtime_count = (movie['RunTime']
                 .value_counts()
                 .rename_axis('Runtime')
                 .reset_index(name = 'Count')
                 .head(10))

fig = px.bar(data_frame = runtime_count,
             x = 'Runtime', y = 'Count',
             title = 'Runtime Distribution')

fig.update_yaxes(title = 'Count')

# Treat the running times as categories so the bars keep the frequency order
fig.update_xaxes(type ='category',
                 title = 'Runtime (mins)')

fig.show()

### Voting

In [None]:
# Re-check the column dtypes before converting VOTES
movie.info()

In [None]:
# VOTES is text containing "," separators; drop the commas so the values can be cast to int.
# The dtype guard keeps this cell re-runnable: once VOTES has been converted to a numeric
# dtype the .str accessor would raise an AttributeError.
if not pd.api.types.is_numeric_dtype(movie['VOTES']):
    movie['VOTES'] = movie['VOTES'].str.replace(",", "", regex = False)
movie['VOTES']

In [None]:
# Missing vote counts become 0, then the whole column is stored as integers
movie['VOTES'] = movie['VOTES'].fillna(0).astype(int)
movie['VOTES'].sort_values(ascending = False)

### Genre

In [None]:
# Frequency of every genre combination (the GENRE string taken as a whole).
# rename_axis + reset_index(name=...) yields the columns ['Genre', 'Count'] on every pandas
# version; renaming the default 'index' / 'GENRE' columns only works before pandas 2.0.
movie_genre = (movie['GENRE']
               .value_counts()
               .rename_axis('Genre')
               .reset_index(name = 'Count'))

fig = px.bar(data_frame = movie_genre.sort_values(by='Count',ascending = False).head(10),
             x = 'Genre', y = 'Count')

fig.update_layout(title = 'Top 10 Genre Combination')

fig.show()

### Looking at Individual Genre

In [None]:
# Count how often each individual genre occurs
from collections import Counter

genre_raw = movie['GENRE'].dropna().to_list()

# Flatten the ", "-separated combinations into one list of single genres
genre_list = [single for combo in genre_raw for single in combo.split(", ")]

genre_df = (
    pd.DataFrame
    .from_dict(Counter(genre_list), orient = 'index')
    .rename(columns = {0:'Count'})
)
genre_df.head()

In [None]:
# Genre Count Distribution: share of each individual genre
fig = px.pie(data_frame = genre_df,
             values = 'Count',
             names = genre_df.index,
             color_discrete_sequence = px.colors.qualitative.Safe)

# Label every slice inside the pie and pull all slices out slightly
fig.update_traces(textposition = 'inside',
                  textinfo = 'label+percent',
                  pull = [0.05] * len(genre_df.index.to_list()))

# The legend lists genres (it was mislabelled 'Gender').
# uniformtext_* hides slice labels that would need a font smaller than 13.
fig.update_layout(title = {'text':'Genre Distribution'},
                  legend_title = 'Genre',
                  uniformtext_minsize=13,
                  uniformtext_mode='hide',
                  font = dict(
                      family = "Courier New, monospace",
                      size = 18,
                      color = 'black'
                  ))


fig.show()

### Director

In [None]:
# Ten directors with the most titles, ignoring rows where no director was extracted.
# The count table is given explicit column names ('Director', 'Count'): the default names
# produced by value_counts().reset_index() changed in pandas 2.0, so referring to
# 'index' / 'Director' as x / y fails there.
director_count = (movie.loc[movie['Director'] != "", 'Director']
                  .value_counts()
                  .rename_axis('Director')
                  .reset_index(name = 'Count')
                  .head(10))

fig = px.bar(data_frame = director_count,
             x = 'Director', y = 'Count')

fig.update_layout(title = 'Director(s) Distribution',
                  xaxis_title = 'Director(s)',
                  yaxis_title = 'Count')

fig.show()



### Stars

In [None]:
# Flatten the ", "-separated cast strings (rows without stars are skipped) into one list of names
cast_entries = movie.loc[movie['Stars'] != "", 'Stars'].to_list()
stars_list = [name for entry in cast_entries for name in entry.split(", ")]

# Appearances per star, most frequent first
stars_df = (
    pd.DataFrame
    .from_dict(Counter(stars_list), orient = 'index')
    .rename(columns = {0:'Count'})
    .sort_values(by='Count',ascending = False)
)

top_stars = stars_df.head(10)
fig = px.bar(data_frame = top_stars, x = top_stars.index, y = 'Count')
fig.update_layout(title = 'Top 10 Stars that appeared the most',
                  xaxis_title = 'Stars',
                  yaxis_title = 'Count')
fig.show()

### Gross

In [None]:
# New DataFrame with no NaN in the Gross column. .copy() makes it independent of `movie`,
# so the later assignment to gross_df['Gross'] writes to this frame only and is not a
# chained assignment on a slice (SettingWithCopyWarning).
gross_df = movie[~movie['Gross'].isna()].copy()

# Extract the numerical value
def extract_gross(gross):
    """Convert a gross string to float after deleting every '$' and 'M' character."""
    digits_only = gross.translate(str.maketrans("", "", "$M"))
    return float(digits_only)

# Unit is Million US Dollar
gross_df['Gross'] = gross_df['Gross'].apply(extract_gross)

# Highest Gross Movie: title at the row position of the largest Gross value
top_position = gross_df['Gross'].argmax()
print("Highest Gross movie:", gross_df['MOVIES'].iloc[top_position])

In [None]:
# Ten titles with the largest gross, highest first
fig = px.bar(
    gross_df.sort_values('Gross', ascending = False).head(10),
    x = 'MOVIES',
    y = 'Gross',
    title = 'Top 10 Gross Movie',
)
fig.update_layout(yaxis_title = 'Million US Dollar')
fig.show()

<h3 style = "text-align:center;font-size:30px;font-family:courier">Content-based Filtering Movie Recommendation System</h3>

In [None]:
# Features  using GENRE, RATING??, ONE-LINE, RunTime??, Director, Stars

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

features = ['GENRE','ONE-LINE','Director','Stars']

# Missing text becomes an empty string so the columns can be concatenated
movie[features] = movie[features].fillna("")

# One space-separated text per title, in the order GENRE, ONE-LINE, Director, Stars
movie['combined_features'] = movie['GENRE'].str.cat(
    [movie['ONE-LINE'], movie['Director'], movie['Stars']],
    sep = " ",
)

# Token-count vectors of the combined text and their pairwise cosine similarity
cv = CountVectorizer()
count_matrix = cv.fit_transform(movie['combined_features'])
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Function for movie recommendation
def movie_recommendation(mov,sim_num = 5):

    user_choice = mov
    
    try:
        ref_index = movie[movie['MOVIES'].str.contains(user_choice, case = False)].index[0]

        similar_movies = list(enumerate(cosine_sim[ref_index]))

        sorted_simmilar_movies = sorted(similar_movies, key = lambda x: x[1], reverse = True)[1:]

        print('\nRecomended Movies for [{}]'.format(user_choice))
        print('-'*(24 + len(user_choice)))

        for i, element in enumerate(sorted_simmilar_movies):
            similar_movie_id = element[0]
            similar_movie_title = movie['MOVIES'].iloc[similar_movie_id]
            s_score = element[1]
            print('{:40} -> {:.3f}'.format(similar_movie_title, s_score))

            if i > sim_num:
                break
    except IndexError:
        print("\n[{}] is not in our database!".format(user_choice))
        print("We couldn't recommend anyting...Sorry...")

In [None]:
# Search for movie with the keyword
def movie_available(key):
    """Print a numbered list of every title in `movie` that contains `key`.

    Parameters
    ----------
    key : str
        Text to look for in the MOVIES column. It is matched literally and
        case-insensitively, the same way movie_recommendation looks titles up,
        so every title listed here can be found by that function as well.

    Notes
    -----
    Reads the module-level `movie` DataFrame; nothing is returned.
    """
    keyword = key

    print("Movie with keyword: [{}]".format(keyword))

    # regex=False: keywords such as "(2021)" or "?" are plain text, not patterns.
    # na=False: rows with a missing title would otherwise make the boolean mask invalid.
    matches = movie['MOVIES'].str.contains(keyword, case = False, regex = False, na = False)

    for i, mov in enumerate(movie.loc[matches, 'MOVIES'].to_list()):
        print("{}) {} ".format(i+1,mov))

In [None]:
# Running the search: list every title that contains the keyword "Spider"
movie_available("Spider")

In [None]:
# Running the recommendation with the default sim_num
movie_recommendation("Spider-Man: Far from home")

In [None]:
# Running the recommendation again, passing sim_num = 10 explicitly
movie_recommendation("Spider-Man: Far from home",10)

<h4 style = "font-family:courier;font-size:20px">These recommendations might not match what you find on the internet, because not every movie is included in this dataset.</h4>
<h3 style = "font-family:courier;font-size:40px;text-align:center">Thank you for checking out my work!</h3>