In [2]:
import re
from pprint import pprint

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from gensim.models import KeyedVectors, Word2Vec
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


# Do not truncate long text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
# Regex matching runs of anything other than ASCII letters, digits and spaces.
SPECIAL_CHARS = '[^A-Za-z0-9 ]+'
# NLTK's English stop-word list (assumes the NLTK 'stopwords' corpus is installed — TODO confirm).
STOP_WORDS = stopwords.words('english')

# 2. Functions

In [3]:
def preprocess_text(text, stop_words=None, special_chars=None):
    """
    Lower-case a document, drop stop words, then strip special characters.

    Parameters
    ----------
    text : str
        Raw document.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``STOP_WORDS``.
    special_chars : str, optional
        Regex of character runs to delete. Defaults to the module-level
        ``SPECIAL_CHARS``.

    Returns
    -------
    str
        The cleaned document.

    Notes
    -----
    Stop words are filtered *before* punctuation is stripped, so a token
    such as ``"the,"`` is not recognised as a stop word and survives as
    ``"the"``. The text is split on single spaces, so runs of spaces are
    preserved.
    """
    # Use the constants defined once at the top of the notebook instead of
    # rebuilding them (and reloading the NLTK list) on every call.
    if stop_words is None:
        stop_words = STOP_WORDS
    if special_chars is None:
        special_chars = SPECIAL_CHARS
    stop_words = frozenset(stop_words)  # constant-time membership tests

    kept = [word for word in text.lower().split(' ') if word not in stop_words]
    return re.sub(special_chars, '', ' '.join(kept))

In [4]:
def tokenise(doc):
    """Return the text of every token the global spaCy pipeline ``nlp`` finds in ``doc``."""
    tokens = []
    for token in nlp(doc):
        tokens.append(token.text)
    return tokens

In [5]:
def tokenise_lemma(doc):
    """
    Run ``doc`` through the global spaCy pipeline ``nlp`` and lemmatise it.
    Returns one string: the lemma of every token, joined by single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(doc))
    return ' '.join(lemmas)

In [6]:
# This is one way to get each title's vector representation (mean of its word vectors).
# More investigation is needed later.

def get_vectors(first_map, second_map):
    """
    Build one vector per title by averaging the vectors of its tokens.

    Parameters
    ----------
    first_map : dict
        Maps a title to its list of tokenised words.
    second_map : mapping
        Maps a word to its vector (e.g. the pretrained word2vec model).
        Must raise ``KeyError`` for unknown words.

    Returns
    -------
    dict
        Maps each title to the element-wise mean of its word vectors.
        Titles for which no token is found in ``second_map`` are omitted:
        averaging an empty list would produce NaN, which cannot be used
        in a similarity computation.
    """
    first_vec = dict()
    for title, description in first_map.items():
        word_vectors = list()
        for element in description:  # element = one tokenised word
            try:
                word_vectors.append(second_map[element])
            except KeyError:
                # Out-of-vocabulary word: it contributes nothing to the mean.
                continue
        if not word_vectors:
            continue
        first_vec[title] = np.mean(word_vectors, axis=0)

    return first_vec

In [7]:
def search(name, df):
    """Return the rows of ``df`` whose 'title' equals ``name``, ignoring case."""
    wanted = name.lower()
    is_match = df['title'].str.lower() == wanted
    return df.loc[is_match]

In [8]:
def get_topN_similar(lookup_id, title_vec, df, N=10):
    """
    Return the titles most similar to ``lookup_id`` as a DataFrame.

    Parameters
    ----------
    lookup_id : str
        Title to look up; it is lower-cased before the lookup.
    title_vec : dict
        Maps a title to its vector.
    df : pandas.DataFrame
        Frame with 'title', 'originals' and 'genres' columns.
    N : int, default 10
        Number of neighbours wanted.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'similarity', 'originals', 'genres', sorted by
        descending similarity. Up to ``N + 1`` rows are returned because
        the looked-up title normally ranks first against itself.
    """
    # The ranking itself is identical to get_most_similar, so reuse it
    # rather than keeping a second copy of the same loop.
    ranked = get_most_similar(lookup_id.lower(), title_vec, df)
    return pd.DataFrame(ranked[:N + 1],
                        columns=['title', 'similarity', 'originals', 'genres'])

In [9]:
def get_most_similar(lookup_id, title_vec, df):
    """
    Rank every title in ``title_vec`` by cosine similarity to ``lookup_id``.

    Parameters
    ----------
    lookup_id : str
        Key into ``title_vec``. If the exact key is absent, its lower-cased
        form is tried, matching how get_topN_similar looks titles up.
    title_vec : dict
        Maps a title to its vector.
    df : pandas.DataFrame
        Frame with 'title', 'originals' and 'genres' columns.

    Returns
    -------
    list of tuple
        ``(title, similarity, originals, genres)`` sorted by descending
        similarity, where the last two items are the value arrays of the
        matching rows in ``df``.

    Raises
    ------
    KeyError
        If ``lookup_id`` is not a key of a non-empty ``title_vec``.
    """
    if not title_vec:
        return []

    key = lookup_id if lookup_id in title_vec else lookup_id.lower()
    # The query vector does not change inside the loop, so reshape it once.
    query = title_vec[key].reshape(1, -1)

    sim = list()
    for uid, vec in title_vec.items():
        score = cosine_similarity(vec.reshape(1, -1), query)[0][0]
        rows = search(uid, df)  # one lookup per title, not one per column
        sim.append((uid, score, rows.originals.values, rows.genres.values))

    return sorted(sim, key=lambda x: x[1], reverse=True)

In [10]:
def filter_df(keyword, df=None):
    """
    Return the rows whose title matches the keyword(s), ignoring case.

    Parameters
    ----------
    keyword : str or iterable of str
        One title, or several titles, to keep.
    df : pandas.DataFrame, optional
        Frame with a 'title' column. Defaults to the global ``netflixDf``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order.
    """
    if df is None:
        df = netflixDf
    keywords = [keyword] if isinstance(keyword, str) else list(keyword)
    # Lower-case both sides so a list of titles is matched the same way
    # as a single title.
    wanted = [str(item).lower() for item in keywords]
    return df.loc[df['title'].str.lower().isin(wanted)]

In [11]:
def markerX(key, values):
    """
    Select the rows of the global ``netflixDf`` whose lower-cased ``key``
    column is in ``values``, ordered by 'pca_2' from largest to smallest.
    """
    in_values = netflixDf[key].str.lower().isin(values)
    selected = netflixDf.loc[in_values]
    return selected.sort_values(by='pca_2', ascending=False)

def others(key, values):
    """
    Select the rows of the global ``netflixDf`` whose lower-cased ``key``
    column is NOT in ``values``.
    """
    in_values = netflixDf[key].str.lower().isin(values)
    return netflixDf.loc[~in_values]

# 3. Analyse Pipeline

## Terms explained
Document -> a bunch of texts <br>
Corpus -> a bunch of documents <br>
Vectors -> a mathematically convenient representation of a document (a bunch of text) <br>
Models -> an algorithm for transforming vectors from one representation to another <br>

## Read the dataset/ Load the spacy pretrained model

In [None]:
# netflixDf = pd.read_csv('finalDataset_v3.csv', usecols=['title','type','description','genres','originals', 'year'])

In [None]:
# df = pd.read_csv('debugged.csv', usecols=['cleaned'])

In [None]:
# netflixDf = pd.concat([netflixDf,df], axis=1)

In [None]:
# np.where(pd.isnull(df.cleaned))

In [12]:
# Load spaCy's pretrained 'en_core_web_sm' pipeline; tokenise() and tokenise_lemma() call it as `nlp`.
nlp = spacy.load('en_core_web_sm')

## Analyse Descriptive Data

In [15]:
# Read the selected columns of the movie dataset; 'everything' is the text that gets tokenised later.
movieDf = pd.read_csv('movie_dataset_july18.csv', usecols=['title','genres','year','type','originals','everything'])

## Create Corpus and apply word embedding

In [14]:
# List the loaded columns.
# NOTE(review): the saved output shows 'Unnamed: 0' and 'description', which the usecols of the
# load cell does not request, and this cell's execution count precedes the load cell's — the
# output looks stale; re-run the notebook in order.
movieDf.columns

Index(['Unnamed: 0', 'title', 'genres', 'year', 'type', 'originals',
       'description', 'everything'],
      dtype='object')

In [19]:
# Tokenise the 'everything' text of every movie (one token list per row).
movieTkDocs = [tokenise(doc) for doc in movieDf.everything.values.tolist()]

In [20]:
# Map each lower-cased title to its token list so titles can be looked up case-insensitively later.
# Note: titles that are identical after lower-casing collapse to a single key (the last one wins).
movieMap = dict(zip(movieDf['title'].str.lower().tolist(), movieTkDocs))

In [21]:
# Load the pretrained word vectors from a binary word2vec-format file.
path = "GoogleNews-vectors-negative300.bin"
w2v = KeyedVectors.load_word2vec_format(path, binary=True)
# Author's note: loading this way is much faster and takes less than 2 minutes.

In [22]:
# One averaged word vector per (lower-cased) movie title.
movieTitleVec = get_vectors(movieMap, w2v)

## What if I include genre information

In [None]:
# Keep only the rows of netflixDf whose type is 'movie' (as an independent copy).
# NOTE(review): the only cells that create netflixDf are commented out above, so this fails on a
# fresh kernel unless netflixDf is loaded elsewhere — confirm.
movieDf_gen = netflixDf.loc[netflixDf['type']=='movie'].copy()

In [None]:
# netflixDf['genres'].fillna('No info', inplace=True)

In [None]:
# Keep a year only when it is exactly four characters long; everything else
# (missing values, malformed entries) becomes 'No info'. Going through str()
# means a missing value (NaN) no longer crashes the length check.
y = [
    year if len(str(year)) == 4 else 'No info'
    for year in netflixDf.year.values.tolist()
]

In [None]:
# Replace the 'year' column with the cleaned list `y` built in the previous cell.
netflixDf['year'] = y

In [None]:
# netflixDf.to_csv('finaldataset_v3.csv')

In [None]:
# Load the selected columns of the full dataset, then keep an independent copy of the movie rows.
temp = pd.read_csv('finaldataset_v3.csv', usecols=['title','genres','year','type','originals','cleaned'])
mvevt = temp.loc[temp['type']=='movie'].copy()

In [None]:
# Peek at the first row to check the column layout.
mvevt.head(1)

In [None]:
# Text used for the embedding: column 5 + ' ' + column 1 of every row (same positional
# columns as before, but for all rows rather than a hard-coded 3770).
# presumably these are 'cleaned' and 'genres' — TODO confirm against the CSV column order
everything = (mvevt.iloc[:, 5] + ' ' + mvevt.iloc[:, 1]).tolist()
    

In [None]:
# Store the combined text as a new 'everything' column.
mvevt['everything'] = everything

In [None]:
# Rename 'cleaned' to 'description'; reassign instead of mutating in place.
mvevt = mvevt.rename(columns={'cleaned':'description'})

In [None]:
# Tokenise the combined 'everything' text of every movie (one token list per row).
mvevtTkDocs = [tokenise(doc) for doc in mvevt.everything.values.tolist()]

In [None]:
# Map each lower-cased title to its token list so titles can be looked up case-insensitively.
# Note: this rebinds movieMap, replacing the mapping built from movieDf earlier.
movieMap = dict(zip(mvevt['title'].str.lower().tolist(), mvevtTkDocs))

In [None]:
# Recompute the per-title vectors from the new mapping (rebinds movieTitleVec).
movieTitleVec = get_vectors(movieMap, w2v)

In [None]:
# Titles most similar to 'tall girl', using the current movieTitleVec and the mvevt metadata.
get_topN_similar('tall girl', movieTitleVec, mvevt)

In [None]:
# Look up one title's metadata. Note this queries movieDf, not the mvevt frame used for the similarity call.
search('social animals',movieDf)

In [None]:
# Titles most similar to 'bird box', using the current movieTitleVec and the mvevt metadata.
get_topN_similar('bird box', movieTitleVec, mvevt)

In [None]:
# Look up one title's metadata. Note this queries movieDf, not the mvevt frame used for the similarity call.
search('mute', movieDf)

## It is clear that adding genres to the corpus improves accuracy

# Descriptive analysis

In [None]:
# Genre counts: start from an independent copy of the movie rows of `temp`.
mv = temp.loc[temp['type']=='movie'].copy()

In [None]:
from collections import defaultdict

In [None]:
def count_genres(genre_list):
    """
    Count how often each genre occurs.

    Parameters
    ----------
    genre_list : iterable of str
        One comma-separated genre string per title. Entries that are not
        strings (e.g. NaN for missing genre information) are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns 'genres', 'counts' and 'percentage' (each count divided by
        the total of all counts, rounded to 2 decimals), sorted by
        descending count. Empty input gives an empty frame with the same
        three columns.
    """
    genres_count = defaultdict(int)
    for movie in genre_list:
        if not isinstance(movie, str):
            continue  # missing genre info cannot be split
        for genre in movie.split(','):
            genres_count[genre] += 1

    # Build the two columns directly; the explicit dtype keeps 'counts'
    # numeric even when there are no genres at all.
    df = pd.DataFrame({
        'genres': list(genres_count.keys()),
        'counts': pd.Series(list(genres_count.values()), dtype='int64'),
    })
    df = df.sort_values(by='counts', ascending=False).reset_index(drop=True)
    # Compute the total once instead of once per row.
    df['percentage'] = (df['counts'] / df['counts'].sum()).round(2)
    return df

In [None]:
def show_values_on_bars(axs, h_v="v", space=0.4):
    """
    Write each bar's size, truncated to an integer, next to the bar.

    axs   : one axes object, or a numpy array of axes objects
    h_v   : "v" labels bar heights at the top centre of vertical bars;
            "h" labels bar widths to the right of horizontal bars;
            any other value draws nothing
    space : horizontal gap added after the bar end (only used for "h")
    """
    def _label_vertical(ax):
        for patch in ax.patches:
            x_pos = patch.get_x() + patch.get_width() / 2
            y_pos = patch.get_y() + patch.get_height()
            ax.text(x_pos, y_pos, int(patch.get_height()), ha="center")

    def _label_horizontal(ax):
        for patch in ax.patches:
            x_pos = patch.get_x() + patch.get_width() + float(space)
            y_pos = patch.get_y() + patch.get_height()
            ax.text(x_pos, y_pos, int(patch.get_width()), ha="left", va='bottom')

    if h_v == "v":
        labeller = _label_vertical
    elif h_v == "h":
        labeller = _label_horizontal
    else:
        return

    targets = axs.flat if isinstance(axs, np.ndarray) else [axs]
    for ax in targets:
        labeller(ax)

In [None]:
# Genre frequency table for all movies.
mv_genres_count = count_genres(mv.genres.values.tolist())

In [None]:
# Independent copy of the movie rows whose 'originals' value equals 1.
mv_org = mv.loc[mv['originals']==1].copy()

In [None]:
# Genre frequency table for the 'originals' subset of movies.
mv_org_genres_count = count_genres(mv_org.genres.values.tolist())

In [None]:
# Side-by-side horizontal bar charts of genre counts:
# left panel = all movies, right panel = the 'originals' subset.
fig, ax = plt.subplots(1,2,figsize=(16,8))
bar = sns.barplot(x = 'counts',
                  y = 'genres',
                  data = mv_genres_count,
                  ax = ax[0],
                  orient = 'h')
# Print the count at the end of each bar.
show_values_on_bars(bar, h_v='h', space=0.3)
ax[0].set_title('The distribution of Netflix movies')

bar = sns.barplot(x = 'counts',
                  y = 'genres',
                  data = mv_org_genres_count,
                  ax = ax[1],
                  orient = 'h')
show_values_on_bars(bar, h_v='h', space=0.3)
ax[1].set_title('The distribution of Netflix Originals movies')
plt.show()

Normalise the data and compare

In [None]:
# Share of each genre within its own table (count / that table's total count).
# NOTE(review): the original divided both tables by `genres_df.counts.sum()`, but no `genres_df`
# is defined in the visible notebook. Per-table totals match what count_genres already computes;
# confirm this is the intended normalisation.
mv_genres_count['percentage'] = (mv_genres_count['counts'] / mv_genres_count['counts'].sum()).round(2)
mv_org_genres_count['percentage'] = (mv_org_genres_count['counts'] / mv_org_genres_count['counts'].sum()).round(2)

In [None]:
# Side-by-side horizontal bar charts of genre shares ('percentage' column):
# left panel = all movies, right panel = the 'originals' subset.
fig, ax = plt.subplots(1,2,figsize=(16,8))
bar = sns.barplot(x = 'percentage',
                  y = 'genres',
                  data = mv_genres_count,
                  ax = ax[0],
                  orient = 'h')
# Bar labels stay disabled here: show_values_on_bars truncates values to integers.
# show_values_on_bars(bar, h_v='h', space=0.3)
ax[0].set_title('The distribution of Netflix movies')

bar = sns.barplot(x = 'percentage',
                  y = 'genres',
                  data = mv_org_genres_count,
                  ax = ax[1],
                  orient = 'h')
# show_values_on_bars(bar, h_v='h', space=0.3)
ax[1].set_title('The distribution of Netflix Originals movies')
# Save before plt.show() so the written file contains the figure.
plt.savefig('movie%.png')
plt.show()

In [None]:
def trend_in_yearN(df, genre, N):
    """
    Return how many titles of ``df`` carry ``genre`` in year ``N``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'year' and 'genres' columns.
    genre : str
        Genre to count.
    N : same type as the values of df['year']
        Year to select; ten_year_trend passes it as a string.

    Returns
    -------
    int
        The count, or 0 when the year has no titles or the genre does not
        occur in that year.
    """
    genres_in_year = df.loc[df['year'] == N, 'genres']
    if genres_in_year.empty:
        return 0
    per_genre = count_genres(genres_in_year.values.tolist()).set_index('genres')['counts']
    # .get avoids a KeyError for genres that are absent in this year.
    return int(per_genre.get(genre, 0))

In [None]:
def ten_year_trend(df, genre, s=2010, e=2020):
    """
    Count the titles of ``df`` in ``genre`` for every year from ``s`` up to,
    but not including, ``e`` (default 2010-2019, i.e. 10 years).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed on to trend_in_yearN.
    genre : str
        Genre to count.
    s, e : int
        First year and exclusive end year.

    Returns
    -------
    collections.defaultdict
        Maps each year (int) to the count for that year.
    """
    trend = defaultdict(int)
    for year in range(s, e):
        # Use the frame that was passed in (the previous version always read the
        # global `mv`). The year is handed over as a string.
        trend[year] = trend_in_yearN(df, genre, str(year))
    return trend

In [None]:
# Yearly number of comedy titles, drawn as a line with an 'X' marker on each year.
com = ten_year_trend(mv, 'comedy')
sns.lineplot(x = list(com.keys()),
            y = list(com.values()),
            marker = 'X')

In [None]:
def create_trend_dataframe(df, cols):
    """
    Build a year-by-genre table of title counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame from which ten_year_trend is computed.
    cols : list of str
        Genres to include; each becomes one column.

    Returns
    -------
    pandas.DataFrame
        Indexed by year, one column per genre in ``cols``. An empty
        ``cols`` gives an empty frame.
    """
    per_genre = [pd.Series(ten_year_trend(df, genre), name=genre) for genre in cols]
    if not per_genre:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing the frame in the loop.
    return pd.concat(per_genre, axis=1)

In [None]:
# Genres to track over the years.
cols = ['drama','comedy','action-and-adventure','thriller','romance',
        'mystery', 'documentary','crime','family','fantasy']
# Reuse create_trend_dataframe instead of repeating its loop here (which also overwrote
# `temp`), and move the year index into an 'index' column because the line plots
# further down use x='index'.
trend_df = create_trend_dataframe(mv, cols).reset_index()

In [None]:
# Display the year-by-genre table for movies.
create_trend_dataframe(mv, cols)

In [None]:
# Draw one line per column in positions 1-5 of trend_df, against its 'index' column.
# NOTE(review): this needs trend_df to carry an 'index' column (e.g. from reset_index(),
# as is done for tv_trend further down) — confirm.
genres = trend_df.columns.values.tolist()[1:6]
for gen in genres:
    sns.lineplot(x = 'index', y = gen, data = trend_df)

## TVshow

In [None]:
# Reload the selected columns of the full dataset, then keep an independent copy of the TV-show rows.
temp = pd.read_csv('finaldataset_v3.csv', usecols=['title','genres','year','type','originals','cleaned'])
tv = temp.loc[temp['type']=='tvshow'].copy()

In [None]:
# Genre frequency table for all TV shows.
tv_genres_count = count_genres(tv.genres.values.tolist())

In [None]:
# Genre frequency table for the TV shows whose 'originals' value equals 1.
tv_org = tv.loc[tv['originals']==1].copy()
tv_org_genres_count = count_genres(tv_org.genres.values.tolist())

In [None]:
# Side-by-side horizontal bar charts of genre shares ('percentage' column):
# left panel = all TV shows, right panel = the 'originals' subset.
fig, ax = plt.subplots(1,2,figsize=(16,8))
bar = sns.barplot(x = 'percentage',
                  y = 'genres',
                  data = tv_genres_count,
                  ax = ax[0],
                  orient = 'h')
# Bar labels stay disabled here: show_values_on_bars truncates values to integers.
# show_values_on_bars(bar, h_v='h', space=0.3)
ax[0].set_title('The distribution of Netflix Tvshow')

bar = sns.barplot(x = 'percentage',
                  y = 'genres',
                  data = tv_org_genres_count,
                  ax = ax[1],
                  orient = 'h')
# show_values_on_bars(bar, h_v='h', space=0.3)
ax[1].set_title('The distribution of Netflix Originals Tvshow')
# Save before plt.show() so the written file contains the figure.
plt.savefig('tvshow%.png')
plt.show()

In [None]:
# Year-by-genre table for TV shows; reset_index() turns the year index into an 'index' column for plotting.
tv_trend = create_trend_dataframe(tv, cols).reset_index()

In [None]:
# Draw one line per column in positions 1-9 of tv_trend, against its 'index' column.
# NOTE(review): with the ten genres in `cols` plus the 'index' column, the [1:10] slice
# stops before the last genre column — confirm that omission is intended.
genres = tv_trend.columns.values.tolist()[1:10]
for gen in genres:
    sns.lineplot(x = 'index', y = gen, data = tv_trend)