### Imports

In [178]:
import redis
import re
import json
import pandas as pd
import numpy as np
import sys
sys.path.append('../library/')
from midStats import cosSimWords
from core import exceptionOutput
from datetime import datetime
import arrow
import user_agent
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import os
from dotenv import load_dotenv
load_dotenv()
from tabulate import tabulate
import ast
import string

# Client for database 8 of the local Redis server; the cells below read every
# key stored there. With decode_responses=True replies are returned as str.
r8 = redis.Redis(
    host='localhost',
    port=6379,
    # `encoding` is the supported name of this option; `charset` is its
    # deprecated alias in redis-py.
    encoding="utf-8",
    decode_responses=True,
    db=8,
)

### Load Redis Data from DB 8

In [179]:
# List every key in the r8 database, then fetch all of their values in a
# single MGET. MGET needs at least one key, so skip the call when the
# database is empty instead of letting the server reject it.
ids = r8.keys()
vals = r8.mget(ids) if ids else []

In [180]:
# Parse each stored value back into a Python object (the later cells treat
# every element of `j` as a dict of franchise name -> movies).
# MGET yields None for a key that disappeared after the key listing was
# taken; skip those rather than letting literal_eval raise on None.
j = [ast.literal_eval(v) for v in vals if v is not None]

### Load in tmdb data

In [None]:
# Read the TMDB details table from disk.
tmdbDf = pd.read_csv('../data/tmdbDetails.csv')

# Lookup from the 'imdb_id' column to the 'title' column.
titleSeries = tmdbDf.set_index('imdb_id')['title']
titleDict = titleSeries.to_dict()

### Load Credits From Redis

In [None]:
# Client for database 6 of the local Redis server. Each key is used below as
# the movie's IMDb id and each value is a JSON document with a 'cast' list.
r6 = redis.Redis(
    host='127.0.0.1',
    port=6379,
    # `encoding` is the supported name of this option; `charset` is its
    # deprecated alias in redis-py.
    encoding="utf-8",
    decode_responses=True,
    db=6,
)

keys = r6.keys('*')
# MGET needs at least one key; an empty database simply yields no credits.
values = r6.mget(keys) if keys else []

# NOTE: despite its name this is a flat *list* of cast rows, one dict per
# cast member, each tagged with the 'imdbId' of the movie it belongs to.
creditsDict = []

for rawCredits, imdbKey in zip(tqdm(values), keys):
    # MGET returns None for a key that vanished after the key listing.
    if rawCredits is None:
        continue

    castRows = json.loads(rawCredits)['cast']
    creditsDict.extend({**castRow, 'imdbId': imdbKey} for castRow in castRows)

### Map

In [None]:
# Group the flat cast rows by movie:
#   imdbCredits[imdbId] = {'title': <lower-cased, punctuation-free title>,
#                          'people': [<cast names in row order>]}
punctuationTable = str.maketrans('', '', string.punctuation)
imdbCredits = {}

for credit in tqdm(creditsDict):
    movieId = credit['imdbId']
    entry = imdbCredits.get(movieId)

    if entry is None:
        # First row seen for this movie: look its title up and normalise it.
        entry = {
            'title': titleDict[movieId].lower().translate(punctuationTable),
            'people': [],
        }
        imdbCredits[movieId] = entry

    entry['people'].append(credit['name'])

# Sanity check: one entry per distinct IMDb id seen in the cast rows.
distinctImdbIds = {row['imdbId'] for row in creditsDict}
assert len(imdbCredits) == len(distinctImdbIds), AssertionError("MISSING KEYS")

### Sort for faster search

In [None]:
# Re-key the credits in alphabetical order of their normalised title.
titleOrdered = sorted(imdbCredits.items(), key=lambda pair: pair[1]['title'])
sortedCredits = dict(titleOrdered)

# Attach the set of title words to every entry (hyphens count as spaces);
# the matching passes use it to pre-filter candidates.
for credit in tqdm(sortedCredits.values()):
    spacedTitle = credit['title'].replace('-', ' ')
    credit['words'] = {word.lower() for word in spacedTitle.split(' ')}
    

### Cosine sim to identify commonalities

In [None]:
# First matching pass.
#
# For every movie of every franchise in `j`, look for the entry of
# `sortedCredits` whose title is most similar and store its IMDb id:
#   allFranchiseIds[franchiseName][movieKey] -> IMDb id, or None if unmatched
# `franchiseTab` collects [franchise name, number of movies] rows for the
# summary table printed at the end.
franchiseTab = []
allFranchiseIds = {}

# Deletes ASCII punctuation; built once instead of once per title and name.
punctTable = str.maketrans('', '', string.punctuation)

for franchiseData in tqdm(j):
    for franchiseName, franchiseMovies in franchiseData.items():
        franchiseTab.append([franchiseName, len(franchiseMovies)])

        # Register the franchise up front so it is kept even when every one
        # of its movies ends up unmatched.
        movieIds = {}
        allFranchiseIds[franchiseName] = movieIds

        for k, innerDict in franchiseMovies.items():
            try:
                # Some records carry a list of titles; keep the first one.
                if isinstance(innerDict['title'], list):
                    innerDict['title'] = innerDict['title'][0]

                # Same normalisation as the credit titles: hyphens become
                # spaces, lower case, punctuation removed.
                title = innerDict['title'].replace('-', ' ').lower().translate(punctTable)
                people = [p.lower().translate(punctTable) for p in innerDict['people']][:5]
                peopleSet = set(people)
                titleWords = set(title.split(' '))

                # Keep only candidates whose title words contain, or are
                # contained in, this movie's title words.
                filteredCredits = {
                    candidateId: candidate
                    for candidateId, candidate in sortedCredits.items()
                    if candidate['words'].issubset(titleWords)
                    or titleWords.issubset(candidate['words'])
                }

                if not filteredCredits:
                    movieIds[k] = None
                    continue

                # Try candidates with the largest cast overlap first: shared
                # names among the first five, divided by the full cast size.
                # Candidate names get the same normalisation as `people`,
                # otherwise names containing punctuation could never match.
                sortedFilteredCredits = sorted(
                    filteredCredits.items(),
                    key=lambda item: len(peopleSet.intersection(
                        name.lower().translate(punctTable)
                        for name in item[1]['people'][:5]
                    )) / len(item[1]['people']),
                    reverse=True,
                )

                maxSims = {}
                for imdbId, c in sortedFilteredCredits:
                    try:
                        # Character 2- and 3-gram similarity of the two titles
                        # (presumably a score in [0, 1] — confirm in midStats).
                        titleSim = cosSimWords(title, c['title'].lower(), analyzer='char', ngram_range=(2,3), justBool=False)
                        maxSims[imdbId] = titleSim
                        # A near-exact title match ends the search early.
                        if titleSim >= .99:
                            break
                    except Exception as e:
                        exceptionOutput(e)

                # Accept the best-scoring candidate only above 0.75.
                bestId = max(maxSims, key=maxSims.get) if maxSims else None
                if bestId is not None and maxSims[bestId] > .75:
                    movieIds[k] = bestId
                else:
                    movieIds[k] = None

            except Exception as e:
                print(exceptionOutput(e))
                # Record the movie as unmatched so it is still counted and
                # retried, instead of silently vanishing from the mapping.
                movieIds[k] = None

print('')
print(tabulate(franchiseTab, headers=['Franchise',"Number of Movies"]))

In [190]:
# with open('../data/franchiseMappings.json', 'w') as f:
#     json.dump(allFranchiseIds, f)

### Do one more pass for null imdbids


In [None]:
# Get all null

# Flatten the per-franchise mapping into (movie key, IMDb id) pairs, then
# split them: matched IMDb ids go to foundIds, the movie keys that are still
# unmatched (value None) go to missingIds.
matchPairs = [
    pair
    for movies in allFranchiseIds.values()
    for pair in movies.items()
]

foundIds = [imdbId for _, imdbId in matchPairs if imdbId is not None]
missingIds = [movieId for movieId, imdbId in matchPairs if imdbId is None]

print(f"NUMBER OF FOUND IDS: {len(foundIds)}")
print(f"NUMBER OF MISSING IDS: {len(missingIds)}")

### Remove found ids from candidates

In [228]:
# Build (IMDb id, "title + first five cast names") text pairs for every
# credit entry that has not been matched to a franchise movie yet.
imdbVectors = []
translator = str.maketrans('', '', string.punctuation)

# Set membership keeps the skip test O(1); scanning the foundIds list for
# every candidate made this loop quadratic.
foundIdSet = set(foundIds)

for imdbId, data in imdbCredits.items():
    if imdbId in foundIdSet:
        continue

    # Lower-case and strip punctuation from both parts of the text.
    titleStripped = data['title'].lower().translate(translator)
    peopleStripped = ' '.join(data['people'][:5]).lower().translate(translator)

    fullStr = titleStripped + ' ' + peopleStripped

    imdbVectors.append((imdbId, fullStr))


### Formulate Missing Dict

In [263]:
# Reduce `j` to the movies that are still unmatched. The result has the same
# shape as `j`: a list of {franchise name: {movie key: movie data}} dicts.
jFiltered = []

# O(1) membership tests instead of scanning the missingIds list per movie.
missingIdSet = set(missingIds)

for franchiseData in j:
    missingByFranchise = {}

    for franchiseName, franchiseMovies in franchiseData.items():
        stillMissing = {
            movieKey: movieData
            for movieKey, movieData in franchiseMovies.items()
            if movieKey in missingIdSet
        }

        # Decide per franchise. Previously only the *last* franchise of each
        # entry was inspected, after the loop, which also failed on an empty
        # entry because the loop variable was never bound.
        if stillMissing:
            missingByFranchise[franchiseName] = stillMissing

    if missingByFranchise:
        jFiltered.append(missingByFranchise)

In [None]:
# Second, more lenient matching pass over the movies left unmatched by the
# first pass (`jFiltered`). Differences visible here: candidates only need
# to share title words (two, else one), titles are compared on character
# 2- to 6-grams, and the best-scoring candidate is accepted without a
# minimum similarity.
#   newFranchiseIds[franchiseName][movieKey] -> IMDb id, or None
# NOTE(review): candidates are not restricted to IMDb ids that are still
# unassigned, so an id matched in the first pass can be picked again here —
# confirm whether that is intended.
franchiseTab = []
newFranchiseIds = {}

# Deletes ASCII punctuation; built once instead of once per title and name.
punctTable = str.maketrans('', '', string.punctuation)

for franchiseData in tqdm(jFiltered):
    for franchiseName, franchiseMovies in franchiseData.items():
        franchiseTab.append([franchiseName, len(franchiseMovies)])

        movieIds = {}

        for k, innerDict in franchiseMovies.items():
            try:
                # Some records carry a list of titles; keep the first one.
                if isinstance(innerDict['title'], list):
                    innerDict['title'] = innerDict['title'][0]

                # Hyphens become spaces, lower case, punctuation removed.
                title = innerDict['title'].replace('-', ' ').lower().translate(punctTable)
                people = [p.lower().translate(punctTable) for p in innerDict['people']][:5]
                peopleSet = set(people)
                titleWords = set(title.split(' '))

                # Keep the credits sharing at least `simThresh` title words
                # with the current movie.
                def filterCredits(sortedCredits: dict, simThresh: int = 1):
                    return {
                        candidateId: candidate
                        for candidateId, candidate in sortedCredits.items()
                        if len(candidate['words'].intersection(titleWords)) >= simThresh
                    }

                # Prefer candidates sharing two words; fall back to one.
                filteredCredits = filterCredits(sortedCredits, 2)
                if not filteredCredits:
                    filteredCredits = filterCredits(sortedCredits, 1)

                # Nothing to compare against: leave this movie unrecorded.
                if not filteredCredits:
                    continue

                # Try candidates with the largest cast overlap first: shared
                # names among the first five, divided by the full cast size.
                # Candidate names get the same normalisation as `people`,
                # otherwise names containing punctuation could never match.
                sortedFilteredCredits = sorted(
                    filteredCredits.items(),
                    key=lambda item: len(peopleSet.intersection(
                        name.lower().translate(punctTable)
                        for name in item[1]['people'][:5]
                    )) / len(item[1]['people']),
                    reverse=True,
                )

                maxSims = {}
                for imdbId, c in sortedFilteredCredits:
                    try:
                        # `title` is already normalised above; normalise the
                        # candidate title the same way before comparing.
                        titleSim = cosSimWords(title, c['title'].lower().translate(punctTable), analyzer='char', ngram_range=(2,6), justBool=False)

                        maxSims[imdbId] = titleSim
                        # A near-exact title match ends the search early.
                        if titleSim >= .99:
                            break
                    except Exception as e:
                        exceptionOutput(e)

                # Take the best-scoring candidate, whatever its score.
                movieIds[k] = max(maxSims, key=maxSims.get) if maxSims else None

            except Exception as e:
                print(exceptionOutput(e))

            # Publish the franchise once a movie got past the candidate
            # filter (repeated assignment of the same dict is harmless).
            newFranchiseIds[franchiseName] = movieIds

print('')
print(tabulate(franchiseTab, headers=['Franchise',"Number of Movies"]))

In [294]:
def combine_nested_dicts(dict1, dict2):
    """Merge two dicts of dicts, one level deep.

    Keys present in only one input keep that input's value. For keys present
    in both, the two inner dicts are merged into a new dict and entries from
    ``dict2`` override entries from ``dict1``.

    The result lists the keys of ``dict1`` first, in their original order,
    followed by the keys found only in ``dict2``. Iterating a set of keys
    instead would make the order (and any JSON dumped from the result) vary
    from one interpreter run to the next.

    Args:
        dict1: Mapping of key -> inner dict.
        dict2: Mapping of key -> inner dict; wins on conflicting inner keys.

    Returns:
        A new dict; neither input is modified.
    """
    # Shallow copy: values that need no merging are reused as they are.
    combined_dict = dict(dict1)

    for key, inner in dict2.items():
        if key in dict1:
            combined_dict[key] = {**dict1[key], **inner}
        else:
            combined_dict[key] = inner

    return combined_dict

In [297]:
# Overlay the second-pass matches on the first-pass mapping; for a movie
# present in both, the second-pass value wins.
completeFranchiseIds = combine_nested_dicts(allFranchiseIds, newFranchiseIds)

# Serialise before opening the file: opening in 'w' mode truncates it, so a
# serialisation error must not be able to leave a partial file behind.
franchiseJson = json.dumps(completeFranchiseIds)

with open('../data/franchiseMappings2.json', 'w', encoding='utf-8') as f:
    f.write(franchiseJson)

### Final Heuristic Pass