### Imports

In [1]:
import redis
import re
import json
import pandas as pd
import numpy as np
import sys
sys.path.append('../library/')
from core import extractBetween, extractElementsInOrder, exceptionOutput
from datetime import datetime
import nest_asyncio
import asyncio
import arrow
from tqdm import tqdm

r = redis.Redis(
    host='localhost',
    port=6379,
    charset="utf-8",
    decode_responses=True,
    db = 3
)

### Load Redis Data from DB 2

In [2]:
dates = r.keys()
vals = r.mget(dates)

### Form Basic Dfs

In [3]:
valsJson = [el[1:] for el in [json.loads(e) for e in vals]]

columns = ['rank','previous','title','distributor','weekGross','pctLastWeek','numberOfTheaters','numberOfTheatersChange','perTheaterAvg','totalGross','weeksInRelease']

allWeeks = []
for i, week in enumerate(tqdm(valsJson)):
    for movie in week:
        if all([v== '' for v in movie]) | (movie[0] == ''):
            continue

        movieDict = dict(zip(columns, movie))
        movieDict['date'] = dates[i]
        allWeeks.append(movieDict)

100%|██████████| 2330/2330 [00:00<00:00, 13838.82it/s]


In [4]:
df = pd.DataFrame.from_dict(allWeeks)

df['dateDt'] = pd.to_datetime(df['date'])


for col in ['weekGross','pctLastWeek','numberOfTheaters','perTheaterAvg','totalGross']:
    df[col] = df[col].str.replace('$','', regex=False)\
                                    .str.replace(',','',regex=False)\
                                    .str.replace('%','',regex=False)\
                                    .str.replace('<','',regex=False)\
                                    .str.replace('(v)', '', regex=False)\
                                    .replace('-', np.nan)\
                                    .replace('', np.nan)\
                                    .replace('n/c', np.nan)\
                                    .astype(float)

In [5]:
# Rank seems to be unreliable, we'll just compute based on weekGross
df.sort_values(by=['dateDt', 'weekGross'], ascending=[False, False], inplace=True)

df['rank'] = df.groupby('date').cumcount() + 1

### We now need to map title to imdbId
- To do this, we can probably use a combination of release date and triplet title search

In [6]:
tmdbDf = pd.read_csv('../data/tmdbDetails.csv')

  tmdbDf = pd.read_csv('../data/tmdbDetails.csv')


In [7]:
relDf = tmdbDf[['title', 'release_date','imdb_id']]
relDf.drop_duplicates(subset='imdb_id', keep='last', inplace=True)
relDf.set_index('imdb_id', inplace=True)
candidateDict = relDf.to_dict('index')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relDf.drop_duplicates(subset='imdb_id', keep='last', inplace=True)


In [8]:
async def tripletSearchWithYear(targetList: list, candidates: dict) -> str:
    """ Triplet search
    
        - Uses a chunk based search algorithm to search for a target against a list of candidates
        - This includes a year for final filtering
    """
    results = []
    for target in tqdm(targetList):
        scores = {}
        
        def getChunks(str, chunkSize=3):
            chunks = [] 
            for i in range(len(str)):
                chunk = str[i:i+chunkSize]
                if len(chunk)==chunkSize:
                    chunks.append(chunk)
            
            return chunks

        targetChunks = getChunks(target['title'].lower())

        for imdbId, candidate in candidates.items():
            candidateChunks = getChunks(candidate['title'].lower())
            
            intersection1 = [chunk for chunk in candidateChunks if chunk in targetChunks]
            intersection2 = [chunk for chunk in targetChunks if chunk in candidateChunks]
            
            if ((len(intersection1) > 0) & (len(intersection2))):
                intersectionScore = (len(intersection1) / len(candidateChunks)) + (len(intersection2) / len(targetChunks))
            else:
                intersectionScore = 0

            if intersectionScore > 1: # Max score is 2
                scores[imdbId] = intersectionScore

        targetRelease = arrow.get(target['dateDt'])
        targetId = target['uuid']

        maxSim = max(scores.values())
        
        maxIds = [i for i in scores.keys() if scores[i] == maxSim]
        
        if len(maxIds) == 1:
            bestMatch = maxIds[0]
        
        elif len(maxIds) > 1: # This is for disambiguation of similar titles
            deltas = {i: abs((targetRelease - arrow.get(candidates[i]['release_date'])).total_seconds()) for i in maxIds if candidates[i]['release_date'] == candidates[i]['release_date']}
            minDelta = min(deltas, key = deltas.get)
            
            bestMatch = minDelta
        else:
            bestMatch = 'NO CANDIDATE WAS FOUND'
        
        results.append({targetId: bestMatch})

    return  results

In [9]:
from uuid import uuid1
df['uuid'] = df.apply(lambda x: str(uuid1()), axis=1)
targetDict = df[['uuid','title','dateDt']].to_dict('records')

In [12]:
def tripletSearchWithYear(target, j) -> str:
    """ Triplet search
    
        - Uses a chunk based search algorithm to search for a target against a list of candidates
        - This includes a year for final filtering
    """

    scores = {}
    
    def getChunks(str, chunkSize=3):
        chunks = [] 
        for i in range(len(str)):
            chunk = str[i:i+chunkSize]
            if len(chunk)==chunkSize:
                chunks.append(chunk)
        
        return chunks

    targetChunks = getChunks(target['title'].lower())

    for imdbId, candidate in candidateDict.items():
        candidateChunks = getChunks(candidate['title'].lower())
        
        intersection1 = [chunk for chunk in candidateChunks if chunk in targetChunks]
        intersection2 = [chunk for chunk in targetChunks if chunk in candidateChunks]
        
        if ((len(intersection1) > 0) & (len(intersection2))):
            intersectionScore = (len(intersection1) / len(candidateChunks)) + (len(intersection2) / len(targetChunks))
        else:
            intersectionScore = 0

        if intersectionScore > 1: # Max score is 2
            scores[imdbId] = intersectionScore

    targetRelease = arrow.get(target['dateDt'])
    targetId = target['uuid']

    maxSim = max(scores.values())
    
    maxIds = [i for i in scores.keys() if scores[i] == maxSim]
    
    if len(maxIds) == 1:
        bestMatch = maxIds[0]
    
    elif len(maxIds) > 1: # This is for disambiguation of similar titles
        deltas = {i: abs((targetRelease - arrow.get(candidateDict[i]['release_date'])).total_seconds()) for i in maxIds if candidateDict[i]['release_date'] == candidateDict[i]['release_date']}
        minDelta = min(deltas, key = deltas.get)
        
        bestMatch = minDelta
    else:
        bestMatch = 'NO CANDIDATE WAS FOUND'

    print(f'we dun found oneeeee yeeeaassss: {j}/{len(targetDict)}')
    return  {targetId: bestMatch}

In [13]:
import concurrent.futures as cf
from time import time


t1 = time()
with cf.ThreadPoolExecutor() as executor:
    futures = [executor.submit(tripletSearchWithYear, target, j) for j, target in enumerate(targetDict[:100])]
    results = [future.result() for future in cf.as_completed(futures)]
    
t2 = time()

print(t2 - t1)

we dun found oneeeee yeeeaassss: 0.0001395543564218263
we dun found oneeeee yeeeaassss: 0.00015284524750961928
we dun found oneeeee yeeeaassss: 7.974534652675788e-05
we dun found oneeeee yeeeaassss: 0.00010632712870234385
we dun found oneeeee yeeeaassss: 0.00011961801979013683
we dun found oneeeee yeeeaassss: 0.0001329089108779298
we dun found oneeeee yeeeaassss: 0.00016613613859741227
we dun found oneeeee yeeeaassss: 0/150479
we dun found oneeeee yeeeaassss: 6/150479
we dun found oneeeee yeeeaassss: 3/150479
we dun found oneeeee yeeeaassss: 11/150479
we dun found oneeeee yeeeaassss: 1/150479
we dun found oneeeee yeeeaassss: 0.0001461998019657228
we dun found oneeeee yeeeaassss: 9.303623761455087e-05
we dun found oneeeee yeeeaassss: 4/150479
we dun found oneeeee yeeeaassss: 0.00017942702968520524
we dun found oneeeee yeeeaassss: 0.00015949069305351576
we dun found oneeeee yeeeaassss: 5/150479
we dun found oneeeee yeeeaassss: 0.0002060088118607912
we dun found oneeeee yeeeaassss: 7/1504

ValueError: max() iterable argument is empty

### Save to CSV

In [None]:
mergedDf.to_csv('../data/allBoxOffice.csv')