### Imports

In [None]:
import redis
import re
import json
import pandas as pd
import os
import numpy as np
import sys
sys.path.append('../library/')
from core import extractBetween, extractElementsInOrder, exceptionOutput, createSlidingWindows
from midStats import 
from datetime import datetime
from uuid import uuid1
import nest_asyncio
import asyncio
from datetime import timedelta
import arrow
from multiprocessing import Pool
from tqdm.notebook import tqdm   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed

### Load Redis Data from DB 3

In [None]:
# Connection to the Redis database that holds the weekly box-office tables.
# `encoding` is used instead of the deprecated `charset` keyword of redis-py.
r = redis.Redis(
    host='localhost',
    port=6379,
    encoding="utf-8",
    decode_responses=True,
    db=3,
)

# TARGET DF
# Every key in the DB is used as the date label of one weekly table.
dates = r.keys()
# MGET needs at least one key, so skip the round trip when the DB is empty.
vals = r.mget(dates) if dates else []

# Each value is a JSON list of rows; the first row is dropped
# (presumably a header row - confirm against the scraper that fills this DB).
valsJson = [json.loads(raw)[1:] for raw in vals]

columns = ['rank','previous','title','distributor','weekGross','pctLastWeek','numberOfTheaters','numberOfTheatersChange','perTheaterAvg','totalGross','weeksInRelease']

allWeeks = []
for date, week in tqdm(zip(dates, valsJson), total=len(valsJson)):
    for movie in week:
        # Skip rows that are empty or have no rank. A fully blank row also has
        # a blank first cell, so one short-circuiting test covers both cases
        # and no longer indexes into an empty row.
        if not movie or movie[0] == '':
            continue

        movieDict = dict(zip(columns, movie))
        movieDict['date'] = date
        allWeeks.append(movieDict)

# Passing the column list keeps the expected columns even when no rows were loaded.
df = pd.DataFrame(allWeeks, columns=columns + ['date'])


In [None]:
# Parse the week label into a UTC-aware timestamp column.
df['dateDt'] = pd.to_datetime(df['date'], utc=True)

# Columns that arrive as formatted strings and must become floats.
numericCols = ['weekGross','pctLastWeek','numberOfTheaters','perTheaterAvg','totalGross']
# Decoration stripped from every value, in this order.
stripTokens = ('$', ',', '%', '<', '(v)')
# Whole-cell placeholders that stand for "no value".
missingMarkers = ('-', '', 'n/c')

for col in numericCols:
    cleaned = df[col]
    for token in stripTokens:
        cleaned = cleaned.str.replace(token, '', regex=False)
    for marker in missingMarkers:
        cleaned = cleaned.replace(marker, np.nan)
    df[col] = cleaned.astype(float)

# Newest week first, highest weekly gross first within a week.
df.sort_values(by=['dateDt', 'weekGross'], ascending=[False, False], inplace=True)

# This step is required to infer the release date: with the newest-first
# ordering, keep='last' retains the earliest row of each (title, distributor).
rowsBeforeDedup = len(df)
df.drop_duplicates(subset=['title','distributor'], keep='last', inplace=True)
print(f"WE DROPPED: {rowsBeforeDedup-len(df)} DUPLICATES")

# One freshly generated identifier per remaining row.
df['uuid'] = df.apply(lambda _row: str(uuid1()), axis=1)
targetColumns = ['uuid','title','dateDt']
targetDict = df[targetColumns].to_dict('records')

# CANDIDATE DF
tmdbDf = pd.read_csv('../data/tmdbDetails.csv')
# .copy() gives relDf its own data, so the column assignment below is not a
# write on a slice of tmdbDf (avoids pandas' SettingWithCopyWarning).
relDf = tmdbDf[['title', 'release_date', 'imdb_id']].copy()
relDf['release_date'] = pd.to_datetime(relDf['release_date'], utc=True)
# Keep the last row per imdb_id, then drop rows without a release date.
relDf = relDf.drop_duplicates(subset='imdb_id', keep='last')
relDf = relDf.dropna(subset=['release_date'])
candidateDict = relDf.to_dict('records')

# Set up a process pool with one worker per reported CPU core.
# NOTE(review): `pool` is not referenced by any later cell visible in this
# notebook - confirm it is still needed, since its workers stay alive until
# the pool is closed.
cores = os.cpu_count()
pool = Pool(cores)

### Create Windows & Vectorize 

In [None]:
# Evaluate "now" once so every candidate is compared against the same instant,
# and as a pandas UTC timestamp so both sides of the comparison share a type.
releaseCutoff = pd.Timestamp.now(tz='UTC')

# Normalise candidates: lower-cased title, release date, IMDb id.
# Rows whose title is not a string (e.g. NaN from the CSV) cannot be matched
# on title and are skipped instead of crashing on .lower().
candidateDict = [
    {'title': e['title'].lower(), 'dateDt': e['release_date'], 'imdbId': e['imdb_id']}
    for e in candidateDict
    if isinstance(e['title'], str) and e['release_date'] <= releaseCutoff
]
print(f"THERE ARE: {len(candidateDict)} CANDIDATES TO CHOOSE FROM")

allTargets = [{'title':e['title'].lower(),'dateDt':e['dateDt'], 'uuid':e['uuid']} for e in targetDict]

# Sort the lists according to date in reverse order
candidateDict = sorted(candidateDict, key=lambda x: x['dateDt'], reverse = True)
allTargetsRaw = sorted(allTargets, key=lambda x: x['dateDt'], reverse = True)

# Drop exact duplicate targets while keeping order. Two target dicts are equal
# exactly when their (title, dateDt, uuid) values are equal, so a set of those
# tuples replaces the quadratic `target not in allTargets` list scan.
allTargets = []
seenTargets = set()

for target in allTargetsRaw:
    targetKey = (target['title'], target['dateDt'], target['uuid'])
    if targetKey not in seenTargets:
        seenTargets.add(targetKey)
        allTargets.append(target)

In [None]:
# Order candidates newest-first by release date (stable sort, done in place).
candidateDict.sort(key=lambda cand: cand['dateDt'], reverse=True)

### Find perfect matches

In [None]:
# Get years for candidate dict and target dict
for t in tqdm(allTargets):
    t['year'] = t['dateDt'].year

for c in tqdm(candidateDict):
    c['year'] = c['dateDt'].year

notFound = []

idMappings = {}

for t in tqdm(allTargets):
    foundIds = {c['imdbId'] for c in candidateDict if c['title'] == t['title'] and c['year'] == t['year']}
    if len(foundIds) != 1:
        notFound.append(t)

    else:
        idMappings[t['uuid']] = {
            'imdbId': foundIds,
            'title': t['title']
        }

for i, d in idMappings.items():
    d['imdbId'] = list(d['imdbId'])[0]

In [None]:
# Persist the exact title/year matches as pretty-printed JSON.
perfectMatchesJson = json.dumps(idMappings, indent=4)
with open('../data/theNumbersData/perfectMatches.json', 'w') as outFile:
    outFile.write(perfectMatchesJson)

### Best guess at imperfect matches

In [None]:
# IMDb ids already claimed by a perfect match.
foundIds = {mapping['imdbId'] for mapping in tqdm(idMappings.values())}
# Only candidates that are still unclaimed take part in the fuzzy search.
candidatesFiltered = list(filter(lambda cand: cand['imdbId'] not in foundIds, candidateDict))

print(f"NEW LENGTH OF CANDIDATES IS: {len(candidatesFiltered)}")


# Split the unmatched targets into windows of `windowSize` entries.
windowSize = 50
targetWindows = createSlidingWindows(notFound, windowSize)
print(f"WE ARE PROCESSING: {len(targetWindows)} WINDOWS OF {windowSize}")
print(f"TOTAL SEARCH SPACE IS {len(notFound) * len(candidatesFiltered)}")

In [None]:
allMatches = []

def find_best_match_parallel(targetDict, candidateDict, vectors_L2=None, vectorizer=None):
    """Pick the best candidate for every target in one window.

    Candidates are first restricted to those dated within 90 days of the
    window's earliest/latest target. Titles are compared with character-trigram
    TF-IDF cosine similarity, and each pair is scored as
    ``400 * title_similarity - |days between dates|``.

    Parameters:
        targetDict: list of dicts with at least 'title' and 'dateDt'.
        candidateDict: list of dicts with at least 'title' and 'dateDt'.
            Candidates are scanned in the order given, and the scan for a
            target stops at the first candidate that is good enough.
        vectors_L2, vectorizer: ignored. A vectorizer is always fitted on the
            date-filtered candidates inside this function. Both are kept
            (now optional) so existing calls keep working.

    Returns:
        A list of ``(target, best_candidate, score)`` tuples, one per target.
        The list is empty when there are no targets or when no candidate falls
        inside the date range.
    """
    # Nothing to match; the min/max date lookup below needs at least one target.
    if not targetDict:
        return []

    targetDict = sorted(targetDict, key=lambda x: x['dateDt'], reverse=True)
    releaseRange = 90
    # Sorted newest-first: the last entry is the earliest, the first the latest.
    minTarget = targetDict[-1]['dateDt'] - timedelta(days=releaseRange)
    maxTarget = targetDict[0]['dateDt'] + timedelta(days=releaseRange)

    candidateDictFiltered = [c for c in candidateDict if minTarget <= c['dateDt'] <= maxTarget]

    print(f"Number of candidates within our release range: {len(candidateDictFiltered)}")

    matches = []
    # With no candidate in range the vectorizer cannot be fitted, so the window
    # yields no matches rather than raising.
    if candidateDictFiltered:
        # Vectorize titles once
        titles_L2 = [item['title'] for item in candidateDictFiltered]
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3,3)).fit(titles_L2)
        vectors_L2 = vectorizer.transform(titles_L2)

        titles_L1 = [item['title'] for item in targetDict]
        vectors_L1 = vectorizer.transform(titles_L1)

        for targetIdx, target in tqdm(enumerate(targetDict), total=len(targetDict)):
            best_match = None
            highest_score = -1000000

            # One vectorised call gives this target's similarity to every
            # candidate, instead of one cosine_similarity call per pair.
            title_scores = cosine_similarity(vectors_L1[targetIdx], vectors_L2)[0]

            for candidate, title_score in zip(candidateDictFiltered, title_scores):
                # Near-identical title: accept immediately with a fixed score.
                if title_score > .95:
                    best_match = candidate
                    highest_score = 500
                    break

                # abs() on the timedelta itself, so the day distance is the
                # same whichever of the two dates is earlier.
                date_score = abs(target['dateDt'] - candidate['dateDt']).days
                total_score = (400 * title_score) - date_score

                if total_score > highest_score:
                    highest_score = total_score
                    best_match = candidate

                # Good enough: stop scanning the remaining candidates.
                if total_score > 320:
                    break

            matches.append((target, best_match, highest_score))


    print(f"We found {len(matches)} good matches!")
    print(f'\n')

    return matches

In [None]:
# Number of result files already on disk; that many leading target windows
# are skipped below.
existingMatchFiles = [name for name in os.listdir('../data/theNumbersData/') if 'allMatches' in name]
startingPoint = len(existingMatchFiles)

# Group the remaining windows into batches of 8 with no overlap.
remainingWindows = targetWindows[startingPoint:]
windowedWindows = createSlidingWindows(remainingWindows, windowSize = 8, overlap = 0)

print(f"WE HAVE {len(windowedWindows)} WINDOWED WINDOWS TO GO!")

In [None]:
# Continue numbering after the highest batch index already on disk
# (files are named allMatches_<batch>_<window>.json). os.listdir returns names
# in arbitrary order, so take the maximum instead of the first entry; with no
# files yet, numbering starts at 0.
savedBatchIds = [
    int(found.group(1))
    for found in (re.search(r'allMatches_(\d+)_', name) for name in os.listdir('../data/theNumbersData/'))
    if found
]
startingSaveValue = max(savedBatchIds, default=-1) + 1
print(f"STARTING SAVE VALUE: {startingSaveValue}")


# Custom serialization function
def custom_serializer(obj):
    """Turn timestamp values into ISO-8601 strings for json.dump.

    Raises TypeError for any other type, as json's `default` hook requires.
    """
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()
    if isinstance(obj, np.datetime64):
        # np.datetime64 has no isoformat(); convert through pandas first.
        return pd.Timestamp(obj).isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")


for i, targetWindow in enumerate(windowedWindows, start=startingSaveValue):
    for j, targetWindow2 in enumerate(targetWindow):
        # find_best_match_parallel builds its own vectorizer and vectors, so
        # None is passed for those two parameters (no module-level
        # `vectors_L2` / `vectorizer` is defined in this notebook).
        windowMatches = find_best_match_parallel(targetDict = targetWindow2, candidateDict = candidatesFiltered, vectors_L2 = None, vectorizer = None)

        # Flatten the (target, candidate, score) tuples into one flat list.
        # The loading step regroups the saved list in threes, so the on-disk
        # layout is kept flat.
        allMatches = [field for match in windowMatches for field in match]

        with open(f'../data/theNumbersData/allMatches_{i}_{j}.json', 'w') as f:
            json.dump(allMatches, f, default=custom_serializer, indent=4)

In [None]:
# Read every saved result file back into one flat list.
allImperfect = []

matchDir = '../data/theNumbersData/'
# Sorted so the load order is the same on every run (os.listdir order is arbitrary).
for file in sorted(name for name in os.listdir(matchDir) if 'allMatches' in name):
    with open(os.path.join(matchDir, file)) as f:
        allImperfect += json.load(f)

# The files hold a flat sequence target, candidate, score, target, candidate,
# score, ... Rebuild the triples with explicit non-overlapping chunks of 3 so
# the grouping does not depend on createSlidingWindows' default overlap,
# which is not visible in this notebook.
recordSize = 3
if len(allImperfect) % recordSize:
    raise ValueError(
        f"Expected a multiple of {recordSize} values in the allMatches files, got {len(allImperfect)}"
    )
allImperfectGrouped = [
    allImperfect[start:start + recordSize]
    for start in range(0, len(allImperfect), recordSize)
]

# Scores above the threshold are accepted; the rest get a second look later.
# Both lists are ordered best score first.
scoreThreshold = 245
goodMatches = sorted([e for e in allImperfectGrouped if e[-1] > scoreThreshold], key = lambda x: x[-1], reverse=True)
badMatches = sorted([e for e in allImperfectGrouped if e[-1] <= scoreThreshold], key = lambda x: x[-1], reverse=True)

print(f"WE HAVE: {len(goodMatches)} GOOD MATCHES AND WE HAVE: {len(badMatches)} BAD MATCHES")

### Final pass on bad matches before accepting defeat

In [None]:

def _normalizeTitle(title):
    """Make '&' / 'and' and curly / straight apostrophes compare equal."""
    return title.replace('&','and').replace('’',"'")


# Only bad matches with a non-zero score are re-checked. The same filtered
# list is used for scoring and for the zip below, so each keep-flag lines up
# with the match it was computed for.
scoredBadMatches = [t for t in badMatches if t[-1]]

# One cosSimWords result per re-checked match; a truthy result keeps the match.
medMatches = [cosSimWords(_normalizeTitle(t[0]['title']), _normalizeTitle(t[1]['title'])) for t in scoredBadMatches]
testMatches = [item for item, keep in zip(scoredBadMatches, medMatches) if keep]

goodMatches += testMatches

### Combine imperfect and perfect matches and then map to ids

In [114]:
# Read the exact title/year matches back from disk.
with open('../data/theNumbersData/perfectMatches.json') as inFile:
    perfectMatchesText = inFile.read()
perfectMatches = json.loads(perfectMatchesText)

In [119]:
# uuid of the target -> IMDb id of its matched candidate plus the target title.
# Each entry of goodMatches is indexed as [target, candidate, ...]; when a uuid
# occurs more than once, the last entry wins.
goodMatchesMapped = {
    match[0]['uuid']: {
        'imdbId': match[1]['imdbId'],
        'title': match[0]['title'],
    }
    for match in goodMatches
}

In [121]:
# Merge both mappings; a perfect match overrides a fuzzy one for the same uuid.
allMatchesMapped = dict(goodMatchesMapped)
allMatchesMapped.update(perfectMatches)

### Save to redis

In [125]:
# Connection to the Redis database that receives the uuid -> match mapping.
# `encoding` is used instead of the deprecated `charset` keyword of redis-py.
r4 = redis.Redis(
    host='localhost',
    port=6379,
    encoding="utf-8",
    decode_responses=True,
    db=4,
)

# Store one JSON-encoded {'imdbId', 'title'} record per target uuid.
for uuid, data in allMatchesMapped.items():
    r4.set(uuid, json.dumps(data))

### Save to csv

In [129]:
# Reduce every match record to its IMDb id, keyed by target uuid.
allIdsMapped = dict(
    (targetUuid, record['imdbId'])
    for targetUuid, record in allMatchesMapped.items()
)

# Look each row's uuid up in the mapping and store the result as a new column.
uuidColumn = df['uuid']
df['imdbId'] = uuidColumn.map(allIdsMapped)

In [131]:
# Write the box-office table, including the new imdbId column, to CSV.
df.to_csv(path_or_buf='../data/numbersBoxOffice.csv')