## Environment Setup

In [1]:
# Mount Google Drive at /content/drive; later cells read the API key file and
# the data directory from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
! pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8


In [14]:
# Read the OpenAI API key from the key file on Drive (whitespace trimmed) and
# publish it as the OPENAI_KEY environment variable for the cells below.
import os
with open('/content/drive/MyDrive/ml-twotower-model/openai-key', 'r') as file:
    key = os.environ['OPENAI_KEY'] = file.read().strip()


## Data Augmentation

In [135]:
import os
import pandas as pd
import openai
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [4]:
# Root folder for the input and output data files. Set the DATA_DIR environment
# variable to override the default location on Drive.
DATA_DIR = os.getenv(
    "DATA_DIR",
    default=os.path.expanduser("/content/drive/MyDrive/ml-twotower-model/data/"),
)

In [5]:
# Shell out to list what is currently stored under DATA_DIR.
! ls $DATA_DIR

ml-100k			test	      train.parquet	  workflow
ml-100k.zip		test.parquet  valid
movielens_100k.parquet	train	      validation.parquet


In [40]:
# Load the MovieLens 100k parquet file and keep one row per distinct
# (movie_id, title) pair; the original row index is left as-is.
movies = (
    pd.read_parquet(os.path.join(DATA_DIR, 'movielens_100k.parquet'))
    .loc[:, ['movie_id', 'title']]
    .drop_duplicates()
)
movies

Unnamed: 0,movie_id,title
0,242,Kolya (1996)
1,257,Men in Black (1997)
2,111,"Truth About Cats & Dogs, The (1996)"
3,25,"Birdcage, The (1996)"
4,382,"Adventures of Priscilla, Queen of the Desert, ..."
...,...,...
83462,1310,"Walk in the Sun, A (1945)"
92800,1614,"Reluctant Debutante, The (1958)"
94995,1505,Killer: A Journal of Murder (1995)
96951,1533,I Don't Want to Talk About It (De eso no se ha...


In [175]:
def get_movie_synopsis(movie_title, max_tokens):
    """Ask gpt-3.5-turbo for a synopsis and cast list for one movie title.

    The API key is read from the OPENAI_KEY environment variable on every call.

    Args:
        movie_title: Title inserted into the user prompt.
        max_tokens: Used twice: as the word limit quoted in the system prompt
            and as the ``max_tokens`` argument of the completion request.

    Returns:
        The stripped text of the first choice in the response, or None (after
        printing a failure message) when the response carries no choices.
    """
    openai.api_key = os.environ['OPENAI_KEY']

    system_prompt = f'''You are an assistant that helps me with movie synopsis and cast of the movie (return unknown when no cast is available ) in less than {max_tokens} words given a movie title.
                    Please strictly follow the response format: Synopsis:\nCast:'''
    user_prompt = f"Please provide a synopsis and cast (only actor names, skip character names, unknown when not available) delimited by ',' for the movie: {movie_title}\nSynopsis:\nCast:"

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=max_tokens,
    )

    # Guard clause: bail out when the response has no usable choice.
    has_choice = 'choices' in response and len(response['choices']) > 0
    if not has_choice:
        print(f'Failed to get synopsis for the movie: {movie_title}')
        return None

    return response['choices'][0]['message']['content'].strip()


In [107]:
# Smoke test: request a synopsis and cast for a single title and show the raw reply.
get_movie_synopsis('The Matrix', max_tokens=128)

Synopsis: In a dystopian future, computer hacker Neo discovers that the world around him is a simulated reality controlled by machines. With the help of a group of rebels, he becomes the chosen one destined to lead a revolution against the machines.

Cast: Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving


In [138]:
max_tokens = 128
max_workers = 6
FAILED_FORMAT_MSG = 'Failed to collect correct format for movie {}, collected: {}'

# Shared cache: title -> (synopsis, cast). Create it only if it does not exist
# yet, so this cell works on a fresh kernel without discarding results that
# were already collected.
synopsis = globals().get('synopsis', {})


def get_movie_synopsis_async(movie_title, max_tokens=max_tokens):
    """Fetch one movie's synopsis and cast and store them in ``synopsis``.

    Intended as the worker function for ``ThreadPoolExecutor.map``. Titles that
    are already cached are skipped, so the function can be re-run to fill gaps.

    Args:
        movie_title: Title to look up; also the key used in ``synopsis``.
        max_tokens: Forwarded to ``get_movie_synopsis``.

    Returns:
        None. On success ``synopsis[movie_title]`` is set to a
        ``(synopsis_text, cast_text)`` tuple. Nothing is stored when the
        request fails or the reply is not exactly two non-empty lines.
    """
    if movie_title in synopsis:
        return
    response = get_movie_synopsis(movie_title, max_tokens)
    if response is None:
        # get_movie_synopsis returns None (after printing a message) when the
        # API response has no choices; calling .split on it would raise.
        return
    # Expect exactly two non-empty lines: "Synopsis: ..." and "Cast: ...".
    response_ = list(filter(None, response.split('\n')))
    if len(response_) != 2:
        print(FAILED_FORMAT_MSG.format(movie_title, response))
        return
    synopsis[movie_title] = (response_[0].replace('Synopsis:', '').strip(), response_[1].replace('Cast:', '').strip())



# for i in tqdm(range(len(movies))):
#     # print(movies.iloc[i]['title'])
#     title = movies.iloc[i]['title']
#     if title not in synopsis:
#         synopsis[title] = get_movie_synopsis(title)
#     else:
#         print(f'skip {title}')


In [134]:
# Smoke test of the worker on one title; the result is written into the
# module-level `synopsis` dict rather than returned, so the cell shows no output.
get_movie_synopsis_async('The Matrix', max_tokens=128)

In [136]:
# Cache filled by get_movie_synopsis_async: title -> (synopsis, cast).
# NOTE: re-running this cell discards every result collected so far.
synopsis = {}

In [181]:
# Fan the titles out over a small thread pool. Each worker call stores its
# result in `synopsis` and returns None, so `results` only drives the progress bar.
titles = movies['title']
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    pending = executor.map(get_movie_synopsis_async, titles)
    results = [*tqdm(pending, total=len(titles))]

100%|██████████| 1682/1682 [00:05<00:00, 328.05it/s]


In [None]:
# Page through the collected results: print title, synopsis, cast and a
# separator for each entry, pausing five seconds after every 11 entries.
import time
c = 0
for k, v in synopsis.items():
    if c > 10:
        time.sleep(5)
        c = 0
    print(k, v[0], v[1], '-'*50, sep='\n')
    c += 1

In [160]:
# sanity check
def aug_sanity_check():
    """Report cached entries that look incomplete and return their titles.

    An entry of ``synopsis`` is considered faulty when its synopsis text is
    empty, or when its cast text is empty or lists fewer than two
    comma-separated names. One message is printed per detected problem.

    Returns:
        A list of faulty titles in cache order. Each title appears at most
        once, even when both its synopsis and its cast are faulty, so callers
        can safely remove every listed title from ``synopsis`` exactly once.
    """
    faulty_titles = []
    for k, v in synopsis.items():
        missing_synopsis = not v[0]
        faulty_cast = not v[1] or len(list(filter(None, v[1].split(',')))) < 2
        if missing_synopsis:
            print(f'Missing synopsis for movie {k}', v[0])
        if faulty_cast:
            print(f'Cast probably faulty for movie {k}', v[1])
        if missing_synopsis or faulty_cast:
            faulty_titles.append(k)
    return faulty_titles


In [213]:
# Collect the titles whose cached synopsis or cast failed the sanity check.
faulty_titles = aug_sanity_check()

Cast probably faulty for movie Sleepover (1995) Unknown
Cast probably faulty for movie Grand Day Out, A (1992) Unknown
Cast probably faulty for movie Last Klezmer: Leopold Kozlowski, His Life and Music, The (1995) Unknown
Cast probably faulty for movie Koyaanisqatsi (1983) Unknown
Cast probably faulty for movie You So Crazy (1994) Martin Lawrence
Cast probably faulty for movie JLG/JLG - autoportrait de décembre (1994) Unknown
Cast probably faulty for movie Caro Diario (Dear Diary) (1994) Nanni Moretti


In [210]:
# manual expulsion: titles picked by hand for another attempt.
# Only add titles that are not listed yet, so re-running this cell (or naming a
# title the sanity check already flagged) does not create duplicate entries.
faulty_titles.extend([
    title_ for title_ in [
        'Caro Diario (Dear Diary) (1994)'
    ]
    if title_ not in faulty_titles
])

In [211]:
# Drop the faulty entries from the cache so the workers fetch them again.
# De-duplicate first (order preserved) and pop with a default: a title listed
# twice, or one that never made it into `synopsis`, would otherwise raise
# KeyError here; duplicates would also be requested more than once.
faulty_titles = list(dict.fromkeys(faulty_titles))
for t_ in faulty_titles:
    synopsis.pop(t_, None)
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(executor.map(get_movie_synopsis_async, faulty_titles), total=len(faulty_titles)))

100%|██████████| 6/6 [00:05<00:00,  1.02it/s]


In [212]:
# Flatten the cache into a table: one row per title, with the two elements of
# each cached value split into the `synopsis` and `cast` columns.
synopsis_df = pd.DataFrame({
    'title': list(synopsis),
    'synopsis': [pair[0] for pair in synopsis.values()],
    'cast': [pair[1] for pair in synopsis.values()],
})
synopsis_df

Unnamed: 0,title,synopsis,cast
0,"Truth About Cats & Dogs, The (1996)","""The Truth About Cats & Dogs"" is a romantic co...","Uma Thurman, Janeane Garofalo, Ben Chaplin"
1,Groundhog Day (1993),Groundhog Day is a romantic comedy about a wea...,"Bill Murray, Andie MacDowell, Chris Elliott"
2,"Birdcage, The (1996)","In ""Birdcage, The"", a gay cabaret owner and hi...","Robin Williams, Nathan Lane, Gene Hackman, Dia..."
3,Men in Black (1997),"In Men in Black, Will Smith plays a talented N...","Will Smith, Tommy Lee Jones, Linda Fiorentino,..."
4,"Adventures of Priscilla, Queen of the Desert, ...","""Adventures of Priscilla, Queen of the Desert""...","Terence Stamp, Hugo Weaving, Guy Pearce"
...,...,...,...
1659,"Last Klezmer: Leopold Kozlowski, His Life and ...","""Last Klezmer: Leopold Kozlowski, His Life and...",Unknown
1660,Koyaanisqatsi (1983),Koyaanisqatsi is a 1983 experimental documenta...,Unknown
1661,You So Crazy (1994),"""You So Crazy"" is a 1994 stand-up comedy film ...",Martin Lawrence
1662,JLG/JLG - autoportrait de décembre (1994),JLG/JLG - autoportrait de décembre (1994) is a...,Unknown


In [214]:
# Attach synopsis and cast to each movie by title. This is an inner join, so
# movies without a collected synopsis are dropped from the result.
merged_df = movies.merge(synopsis_df, on='title', how='inner')
merged_df

Unnamed: 0,movie_id,title,synopsis,cast
0,242,Kolya (1996),Kolya (1996) is a Czech comedy-drama film dire...,"Zdeněk Svěrák, Andrej Chal"
1,257,Men in Black (1997),"In Men in Black, Will Smith plays a talented N...","Will Smith, Tommy Lee Jones, Linda Fiorentino,..."
2,111,"Truth About Cats & Dogs, The (1996)","""The Truth About Cats & Dogs"" is a romantic co...","Uma Thurman, Janeane Garofalo, Ben Chaplin"
3,25,"Birdcage, The (1996)","In ""Birdcage, The"", a gay cabaret owner and hi...","Robin Williams, Nathan Lane, Gene Hackman, Dia..."
4,382,"Adventures of Priscilla, Queen of the Desert, ...","""Adventures of Priscilla, Queen of the Desert""...","Terence Stamp, Hugo Weaving, Guy Pearce"
...,...,...,...,...
1677,1310,"Walk in the Sun, A (1945)","""A Walk in the Sun"" is a 1945 war drama film t...","Dana Andrews, Richard Conte, George Tyne, John..."
1678,1614,"Reluctant Debutante, The (1958)",The Reluctant Debutante (1958) tells the story...,"Rex Harrison, Kay Kendall, John Saxon, Sandra"
1679,1505,Killer: A Journal of Murder (1995),"""Killer: A Journal of Murder"" is a 1995 crime ...","James Woods, Robert Sean Leonard, Ellen Greene..."
1680,1533,I Don't Want to Talk About It (De eso no se ha...,"""I Don't Want to Talk About It"" (De eso no se ...","Marcello Mastroianni, Luisina Brando, Alejandr..."


In [215]:
# Persist the augmented table next to the input data. The path is built with
# os.path.join, like the read above, so a trailing slash in DATA_DIR does not
# produce a doubled separator.
merged_df.to_parquet(os.path.join(DATA_DIR, 'movielens_100k_synopsis_raw.parquet'), index=False)