In [2]:
import pandas as pd
import os
from dotenv import load_dotenv
import json
import openai
from openai import OpenAI
import dask.dataframe as dd
from langchain_community.graphs import Neo4jGraph
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv()

True

In [4]:
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI()

In [5]:
crew_df = pd.read_csv('clean_data/crew_clean.csv')
crew_df.head()

Unnamed: 0,tconst,directors,writers
0,tt0000012,"nm0525908,nm0525910",\N
1,tt0000417,nm0617588,"nm0617588,nm0894523,nm0920229"
2,tt0000439,nm0692105,"nm1145809,nm0692105"
3,tt0006864,nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640..."
4,tt0009968,nm0000428,"nm0121885,nm0000428"


In [6]:
movie_ratings_df = pd.read_csv('clean_data/movie_ratings_clean.csv')
movie_ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000012,7.4,12824
1,tt0000417,8.1,55524
2,tt0000439,7.3,20924
3,tt0006864,7.7,16683
4,tt0009968,7.2,11035


In [9]:
movie_title_df = pd.read_csv('clean_data/movie_title_clean.csv')
movie_title_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000012,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,0,1896.0,\N,1.0,"Documentary,Short"
1,tt0000417,short,A Trip to the Moon,Le voyage dans la lune,0,1902.0,\N,13.0,"Action,Adventure,Comedy"
2,tt0000439,short,The Great Train Robbery,The Great Train Robbery,0,1903.0,\N,11.0,"Action,Adventure,Crime"
3,tt0006864,movie,Intolerance,Intolerance: Love's Struggle Throughout the Ages,0,1916.0,\N,163.0,"Drama,History"
4,tt0009968,movie,Broken Blossoms,Broken Blossoms or The Yellow Man and the Girl,0,1919.0,\N,90.0,"Drama,Romance"


In [7]:
names_df = pd.read_csv('clean_data/names_clean.csv')
names_df.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467"


In [8]:
principals_df = pd.read_csv('clean_data/principals_clean.csv')
principals_df.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000012,1,nm2880396,self,\N,"[""Self""]"
1,tt0000012,2,nm9735580,self,\N,"[""Self""]"
2,tt0000012,3,nm0525900,self,\N,"[""Self""]"
3,tt0000012,4,nm9735581,self,\N,"[""Self""]"
4,tt0000012,5,nm9735579,self,\N,"[""Self""]"


In [10]:
def fetch_description(movie_name):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-1106",
            max_tokens = 70,
            messages=[
                {"role": "system", "content": "You are a Movie Expert that knows a lot about all movies. Provide a brief description of the movies, including its actors, genres and plots. keep it limited to 70 tokens"},
                {"role": "user", "content": f"Describe {movie_name}."}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return "Description not available"
    
def generate_embeddings(df):
    # Apply the model's encode method to the entire 'About' column for the partition
    model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
    embeddings = model.encode(df['About'].tolist())
    
    # Store embeddings in a new column. Since embeddings are arrays, you might store them as lists for compatibility
    df['Encoded_About'] = list(embeddings)
    
    return df

def encode_and_format(row):
    # Generate the embedding
    model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
    embedding = model.encode(row['About'])

    # Convert the embedding array to a comma-separated string
    embedding_str = ",".join(map(lambda x: f"{x:.8f}", embedding))

    # Return the embedding string and its length (number of dimensions)
    return embedding_str

In [11]:
model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')

In [13]:
ddf = dd.from_pandas(movie_desc_df, npartitions=50)

In [12]:
ddf['About'] = ddf.map_partitions(lambda df: df['primaryTitle'].apply(fetch_description))

# Compute the result to get back a pandas DataFrame
# result_df = ddf.compute()

# Display the updated DataFrame
result_df.head()
# result_df.to_csv('clean_data/movie_description_clean.csv')

In [11]:
movie_desc_df = pd.read_csv('clean_data/movie_description_clean_embeddings.csv')
movie_desc_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,About,Embeddings
0,0,tt0000012,The Arrival of a Train,"""The Arrival of a Train"" is a 1896 silent shor...","[-0.6185780167579651, 0.3374672532081604, 0.87..."
1,1,tt0000417,A Trip to the Moon,"""A Trip to the Moon"" (1902) is a classic silen...","[-0.09322287887334824, 0.7676082849502563, 0.3..."
2,2,tt0000439,The Great Train Robbery,"""The Great Train Robbery"" is a 1978 British cr...","[0.700336217880249, 0.25397372245788574, 0.077..."
3,3,tt0006864,Intolerance,"""Intolerance"" (1916) is a silent epic directed...","[-0.0787254050374031, 0.4562748074531555, 0.61..."
4,4,tt0009968,Broken Blossoms,"""Broken Blossoms"" is a 1919 silent film direct...","[-1.0412063598632812, -0.3301662802696228, 0.7..."


In [None]:
dask_df = dd.from_pandas(movie_desc_df, npartitions=50)  
# Apply the function using Dask
dask_df['Embedding'] = dask_df.map_partitions(lambda df: df.apply(encode_and_format, axis=1), meta=('Embedding', str))
# Compute the result (trigger computation)
result_df = dask_df.compute()
# Display the first few rows of the DataFrame
(result_df.head())

In [None]:
result_df.to_csv('clean_data/movie_embeddings.csv')