In [6]:
# Type hints
from typing import Any, Dict, List, Tuple

# Standard library
import ast
import logging
import os
import re
import warnings

# Third-party packages - Data manipulation
import pandas as pd
from tqdm import tqdm

# Third-party packages - Environment & Database
from dotenv import load_dotenv
from neo4j import GraphDatabase

# Third-party packages - Error handling & Retry logic
from tenacity import retry, stop_after_attempt, wait_exponential

# Langchain - Core
from langchain.chains import GraphCypherQAChain
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document

# Langchain - Models & Connectors
#from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI
#from langchain_ollama.llms import OllamaLLM

# Langchain - Graph & Experimental
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Suppress warnings
# NOTE: the 'ignore' action with no category or module filter silences every
# warning raised after this point in the kernel session (pandas and
# deprecation warnings included), not just the noisy ones.
warnings.filterwarnings('ignore')

# Load environment variables
# This call is the last expression of the cell, so its boolean return value is
# what the notebook displays as the cell output.
# NOTE(review): a displayed False presumably means no variables were loaded
# (e.g. no .env file was found) — confirm against the python-dotenv docs.
load_dotenv()

False

In [8]:
# Location of the movie-plots CSV. Set the MOVIES_CSV environment variable
# (for example in your .env file) to point at your own copy of the dataset.
# The default is the original hard-coded path, kept so existing setups still work.
MOVIES_CSV = os.getenv(
    "MOVIES_CSV",
    "/home/eric/srcs/Graph_Lab/sample_data/wiki_movie_plots_deduped.csv",
)

movies = pd.read_csv(MOVIES_CSV)
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [9]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and preprocess the movies DataFrame without modifying the input.

    Steps, in order:
      1. Drop the "Wiki Page" column if it is present.
      2. Keep only the first row for each value of "Title" (compared as-is,
         i.e. before any whitespace/case normalisation).
      3. For every object-dtype column: strip surrounding whitespace, turn
         missing, empty and "unknown" (any case) values into None, and apply
         ``str.capitalize`` to everything else.
      4. Drop every row that still contains a null value.

    Args:
        df: Input DataFrame. Must contain a "Title" column. It is not mutated.

    Returns:
        A new, cleaned DataFrame. Index labels of the surviving rows are
        preserved from the input.

    Raises:
        KeyError: If ``df`` has no "Title" column.
    """

    def _normalize(value):
        # Missing, blank and "unknown" entries are all treated as absent.
        if pd.isna(value) or value.lower() in ("", "unknown"):
            return None
        # NOTE: str.capitalize lower-cases everything after the first
        # character (e.g. "D. W. Griffith" -> "D. w. griffith").
        return value.capitalize()

    # drop() returns a new frame, so the caller's DataFrame keeps its columns.
    # errors="ignore" makes the function safe to call again on data that has
    # already been cleaned (where "Wiki Page" is no longer present).
    cleaned = df.drop(columns=["Wiki Page"], errors="ignore")

    # Explicit copy so the column assignments below never write into a view
    # of another frame.
    cleaned = cleaned.drop_duplicates(subset="Title", keep="first").copy()

    # Clean string columns
    for col in cleaned.select_dtypes(include=["object"]).columns:
        cleaned[col] = cleaned[col].str.strip().apply(_normalize)

    # Drop rows with any null values
    return cleaned.dropna(how="any", axis=0)

# Upper bound on the number of cleaned rows kept for the rest of the notebook.
MAX_MOVIES = 1000

# Clean the raw frame, then keep only the first MAX_MOVIES rows.
movies = clean_data(movies).head(MAX_MOVIES)
movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Plot
13,1907,Daniel boone,American,Wallace mccutcheon and ediwin s. porter,"William craven, florence lawrence",Biographical,Boone's daughter befriends an indian maiden as...
15,1907,Laughing gas,American,Edwin stanton porter,"Bertha regustus, edward boulden",Comedy,The plot is that of a black woman going to the...
16,1908,The adventures of dollie,American,D. w. griffith,"Arthur v. johnson, linda arvidson",Drama,On a beautiful summer day a father and mother ...
17,1908,The black viper,American,D. w. griffith,D. w. griffith,Drama,A thug accosts a girl as she leaves her workpl...
18,1908,A calamitous elopement,American,D.w. griffith,"Harry solter, linda arvidson",Comedy,A young couple decides to elope after being ca...


In [10]:
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    The instance owns the driver created in ``__init__`` and must be closed
    when no longer needed, either by calling ``close()`` or by using the
    object as a context manager::

        with Neo4jConnection(uri, user, password) as conn:
            conn.execute_query("RETURN 1")
    """

    def __init__(self, uri, user, password):
        """Create the underlying driver.

        Args:
            uri: Connection URI passed to ``GraphDatabase.driver``.
            user: User name used for authentication.
            password: Password used for authentication.
        """
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the connection itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never swallow errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver and report it on stdout."""
        self.driver.close()
        print("Connection closed")

    def reset_database(self):
        """Delete every node (and its relationships) in the database.

        The result is consumed before the success message is printed, so a
        failure while executing the delete raises instead of being reported
        as a success.
        """
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n").consume()
        print("Database resetted successfully!")

    def execute_query(self, query, parameters=None):
        """Run a query and return all of its records.

        Args:
            query: Query text to execute.
            parameters: Optional mapping of query parameters; an empty dict is
                sent when omitted.

        Returns:
            A list of the records yielded by the result.
        """
        with self.driver.session() as session:
            # Materialise inside the ``with`` block: the result is read while
            # the session is still open.
            return list(session.run(query, parameters or {}))

In [11]:
# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Put them in your .env file or export them:
#   NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD
uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
user = os.getenv("NEO4J_USERNAME", "neo4j")
password = os.getenv("NEO4J_PASSWORD")
if not password:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set; add it to your .env file or environment."
    )

conn = Neo4jConnection(uri, user, password)

# WARNING: destructive — this deletes every node and relationship in the
# database the connection points at.
conn.reset_database()

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [Errno 111] Connection refused)