# Scripts

### 1. Creation of the MySQL Database

In [None]:
-- Star-schema style warehouse for the movies box-office data set.
-- Every statement is guarded with IF NOT EXISTS so the script can be re-run
-- safely against a database that has already been initialised.
CREATE DATABASE IF NOT EXISTS movies;
USE movies;

-- Movies (fact table): one row per film with its revenue and rating measures.
CREATE TABLE IF NOT EXISTS Fact_Movie (
    movie_id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    release_year INT,
    worldwide_revenue BIGINT,
    domestic_revenue BIGINT,
    domestic_percentage DECIMAL(5,2),
    foreign_revenue BIGINT,
    foreign_percentage DECIMAL(5,2),
    rating DECIMAL(3,1),
    vote_count INT,
    original_language VARCHAR(10)
);

-- Genres (dimension): the UNIQUE constraint lets loaders use INSERT IGNORE.
CREATE TABLE IF NOT EXISTS Dim_Genre (
    genre_id INT AUTO_INCREMENT PRIMARY KEY,
    genre_name VARCHAR(100) UNIQUE
);

-- Many-to-many bridge between movies and genres.
-- Rows disappear automatically when either side is deleted.
CREATE TABLE IF NOT EXISTS Movie_Genre (
    movie_id INT,
    genre_id INT,
    PRIMARY KEY (movie_id, genre_id),
    FOREIGN KEY (movie_id) REFERENCES Fact_Movie(movie_id) ON DELETE CASCADE,
    FOREIGN KEY (genre_id) REFERENCES Dim_Genre(genre_id) ON DELETE CASCADE
);

-- Production countries (dimension): the UNIQUE constraint lets loaders use INSERT IGNORE.
CREATE TABLE IF NOT EXISTS Dim_ProductionCountry (
    country_id INT AUTO_INCREMENT PRIMARY KEY,
    country_name VARCHAR(100) UNIQUE
);

-- Many-to-many bridge between movies and production countries.
-- Rows disappear automatically when either side is deleted.
CREATE TABLE IF NOT EXISTS Movie_ProductionCountry (
    movie_id INT,
    country_id INT,
    PRIMARY KEY (movie_id, country_id),
    FOREIGN KEY (movie_id) REFERENCES Fact_Movie(movie_id) ON DELETE CASCADE,
    FOREIGN KEY (country_id) REFERENCES Dim_ProductionCountry(country_id) ON DELETE CASCADE
);


### 2. Loading the CSV File into the Data Warehouse

In [None]:
pip install pymysql

In [None]:
import pymysql
import pandas as pd

# Load the CSV file and discard every row that has at least one missing value.
df = pd.read_csv("movies_box_office.csv")
df = df.dropna()

# Fact_Movie columns, in the order used by the INSERT statement below.
MOVIE_COLUMNS = (
    "title", "release_year", "worldwide_revenue", "domestic_revenue",
    "domestic_percentage", "foreign_revenue", "foreign_percentage",
    "rating", "vote_count", "original_language",
)


def _split_names(value):
    """Split a comma-separated cell into stripped, non-empty names.

    Anything that is not a string (e.g. ``None``) yields an empty list, so
    callers can loop over the result unconditionally.
    """
    if not isinstance(value, str):
        return []
    return [name for name in (part.strip() for part in value.split(",")) if name]


def _dimension_id(cursor, cache, insert_sql, select_sql, name):
    """Return the key of ``name`` in a dimension table, creating the row if needed.

    ``cache`` maps names already resolved during this load to their ids, so
    each distinct name costs one INSERT IGNORE + SELECT round trip instead of
    one per movie that references it.
    """
    if name not in cache:
        cursor.execute(insert_sql, (name,))
        cursor.execute(select_sql, (name,))
        cache[name] = cursor.fetchone()[0]
    return cache[name]


# Pre-bind both handles so the cleanup code can test them directly instead of
# probing locals().
conn = None
cursor = None

try:
    # MySQL connection
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="",  # Set your MySQL password here if required
        database="movies"
    )

    cursor = conn.cursor()
    print("✅ Successfully connected to MySQL!")

    # Rename the CSV columns so they match the database schema.
    df = df.rename(columns={
        "Release Group": "title",
        "Year": "release_year",
        "$Worldwide": "worldwide_revenue",
        "$Domestic": "domestic_revenue",
        "Domestic %": "domestic_percentage",
        "$Foreign": "foreign_revenue",
        "Foreign %": "foreign_percentage",
        "Rating": "rating",
        "Vote_Count": "vote_count",
        "Original_Language": "original_language",
        "Production_Countries": "production_country",
        "Genres": "genre"
    })

    # Replace NaN with None so missing values would be sent as SQL NULL.
    # (A guard only: dropna() above already removed incomplete rows.)
    df = df.where(pd.notna(df), None)

    # Names already resolved to dimension keys during this run.
    genre_ids = {}
    country_ids = {}

    for _, row in df.iterrows():
        # Insert the movie itself.
        cursor.execute("""
            INSERT INTO Fact_Movie (title, release_year, worldwide_revenue, domestic_revenue, domestic_percentage, 
                                    foreign_revenue, foreign_percentage, rating, vote_count, original_language)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, tuple(row[column] for column in MOVIE_COLUMNS))

        movie_id = cursor.lastrowid  # Auto-increment id of the movie just inserted

        # Link the movie to each of its genres (Dim_Genre + Movie_Genre).
        for genre in _split_names(row["genre"]):
            genre_id = _dimension_id(
                cursor, genre_ids,
                "INSERT IGNORE INTO Dim_Genre (genre_name) VALUES (%s)",
                "SELECT genre_id FROM Dim_Genre WHERE genre_name = %s",
                genre,
            )
            cursor.execute("INSERT IGNORE INTO Movie_Genre (movie_id, genre_id) VALUES (%s, %s)", (movie_id, genre_id))

        # Link the movie to each production country
        # (Dim_ProductionCountry + Movie_ProductionCountry).
        for country in _split_names(row["production_country"]):
            country_id = _dimension_id(
                cursor, country_ids,
                "INSERT IGNORE INTO Dim_ProductionCountry (country_name) VALUES (%s)",
                "SELECT country_id FROM Dim_ProductionCountry WHERE country_name = %s",
                country,
            )
            cursor.execute("INSERT IGNORE INTO Movie_ProductionCountry (movie_id, country_id) VALUES (%s, %s)", (movie_id, country_id))

    # Single commit: the load is all-or-nothing.
    conn.commit()
    print("✅ Data successfully inserted!")

except pymysql.MySQLError as err:
    print(f"❌ MySQL error: {err}")
    # Explicitly discard the partial load rather than relying on the implicit
    # rollback that happens when the connection is closed.
    if conn is not None and conn.open:
        try:
            conn.rollback()
        except pymysql.MySQLError as rollback_err:
            print(f"❌ MySQL rollback failed: {rollback_err}")

finally:
    if cursor is not None:
        cursor.close()
    if conn is not None and conn.open:
        conn.close()
        print("🔌 MySQL connection closed.")

### 3. Neo4j Database

Neo4j was used in this project to handle unstructured and semi-structured data efficiently. Unlike traditional SQL databases, Neo4j allows us to represent relationships in a natural graph format, making it easier to analyze connections between movies, genres, languages, and reviews.

In [None]:

pip install neo4j


In [None]:
import os

from neo4j import GraphDatabase

# Connection settings for the Neo4j instance.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be committed with the notebook; the fallbacks are
# the local development defaults.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # Change if using a remote database
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4jmovie")

# Shared driver object used by every query cell below.
# NOTE(review): nothing in the visible cells calls driver.close(); consider
# closing it once the notebook is finished with Neo4j.
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

def run_query(query, parameters=None):
    """Run a Cypher query and return all of its records as a list.

    Args:
        query: Cypher text to execute.
        parameters: Optional dict of query parameters (referenced as
            ``$name`` in the Cypher text). Prefer this over formatting
            values into the query string.

    Returns:
        A list with every record produced by the query. The records are
        collected before the session closes, so the list stays usable
        afterwards.
    """
    with driver.session() as session:
        result = session.run(query, parameters)
        # Materialise inside the ``with`` block: the result cannot be
        # consumed once the session has been closed.
        return list(result)


### Neo4j Queries

### **Get All Romance Movies**

In [None]:
# Every movie attached to the "Romance" genre, most recent first.
query = """
MATCH (m:Movie)-[:BELONGS_TO]->(g:Genre {name: "Romance"})
RETURN m.title, m.year, m.metadata
ORDER BY m.year DESC
"""

romance_movies = run_query(query)

# One "title year" line per movie.
for movie in romance_movies:
    title, year = movie["m.title"], movie["m.year"]
    print(title, year)


### **Get All genres**

In [None]:
# Alphabetical list of all genre names stored in the graph.
query = "MATCH (g:Genre) RETURN g.name ORDER BY g.name"
genres = run_query(query)

for genre_row in genres:
    genre_name = genre_row["g.name"]
    print(genre_name)


### **Average Box Office Revenue Per Genre:**

In [None]:
# Mean worldwide revenue of the movies in each genre, highest average first.
query = """
MATCH (m:Movie)-[:BELONGS_TO]->(g:Genre)
RETURN g.name AS Genre, avg(m.worldwide_revenue) AS AvgRevenue
ORDER BY AvgRevenue DESC
"""

avg_revenue_per_genre = run_query(query)

print("Average Box Office Revenue Per Genre:")
for genre_row in avg_revenue_per_genre:
    genre_name = genre_row["Genre"]
    average = genre_row["AvgRevenue"]
    print(f"{genre_name}: ${average}")



### **Top 10 Highest-Grossing Movies:**

In [None]:
# Ten movies with the largest worldwide revenue.
# Cypher sorts null values FIRST when ordering DESC, so movies without a
# revenue must be filtered out; otherwise they take the top of the ranking.
# NOTE(review): assumes worldwide_revenue is stored as a number; if it was
# loaded as a string the ordering would be lexicographic. Confirm at load time.
query = """
MATCH (m:Movie)
WHERE m.worldwide_revenue IS NOT NULL
RETURN m.title, m.worldwide_revenue AS revenue
ORDER BY revenue DESC
LIMIT 10
"""

top_movies = run_query(query)

print("Top 10 Highest-Grossing Movies:")
for record in top_movies:
    # Defensive fallback kept from the original; the WHERE clause above should
    # already exclude missing revenues.
    revenue = record["revenue"] if record["revenue"] is not None else "N/A"
    print(f"{record['m.title']} - ${revenue}")


### **Get languages**

In [None]:
# Alphabetical list of all language names stored in the graph.
query = "MATCH (l:Language) RETURN l.name ORDER BY l.name"
languages = run_query(query)

print("Languages Available:")
for language_row in languages:
    language_name = language_row["l.name"]
    print(language_name)


### **Find Movies Produced in a Specific Country (e.g., France)**

In [None]:
# Movies linked to the country "France", most recent first.
query = """
MATCH (m:Movie)-[:PRODUCED_IN]->(c:Country {name: "France"})
RETURN m.title, m.year
ORDER BY m.year DESC
"""

french_movies = run_query(query)

print("Movies Produced in France:")
for movie in french_movies:
    title, year = movie["m.title"], movie["m.year"]
    print(f"{title} ({year})")


### **Get the number of movies per genre**

In [None]:
# Number of movies attached to each genre, most populated genre first.
query = """
MATCH (m:Movie)-[:BELONGS_TO]->(g:Genre)
RETURN g.name AS Genre, count(m) AS MovieCount
ORDER BY MovieCount DESC
"""

movies_per_genre = run_query(query)

print("Number of Movies Per Genre:")
for genre_row in movies_per_genre:
    genre_name = genre_row["Genre"]
    total = genre_row["MovieCount"]
    print(f"{genre_name}: {total} movies")


**Database Creation**

We split the data into multiple files to avoid duplication and improve efficiency. A movie can have multiple genres, languages, and countries, so storing everything in one file would repeat data. Instead, we created separate nodes in Neo4j and linked them with relationships, making searches faster and more organized. We also extracted key details from metadata for better querying. This structure follows best practices for handling connected data efficiently.

**Movies (Unstructured)**


In [None]:
// Create one Movie node per row of Movies_Unstructured_Cleaned.csv.
// year is converted to an integer; the other fields are stored as read.
// NOTE(review): CREATE adds a new node for every row on every run, so
// re-running this cell duplicates all movies. Clear the Movie nodes first
// if the load has to be repeated.
LOAD CSV WITH HEADERS FROM 'file:///Movies_Unstructured_Cleaned.csv' AS row
CREATE (:Movie {
    title: row.title,
    year: toInteger(row.year),
    description: row.description,
    metadata: row.metadata
});

**Genres**

In [None]:
// Create one Genre node per distinct name in Genres.csv.
// MERGE keeps the load idempotent: re-running the cell, or a duplicate row
// in the file, reuses the existing node instead of creating a second one.
// Rows without a name are skipped because MERGE cannot match on null.
LOAD CSV WITH HEADERS FROM 'file:///Genres.csv' AS row
WITH row WHERE row.name IS NOT NULL
MERGE (:Genre {name: row.name});

**Movie-Genre Relationships**

In [None]:
// Link each movie to its genres, matching on movie title and genre name.
// Rows whose movie or genre does not exist produce no relationship.
// MERGE avoids duplicate BELONGS_TO relationships (on re-runs or repeated
// CSV rows), which would otherwise inflate per-genre counts and averages.
LOAD CSV WITH HEADERS FROM 'file:///Movie-Genre.csv' AS row
MATCH (m:Movie {title: row.title})
MATCH (g:Genre {name: row.genre})
MERGE (m)-[:BELONGS_TO]->(g);

**Countries**

In [None]:
// Create one Country node per distinct name in Countries.csv.
// MERGE keeps the load idempotent: re-running the cell, or a duplicate row
// in the file, reuses the existing node instead of creating a second one.
// Rows without a name are skipped because MERGE cannot match on null.
LOAD CSV WITH HEADERS FROM 'file:///Countries.csv' AS row
WITH row WHERE row.name IS NOT NULL
MERGE (:Country {name: row.name});

**Movie-Country Relationships**

In [None]:
// Link each movie to its production countries, matching on movie title and
// country name. Rows whose movie or country does not exist produce no
// relationship.
// MERGE avoids duplicate PRODUCED_IN relationships on re-runs or repeated
// CSV rows.
LOAD CSV WITH HEADERS FROM 'file:///Movie-Country.csv' AS row
MATCH (m:Movie {title: row.title})
MATCH (c:Country {name: row.country})
MERGE (m)-[:PRODUCED_IN]->(c);


**Languages**

In [None]:
// Create one Language node per distinct name in Languages.csv.
// MERGE keeps the load idempotent: re-running the cell, or a duplicate row
// in the file, reuses the existing node instead of creating a second one.
// Rows without a name are skipped because MERGE cannot match on null.
LOAD CSV WITH HEADERS FROM 'file:///Languages.csv' AS row
WITH row WHERE row.name IS NOT NULL
MERGE (:Language {name: row.name});

**Movie-Language Relationships**

In [None]:
// Link each movie to its languages, matching on movie title and language
// name. Rows whose movie or language does not exist produce no relationship.
// MERGE avoids duplicate LANGUAGE_IS relationships on re-runs or repeated
// CSV rows.
LOAD CSV WITH HEADERS FROM 'file:///Movie-Language.csv' AS row
MATCH (m:Movie {title: row.title})
MATCH (l:Language {name: row.language})
MERGE (m)-[:LANGUAGE_IS]->(l);

**Reviews (Unstructured)**

In [None]:
// For every row of Reviews.csv whose title matches a Movie, create a Review
// node (text + numeric sentiment score) and connect it to that movie.
// NOTE(review): the relationship points Review -> Movie although it is named
// HAS_REVIEW; confirm the intended direction before querying on it.
LOAD CSV WITH HEADERS FROM 'file:///Reviews.csv' AS row
MATCH (m:Movie {title: row.title})
CREATE (:Review {text: row.text, sentiment_score: toFloat(row.sentiment_score)})-[:HAS_REVIEW]->(m);
