In [None]:
# Install the packages listed in requirements.txt; the %pip magic targets the
# environment of the running kernel.
%pip install -r requirements.txt

In [None]:
import gemini as gemini
import config as config
import bq as bq
import gcs as gcs
import utils as utils

import video_intelligence as gvi
from google.cloud import videointelligence_v1 as vi


# Override attributes on the shared `config` module for this notebook session.
# `bucket` is also bound as a notebook-level name; the Video Intelligence cells
# below pass it to gcs.read_json_from_gcs.
config.WORKING_BUCKET = bucket = "video-working-bucket-031f"
# NOTE(review): the output bucket is set to the same value as the working
# bucket — confirm this is intentional.
config.OUTPUT_BUCKET = "video-working-bucket-031f"
config.INPUT_BUCKET = "video-input-bucket-031f"
config.PROJECT_ID = "media-414316"
config.BQ_DATASET = "video_analytics_031f"
# Fully qualified "<dataset>.<table>" name; the Gemini cell re-assigns this
# attribute before it is passed to bq.save_bq.
config.BQ_TABLE_GEMINI_RESULT = "video_analytics_031f.dev_test"


import json
import pandas as pd


# Video Intelligence API

In [None]:
from google.cloud import videointelligence_v1 as vi


In [None]:
# Shot change: read a stored SHOT_CHANGE_DETECTION result (JSON) from the
# working bucket and hand it to gvi.splitVideo.
# Relies on `bucket`, `gcs`, `vi` and `gvi` from the first cell.
file = "fr-FR/SHOT_CHANGE_DETECTION/cdanslair - 1708961447.5183704.json"

data = gcs.read_json_from_gcs(bucket, file)

# Prints the whole parsed payload.
print(data)
# Builds an AnnotateVideoResponse message from the parsed JSON; `annotation`
# is not read again in this cell.
annotation = vi.AnnotateVideoResponse(data)

# NOTE(review): presumably cuts the video at the detected shot boundaries —
# confirm in video_intelligence.py.
gvi.splitVideo(data)

In [None]:
# Explicit content: read a stored EXPLICIT_CONTENT_DETECTION result (JSON) from
# the working bucket and pass it to gvi.storeVideoIntelligenceData.
# Relies on `bucket`, `gcs`, `vi` and `gvi` from the first cell.
file = "fr-FR/EXPLICIT_CONTENT_DETECTION/test1 - BEST UPCOMING MOVIES 2024 (Trailers) - 1709023409.1792483.json"
data = gcs.read_json_from_gcs(bucket, file)

# Prints the whole parsed payload.
print(data)
# Builds an AnnotateVideoResponse message from the parsed JSON; `annotation`
# is not read again in this cell.
annotation = vi.AnnotateVideoResponse(data)

# Splitting is disabled for this result type; only the store step runs.
#gvi.splitVideo(data)
# NOTE(review): presumably persists the annotations (destination not visible
# here) — confirm in video_intelligence.py.
gvi.storeVideoIntelligenceData(data)

In [None]:
# Text detection: read a stored TEXT_DETECTION result (JSON) from the working
# bucket and pass it to gvi.storeVideoIntelligenceData.
# Relies on `bucket`, `gcs`, `vi` and `gvi` from the first cell.
file = "fr-FR/TEXT_DETECTION/test1 - BEST UPCOMING MOVIES 2024 (Trailers) - 1709023409.2954755.json"

data = gcs.read_json_from_gcs(bucket, file)
# Builds an AnnotateVideoResponse message from the parsed JSON; `annotation`
# is not read again in this cell.
annotation = vi.AnnotateVideoResponse(data)

# Splitting is disabled for this result type; only the store step runs.
#gvi.splitVideo(data)
# NOTE(review): presumably persists the annotations (destination not visible
# here) — confirm in video_intelligence.py.
gvi.storeVideoIntelligenceData(data)

# Gemini

In [None]:
import gemini as gemini
import config as config
import bq as bq
import gcs as gcs
import utils as utils

import video_intelligence as gvi
from google.cloud import videointelligence_v1 as vi


# Re-apply the notebook's `config` overrides so this cell can run on its own.
# The values match the first cell except for BQ_TABLE_GEMINI_RESULT.
config.WORKING_BUCKET = bucket = "video-working-bucket-031f"
# NOTE(review): the output bucket is set to the same value as the working
# bucket — confirm this is intentional.
config.OUTPUT_BUCKET = "video-working-bucket-031f"
config.INPUT_BUCKET = "video-input-bucket-031f"
config.PROJECT_ID = "media-414316"
config.BQ_DATASET = "video_analytics_031f"
# Table that bq.save_bq writes to further down in this cell ("dev_test2" here,
# "dev_test" in the first cell).
config.BQ_TABLE_GEMINI_RESULT = "video_analytics_031f.dev_test2"


import json
import pandas as pd


# Single video chunk used to exercise the Gemini moderation call.
# `uri` is the full gs:// location; `name` is the same object path without the
# scheme and bucket, reused below to derive the name of the JSON result object.
uri = "gs://video-output-bucket-031f/CSA/fr-FR/Copy of fr-FR_test1 - BEST UPCOMING MOVIES 2024 (Trailers).mp4/chunks - 1 - 0.0 - 0.633333.mp4"
name = "CSA/fr-FR/Copy of fr-FR_test1 - BEST UPCOMING MOVIES 2024 (Trailers).mp4/chunks - 1 - 0.0 - 0.633333.mp4"

# Classification prompt handed to gemini.content_moderation_gemini together
# with the video URI.
prompt = """Classification task. Choose between PEGI rating from (3, 7, 12, 16, 18). Based on the following content rate the intensity of the scene from: 
PEGI 3 The content of video with a PEGI 3 rating is considered suitable for all age groups. The video should not contain any sounds or pictures that are likely to frighten young children. A very mild form of violence (in a comical context or a childlike setting) is acceptable. No bad language should be heard. 

PEGI 7 video content with scenes or sounds that can possibly frightening to younger children should fall in this category. Very mild forms of violence (implied, non-detailed, or non-realistic violence) are acceptable for a video with a PEGI 7 rating 

PEGI 12 Video that show violence of a slightly more graphic nature towards fantasy characters or nonrealistic violence towards human-like characters would fall in this age category. Sexual innuendo or sexual posturing can be present, while any bad language in this category must be mild. Gambling as it is normally carried out in real life in casinos or gambling halls can also be present.

PEGI 16 This rating is applied once the depiction of violence (or sexual activity) reaches a stage that looks the same as would be expected in real life. The use of bad language in video with a PEGI 16 rating can be more extreme, while video of chance, and the use of tobacco, alcohol or illegal drugs can also be present. 

PEGI 18 is rating for adult content.

CONTENT TO RATE: 
"""

# Alternative free-form prompt, kept for quick experiments.
# prompt =    """what do you see ?
# VIDEO: 
# """

res = gemini.content_moderation_gemini(uri, prompt)
print(f"moderation content done on chunck uri {uri} with res = {res}")
print(80*"*")
print(res)
print(80*"*")

# Store the raw model answer next to the chunk, swapping the extension for .json.
# NOTE(review): "text/json" is not the registered JSON media type
# ("application/json") — confirm what gcs.write_text_to_gcs consumers expect.
print("save json result in output bucket")
json_file_path = gcs.write_text_to_gcs(config.OUTPUT_BUCKET, utils.replace_extension(name, ".json"), res, "text/json")
print(f"json_file_path = {json_file_path}")

print("read tags from source uri")
bucketname, video_blobname = gcs.split_gcs_uri(uri)
tags = gcs.read_tags_from_gcs(bucketname, video_blobname)

# Pre-bind `df` so the final display expression cannot raise NameError when the
# source object has no tags and nothing is saved.
df = None

if tags is None:
    print("no tags found. WARNING do not save.")
else:
    print(f"uri= {uri} - tags = {tags}")
    # The former `dict.update(tags)` was removed: the variable `dict` that once
    # held the parsed model output no longer exists, so the call hit the builtin
    # type and amounted to `tags.update()` with no arguments — a no-op.
    # NOTE(review): if the parsed "csa_rules" should be merged into the row
    # again, parse `res` into a variable that does not shadow the builtin.
    tags["description"] = res
    # generate time
    tags["update_time"] = utils.get_date_time_string()

    tags["uri"] = uri
    # One-row frame: the tags plus the model answer, timestamp and URI.
    df = pd.DataFrame([tags])

    print(df.to_json())
    bq.save_bq(df, config.BQ_TABLE_GEMINI_RESULT, project_id=config.PROJECT_ID)

# Rich display of the saved row; renders nothing when no tags were found.
df

In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import vertexai.generative_models as generative_models

def generate(video_input):
    """Ask Gemini to rate a video chunk against the CSA violence rule.

    Args:
        video_input: either a URI string (wrapped into a ``Part`` with MIME type
            ``video/mp4``) or an already constructed ``Part``.

    Returns:
        The concatenated text of all streamed response chunks, or ``None`` when
        ``video_input`` is neither a string nor a ``Part``.
    """
    print(f"content_moderation_gemini {video_input}")
    if isinstance(video_input, str):
        video_input = Part.from_uri(uri=video_input, mime_type="video/mp4")
    elif not isinstance(video_input, Part):
        # The previous check, `type(input) == 'Part'`, inspected the builtin
        # `input` and compared a type with a string, so it was always False and
        # every Part argument ended up here.
        print(f"input is not supported: {video_input}")
        return None

    vertexai.init(project="media-414316", location="europe-west1")
    model = GenerativeModel("gemini-1.0-pro-vision-001")
    # NOTE(review): the JSON template inside the prompt is kept byte-for-byte;
    # its escaping yields stray backslashes and unbalanced braces in the text
    # sent to the model — confirm whether that is intended.
    responses = model.generate_content(
        ["""You are an expert in violence content moderation.
    Explain why you provide the rating with the content moderation rule and without offensive quote.

    You classify text with CSA rules. Answer short JSON results like an API without quote with the following format:
    {\"\\\"csa_rules\\\": {
        \\\"violence\\\": \"0\",
        \\\"violence_evidence\\\":  \\\"\\\"
    }

    Evaluate CSA rules based on this video part and output them in JSON. Return a valide JSON format.""", video_input, """JSON"""],
        generation_config={
            "max_output_tokens": 2048,
            "temperature": 0,
            "top_p": 1,
            "top_k": 40
        },
        safety_settings={
            generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        },
        stream=True,
    )

    # Echo each streamed chunk as it arrives and keep it for the final string.
    answer = []
    for response in responses:
        text = response.text
        print(text, end="")
        answer.append(text)

    str_json = "".join(answer)
    #str_json = CleanJsonOutput(str_json)
    #print(str_json)

    return str_json
  
# Video chunk passed to generate() as a gs:// URI string.
video_input = "gs://video-output-bucket-031f/CSA/fr-FR/Copy of fr-FR_test1 - BEST UPCOMING MOVIES 2024 (Trailers).mp4/chunks - 1 - 0.0 - 0.633333.mp4"

# generate() wraps a string URI into a Part itself; building the Part up front
# is kept here as a commented alternative.
#video1 = Part.from_uri(uri=video_input, mime_type="video/mp4")
# Last expression of the cell, so the returned text is shown as the cell output.
generate(video_input)

# Text processing

In [None]:
# Install jsonpath-ng into the running kernel's environment.
# NOTE(review): no cell visible in this notebook imports jsonpath-ng — confirm
# it is still needed, and pin a version if it is.
%pip install jsonpath-ng

In [None]:
import json
from google.cloud import bigquery  # You'll need to install this library

# Replace with your BigQuery project ID and query
project_id = 'media-414316'
query = """ 
SELECT * FROM video_analytics_031f.results
"""

# Construct a BigQuery client bound explicitly to `project_id`. Previously the
# variable was defined but never used, so the client silently fell back to
# whatever default project the environment provided.
client = bigquery.Client(project=project_id)

# Retrieve results from BigQuery (result() waits for the query job to finish).
query_job = client.query(query)
results = query_job.result()

# Placeholder mapping (customize this according to your needs).
# Keys are the logical names used by the conversion loop below; values are the
# column names looked up on each result row.
field_mapping = {
    "uri": "video_source",  # Example, assuming 'video_source' has the URI
    "time_start": "start_time_offset",
    "time_stop": "end_time_offset"
}

# Placeholder for generating time offsets (you'll need to implement the logic)
def generate_time_offsets(start_time, end_time):
    """Convert a start/end time pair into ``{"seconds", "nanos"}`` offset dicts.

    Args:
        start_time: object exposing ``.seconds`` and ``.microseconds``, or a
            falsy value (``None``, zero duration) meaning "no offset".
        end_time: same contract as ``start_time``.

    Returns:
        A ``(start_offset, end_offset)`` tuple of dicts with integer ``seconds``
        and ``nanos`` keys; a falsy input yields ``{"seconds": 0, "nanos": 0}``.

    NOTE(review): the attribute names match ``datetime.timedelta`` — confirm the
    actual type of the BigQuery column. With a timedelta, ``.seconds`` does not
    include whole days.
    """

    def _to_offset(value):
        # Falsy values map to the zero offset, as before.
        if not value:
            return {"seconds": 0, "nanos": 0}
        # Read both attributes from the original object. The previous code
        # overwrote `end_time` with its integer seconds and then read
        # `.microseconds` from that int, raising AttributeError.
        return {"seconds": value.seconds, "nanos": value.microseconds * 1000}

    return _to_offset(start_time), _to_offset(end_time)

# Container for the output document: one entry per BigQuery row.
output_json = {"annotation_results": []}

# Walk the BigQuery rows and translate each into an annotation entry.
for row in results:
    # Time bounds of the row, converted to seconds/nanos offset dicts.
    start_offset, end_offset = generate_time_offsets(
        row.get(field_mapping.get("time_start")),
        row.get(field_mapping.get("time_stop")),
    )
    segment = {
        "start_time_offset": start_offset,
        "end_time_offset": end_offset,
    }

    # Source URI first, then the segment, matching the output key order.
    annotation_result = {
        "input_uri": row.get(field_mapping.get("uri", "")),
        "segment": segment,
    }

    # ... (Add logic for populating other fields)

    output_json["annotation_results"].append(annotation_result)

# Serialise with a two-space indent and show the document.
json_output = json.dumps(output_json, indent=2)
print(json_output)


In [None]:
# TEST 2


import json
from google.cloud import bigquery

# Replace with your project ID and BigQuery dataset/table names.
# PROJECT_ID is passed to bigquery.Client at the end of this cell; DATASET and
# TABLE are combined into the `DATASET.TABLE` reference of the query.
PROJECT_ID = "media-414316"
DATASET = "video_analytics_031f"
TABLE = "results"


def bigquery_to_json(bigquery_results, video_source):
    """Convert result rows into a Video Intelligence style annotation document.

    Every row for which ``get_description`` returns a non-empty label becomes
    one entry of ``shot_label_annotations`` with a single segment and a fixed
    confidence of 1.0. Rows without a description are skipped.

    Args:
        bigquery_results: iterable of row objects accepted by
            ``get_description`` and ``calculate_time_offsets``.
        video_source: URI stored as ``input_uri`` of the result.

    Returns:
        ``{"annotation_results": [result]}`` with exactly one result for the
        video. Its overall segment runs from offset 0 to the end offset of the
        last labelled row (0 when no row carried a description).
    """
    shot_label_annotations = []
    last_end_time_offset = {"seconds": 0, "nanos": 0}

    for row in bigquery_results:
        description = get_description(row)
        if not description:
            continue

        # Function to calculate time offsets based on your data
        start_time_offset, end_time_offset = calculate_time_offsets(row)

        shot_label_annotations.append({
            "entity": {
                # Placeholder entity id shared by every label.
                "entity_id": "/m/00000",
                # "language_code": "en-US"
                "description": description
            },
            "segments": [{
                "segment": {
                    "start_time_offset": start_time_offset,
                    "end_time_offset": end_time_offset
                },
                # Add 'confidence' if applicable
                'confidence': 1.0
            }]
        })
        last_end_time_offset = end_time_offset

    result_item = {
        "input_uri": video_source,
        "segment": {
            "start_time_offset": {"seconds": 0, "nanos": 0},
            # Copied so the overall segment does not alias the last label's dict.
            "end_time_offset": dict(last_end_time_offset)
        },
        # "segment_label_annotations": [],
        "shot_label_annotations": shot_label_annotations
    }

    # The result is emitted once. It used to be appended again on every loop
    # iteration, so the output held N+1 copies of the same annotation result.
    return {"annotation_results": [result_item]}

# Example of how to calculate offsets (adjust to your data)


def calculate_time_offsets(row):
    """Build the start and end offset dicts for one result row.

    Reads ``time_start`` and ``time_stop`` from ``row`` (each defaulting to
    ``"0"`` when the key is absent), converts them to float and splits each
    into seconds and nanos via ``getSecondsNanos``.

    Returns:
        ``(start_offset, end_offset)``: a tuple of two dicts with ``seconds``
        and ``nanos`` keys.
    """
    offsets = []
    # Start is processed before stop; adjust the field names if needed.
    for field in ("time_start", "time_stop"):
        seconds, nanos = getSecondsNanos(float(row.get(field, "0")))
        offsets.append({"seconds": seconds, "nanos": nanos})
    return tuple(offsets)


def getSecondsNanos(start):
    """Split a time in (fractional) seconds into whole seconds and nanoseconds.

    Args:
        start: time in seconds, as an int or float.

    Returns:
        ``(seconds, nanos)`` as integers, where ``nanos`` is the fractional part
        expressed in nanoseconds.
    """
    nanos_per_second = 1_000_000_000
    start_seconds = int(start)
    # One second is 1e9 nanoseconds; the previous factor of 1e7 made every
    # value 100x too small. Rounding (rather than truncating) avoids losing a
    # nanosecond to float representation error.
    start_nanos = round((start - start_seconds) * nanos_per_second)
    # Rounding can land exactly on a full second; carry it into the seconds.
    if abs(start_nanos) >= nanos_per_second:
        start_seconds += 1 if start_nanos > 0 else -1
        start_nanos = 0
    return start_seconds, start_nanos


def get_description(row):
    """Build the display label for one result row.

    Args:
        row: mapping-like row providing ``get``; reads ``description``,
            ``index``, ``time_start`` and ``time_stop``.

    Returns:
        ``"<index> (<start> - <end>) : <description>"`` (also printed), or
        ``None`` when the row has no description. Missing or NULL time fields
        count as 0; a missing or NULL index counts as ``"0"``.
    """
    # `row.get(key, default)` returns None, not the default, when the column
    # exists but holds NULL, so normalise explicitly instead of calling
    # len()/float() on None.
    description = row.get("description") or ""
    if not description:
        return None

    index = row.get("index")
    if index is None:
        index = "0"
    start = float(row.get("time_start") or 0)  # Adjust field name if needed
    end = float(row.get("time_stop") or 0)     # Adjust field name if needed

    description = f"{index} ({start} - {end}) : {description}"
    print(description)
    return description


# BigQuery query execution
client = bigquery.Client(project=PROJECT_ID)

# Video whose rows are exported. The name used to be `filter`, which shadowed
# the builtin for every later cell run in the same kernel.
#video_source_uri = "gs://video-input-bucket-031f/fr-FR/House of the Dragon - Rhaenyra and Criston Cole sex scene.mp4"
video_source_uri = "gs://video-input-bucket-031f/fr-FR/BEST UPCOMING MOVIES  2024 (Trailers).mp4"

# The URI is bound as a query parameter instead of being spliced into the SQL
# text, so file names containing quotes cannot break (or alter) the statement.
query = (
    f"SELECT * FROM `{DATASET}.{TABLE}` "
    "WHERE video_source = @video_source "
    "ORDER BY CAST(index AS INT64) ASC"
)
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("video_source", "STRING", video_source_uri),
    ]
)
print(query)
query_job = client.query(query, job_config=job_config)
results = query_job.result()

# Conversion and output
json_data = bigquery_to_json(results, video_source_uri)
label_moderation = json.dumps(json_data, indent=2)
print(label_moderation)

# Write the JSON document to a local file; the encoding is set explicitly so
# the result does not depend on the platform default.
with open('label_moderation_Trailers.json', 'w', encoding="utf-8") as f:
    f.write(label_moderation)