In [21]:
# !pip install streamlit yt-dlp python-dotenv requests pyngrok
# !ngrok config add-authtoken 1dNsR0kkJxjfxj6zJUGXqOtTXIX_3aRKjTGdN8ZnUArHKKZbz
# !xcode-select --install
# !pip install watchdog

In [26]:
from pyngrok import ngrok
from dotenv import load_dotenv
load_dotenv()
NGROK_AUTH = os.getenv("NGROK_AUTH")
ngrok.set_auth_token(NGROK_AUTH)

In [94]:
import ipywidgets as widgets
from IPython.display import display, HTML
import yt_dlp
import glob
import os
from ipywidgets import Layout
import requests
import json
from dotenv import load_dotenv
from urllib.parse import urlparse, parse_qs
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def get_youtube_id(url):
    query = urlparse(url)
    if query.hostname == 'youtu.be':
        return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com'):
        if query.path == '/watch':
            p = parse_qs(query.query)
            return p['v'][0]
        if query.path[:7] == '/embed/':
            return query.path.split('/')[2]
        if query.path[:3] == '/v/':
            return query.path.split('/')[2]
    return None

def download_video(url):
    video_id = get_youtube_id(url)
    if not video_id:
        raise ValueError("Invalid YouTube URL")
    
    ydl_opts = {
        'format': 'bestvideo+bestaudio/best',
        'outtmpl': f'{video_id}.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegVideoConvertor',
            'preferedformat': 'mp4',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
    
    # The file extension might have changed, so we need to get the actual filename
    actual_filename = glob.glob(f"{video_id}.*")[0]
    return actual_filename

def process_with_deepgram(file_path):
    load_dotenv()
    
    url = "https://api.deepgram.com/v1/listen"
    params = {
        "diarize": "true",
        "punctuate": "true",
        "utterances": "true"
    }
    
    DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
    headers = {
        "Authorization": f"Token {DEEPGRAM_API_KEY}",
        "Content-Type": "video/webm"
    }
    
    with open(file_path, "rb") as file:
        data = file.read()
    
    # Create a retry strategy
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    
    # Create a session with the retry strategy
    session = requests.Session()
    session.mount("https://", adapter)
    
    try:
        response = session.post(url, params=params, headers=headers, data=data, verify=True, timeout=30)
        response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
        
        if response.status_code == 200:
            result = response.json()
            
            # Create 'transcripts' directory if it doesn't exist
            os.makedirs('transcripts', exist_ok=True)
            
            # Generate output filename in the 'transcripts' directory
            output_filename = os.path.join('transcripts', f"{os.path.splitext(os.path.basename(file_path))[0]}_transcript.json")
            
            with open(output_filename, "w") as outfile:
                json.dump(result, outfile, indent=2)
            print(f"Transcript saved to {output_filename}")
            return output_filename
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
    except json.JSONDecodeError:
        print("Error decoding JSON response")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None



# Create widgets
url_input = widgets.Text(description="YouTube URL:", layout=widgets.Layout(width='500px'))
submit_button = widgets.Button(description="Submit")
output = widgets.Output()
transcript_output = widgets.Output()

# Global variable to store the transcript path
global_transcript_path = None

# Define button click event
def on_button_click(b):
    global global_transcript_path
    with output:
        output.clear_output()
        print("Downloading video...")
        try:
            video_path = download_video(url_input.value)
            print(f"Download complete! Video saved at: {video_path}")
            os.makedirs('videos_webm', exist_ok=True)
            display(HTML(f'<video width="320" height="240" controls><source src="{video_path}" type="video/webm"></video>'))
            
            # Create download link
            download_link = f'<a href="{video_path}" download>Download Video</a>'
            display(HTML(download_link))
            
            print("Processing with Deepgram...")
            transcript_path = process_with_deepgram(video_path)
            if transcript_path:
                print(f"Transcript generated and saved at: {transcript_path}")
                # Create download link for transcript
                transcript_download_link = f'<a href="{transcript_path}" download>Download Transcript</a>'
                display(HTML(transcript_download_link))
                global_transcript_path = transcript_path
            else:
                print("Failed to generate transcript.")
                global_transcript_path = None
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            global_transcript_path = None
    
    # Update the transcript output
    with transcript_output:
        transcript_output.clear_output()
        if global_transcript_path:
            print(f"Transcript path: {global_transcript_path}")
        else:
            print("No transcript generated.")

# Connect the button click event to the function
submit_button.on_click(on_button_click)

# Display widgets
display(url_input, submit_button, output, transcript_output)

Text(value='', description='YouTube URL:', layout=Layout(width='500px'))

Button(description='Submit', style=ButtonStyle())

Output()

Output()

In [None]:
# https://www.youtube.com/watch?v=yJzjyYL8l5Y

In [60]:
global_transcript_path

'transcripts/yJzjyYL8l5Y_transcript.json'

In [83]:
import json
import numpy as np

# Load JSON data from file
with open('WLASL_v0.3.json', 'r') as file:
    data = json.load(file)

# Extract and format gloss values
def get_gloss_array(data):
    gloss_values = []

    for item in data:
        if 'gloss' in item:
            gloss_values.append(item['gloss'])

    # Convert to numpy array
    gloss_array = np.array(gloss_values, dtype=object)
    return gloss_array

# Get the gloss array
unique_words = get_gloss_array(data)

unique_words

array(['book', 'drink', 'computer', ..., 'weigh', 'wheelchair', 'whistle'],
      dtype=object)

In [92]:
import json 


global_transcript_path = 'transcripts/S0P3hjM0DDM_transcript.json'
with open(global_transcript_path, 'r') as f:
    data = json.load(f)
    
words = data['results']['channels'][0]['alternatives'][0]['words']
words

[{'word': 'welcome',
  'start': 3.32,
  'end': 3.82,
  'confidence': 0.99609375,
  'speaker': 0,
  'speaker_confidence': 1.0,
  'punctuated_word': 'Welcome'},
 {'word': 'to',
  'start': 3.8799999,
  'end': 4.04,
  'confidence': 0.99560547,
  'speaker': 0,
  'speaker_confidence': 1.0,
  'punctuated_word': 'to'},
 {'word': 'our',
  'start': 4.04,
  'end': 4.2799997,
  'confidence': 0.9975586,
  'speaker': 0,
  'speaker_confidence': 1.0,
  'punctuated_word': 'our'},
 {'word': 'basic',
  'start': 4.2799997,
  'end': 4.7799997,
  'confidence': 0.8647461,
  'speaker': 0,
  'speaker_confidence': 1.0,
  'punctuated_word': 'basic'},
 {'word': 'series',
  'start': 4.8399997,
  'end': 5.24,
  'confidence': 0.99609375,
  'speaker': 0,
  'speaker_confidence': 1.0,
  'punctuated_word': 'series.'},
 {'word': 'these',
  'start': 5.8799996,
  'end': 6.2,
  'confidence': 0.9946289,
  'speaker': 0,
  'speaker_confidence': 1.0,
  'punctuated_word': 'These'},
 {'word': 'videos',
  'start': 6.2,
  'end': 6.

In [86]:
import json 
with open(global_transcript_path, 'r') as f:
    data = json.load(f)
    
words = data['results']['channels'][0]['alternatives'][0]['words']

# Create a list of tuples with word and timestamps
parsed_words = [(word_info['word'], word_info['start'], word_info['end']) for word_info in words]

# Display the list of words and timestamps
in_unique_words = 0
not_in_unique_words = 0

for word, start, end in parsed_words:
    word_lower = word.lower().strip()
    if word_lower in unique_words:
        tag = "Yes"
        in_unique_words += 1
    else:
        tag = "No"
        not_in_unique_words += 1
    print(f"Word: {word}, Start: {start}, End: {end}, Tag: {tag}")

print(f"\nWords in unique words list: {in_unique_words}")
print(f"Words not in unique words list: {not_in_unique_words}")



Word: welcome, Start: 3.32, End: 3.82, Tag: Yes
Word: to, Start: 3.8799999, End: 4.04, Tag: Yes
Word: our, Start: 4.04, End: 4.2799997, Tag: Yes
Word: basic, Start: 4.2799997, End: 4.7799997, Tag: Yes
Word: series, Start: 4.8399997, End: 5.24, Tag: No
Word: these, Start: 5.8799996, End: 6.2, Tag: No
Word: videos, Start: 6.2, End: 6.68, Tag: No
Word: have, Start: 6.68, End: 6.92, Tag: Yes
Word: real, Start: 6.92, End: 7.24, Tag: Yes
Word: english, Start: 7.48, End: 7.98, Tag: Yes
Word: for, Start: 8.2, End: 8.44, Tag: Yes
Word: low, Start: 8.44, End: 8.76, Tag: No
Word: level, Start: 8.76, End: 9.26, Tag: No
Word: beginners, Start: 9.32, End: 9.559999, Tag: No
Word: our, Start: 10.450819, End: 10.84836, Tag: Yes
Word: teachers, Start: 10.84836, End: 11.166393, Tag: No
Word: teach, Start: 11.166393, End: 11.404918, Tag: Yes
Word: you, Start: 11.404918, End: 11.643442, Tag: Yes
Word: words, Start: 11.643442, End: 12.120491, Tag: No
Word: and, Start: 12.120491, End: 12.59754, Tag: Yes
Word

In [68]:
data = {
  "new_sentences": [
    "It is that fundamental belief.",
    "I am my brother's keeper.",
    "I am my sister's keeper that makes this country work.",
    "It's what allows us to pursue our individual dreams and yet still come together as one American family.",
    "E pluribus unum: out of many, one.",
    "Now, even as we speak, there are those who are preparing to divide.",
    "The spin masters, the negative ad peddlers who embrace the politics of anything goes.",
    "Well, I say to them tonight, there is not a liberal America and a conservative America.",
    "There is the United States of America.",
    "There is not a black America and a white America and Latino America and Asian America.",
    "There's the United States of America.",
    "The pundits like to slice and dice our country into red states and blue states, red states for Republicans, blue states for Democrats, but I've got news for them too.",
    "We worship an awesome God in the blue states, and we don't like federal agents poking around in our libraries in the red states.",
    "We coach little league in the blue states.",
    "And, yes, we've got some gay friends in the red states.",
    "There are patriots who oppose the war in Iraq.",
    "And there are patriots who supported the war in Iraq.",
    "We are one people, all of us pledging allegiance to the stars and stripes.",
    "All of us defending the United States of America."
  ],
  "gloss": [
    ["believe"],
    ["me", "brother", "help"],
    ["me", "sister", "help", "make", "country", "work"],
    ["allow", "we", "want", "dream", "but", "still", "together", "one", "america", "family"],
    ["many", "become", "one"],
    ["now", "we", "talk", "some", "people", "prepare", "divide"],
    ["people", "negative", "politics", "accept", "anything"],
    ["me", "tell", "them", "tonight", "no", "liberal", "america", "conservative", "america"],
    ["have", "united", "states", "america"],
    ["no", "black", "america", "white", "america", "latino", "america", "asia", "america"],
    ["have", "united", "states", "america"],
    ["people", "like", "cut", "country", "red", "state", "blue", "state", "red", "republican", "blue", "democrat", "but", "me", "have", "news"],
    ["we", "worship", "god", "blue", "state", "we", "no", "like", "government", "look", "library", "red", "state"],
    ["we", "teach", "baseball", "blue", "state"],
    ["yes", "we", "have", "gay", "friend", "red", "state"],
    ["have", "patriot", "against", "war", "iraq"],
    ["have", "patriot", "support", "war", "iraq"],
    ["we", "one", "people", "all", "we", "promise", "flag"],
    ["all", "we", "protect", "united", "states", "america"]
  ]
}

In [69]:
signer_count = {}
tokens_not_found = set()
for sentence in data['gloss']:
    print(sentence)
    sentence_videos = []
    for token in sentence:
        print(token)
        try:
            row = df.loc[df['gloss'] == token].iloc[0]
        except IndexError:
            print(f"Token not found: {token}")
            tokens_not_found.add(token)
            sentence_videos.append([])
            continue
        video_data = row['instances']
        for video in video_data:
            if video['signer_id'] not in signer_count:
                signer_count[video['signer_id']] = 0
            signer_count[video['signer_id']] += 1
        sentence_videos.append(video_data)

print(tokens_not_found)
print(signer_count)

['believe']
believe
['me', 'brother', 'help']
me
brother
help
['me', 'sister', 'help', 'make', 'country', 'work']
me
sister
help
make
country
work
['allow', 'we', 'want', 'dream', 'but', 'still', 'together', 'one', 'america', 'family']
allow
we
want
dream
but
still
together
one
america
family
['many', 'become', 'one']
many
become
one
['now', 'we', 'talk', 'some', 'people', 'prepare', 'divide']
now
we
talk
some
people
prepare
divide
['people', 'negative', 'politics', 'accept', 'anything']
people
negative
politics
accept
anything
Token not found: anything
['me', 'tell', 'them', 'tonight', 'no', 'liberal', 'america', 'conservative', 'america']
me
tell
them
tonight
no
liberal
Token not found: liberal
america
conservative
Token not found: conservative
america
['have', 'united', 'states', 'america']
have
united
Token not found: united
states
Token not found: states
america
['no', 'black', 'america', 'white', 'america', 'latino', 'america', 'asia', 'america']
no
black
america
white
america
la

In [49]:
import subprocess
import time
from pyngrok import ngrok
import streamlit as st
from streamlit_app import main  # Import the main function from streamlit_app.py

# Start the Streamlit app as a subprocess
port = 8502
streamlit_process = subprocess.Popen(["streamlit", "run", "streamlit_app.py", f"--server.port={port}"])

# Wait a bit for Streamlit to start
time.sleep(5)

# Use ngrok to create a public URL
public_url = ngrok.connect(port)
print(f"You can now view your Streamlit app in your browser.")
print(f"Local URL: http://localhost:{port}")
print(f"Network URL: {public_url}")

# Run the main function in the notebook
main()

# Keep the notebook running and clean up when interrupted
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Shutting down...")
    streamlit_process.terminate()
    ngrok.kill()


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8502
  Network URL: http://192.168.1.109:8502

[youtube] Extracting URL: https://www.youtube.com/watch?v=yJzjyYL8l5Y
[youtube] yJzjyYL8l5Y: Downloading webpage
[youtube] yJzjyYL8l5Y: Downloading ios player API JSON
[youtube] yJzjyYL8l5Y: Downloading tv player API JSON
[youtube] yJzjyYL8l5Y: Downloading m3u8 information
[info] yJzjyYL8l5Y: Downloading 1 format(s): 243+251


2024-08-03 15:43:44.006 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.008 Thread 'MainThread': missing ScriptRunContext


[download] yJzjyYL8l5Y.webm has already been downloaded
You can now view your Streamlit app in your browser.
Local URL: http://localhost:8502
Network URL: NgrokTunnel: "https://a031-107-3-134-29.ngrok-free.app" -> "http://localhost:8502"


2024-08-03 15:43:44.009 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.010 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.012 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.013 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.014 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.015 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.015 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.016 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.017 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.018 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.019 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.020 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.020 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.021 Thread 'MainThread': missing ScriptRunContext
2024-08-03 15:43:44.

Shutting down...  Stopping...

  Stopping...


In [None]:
https://www.youtube.com/watch?v=yJzjyYL8l5Y