# Youtube to transcript

In [1]:
!pip install youtube-transcript-api

Defaulting to user installation because normal site-packages is not writeable
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
from youtube_transcript_api import YouTubeTranscriptApi

# Extract video ID from URL
video_url = "https://www.youtube.com/watch?v=7YHjppyKagc"
video_id = video_url.split("v=")[1].split("&")[0]

# Fetch transcript
transcript = YouTubeTranscriptApi.get_transcript(video_id)

In [3]:
print(*transcript[:10])

{'text': "let's let's get started", 'start': 0.52, 'duration': 6.2} {'text': 'um welcome', 'start': 4.72, 'duration': 4.36} {'text': "everybody very happy you're here this is", 'start': 6.72, 'duration': 4.919} {'text': 'the first talk in', 'start': 9.08, 'duration': 5.639} {'text': 'the uh neuros symbolic AI Journal', 'start': 11.639, 'duration': 6.681} {'text': "webinar series and um I'm very happy", 'start': 14.719, 'duration': 5.841} {'text': 'that Frank is here to', 'start': 18.32, 'duration': 5.4} {'text': 'present thanks a lot Frank over to you', 'start': 20.56, 'duration': 4.559} {'text': "and it's great to see so many of the", 'start': 23.72, 'duration': 3.84} {'text': 'people that I know uh to be online so', 'start': 25.119, 'duration': 4.56}


In [4]:
transcript[:5]

[{'text': "let's let's get started", 'start': 0.52, 'duration': 6.2},
 {'text': 'um welcome', 'start': 4.72, 'duration': 4.36},
 {'text': "everybody very happy you're here this is",
  'start': 6.72,
  'duration': 4.919},
 {'text': 'the first talk in', 'start': 9.08, 'duration': 5.639},
 {'text': 'the uh neuros symbolic AI Journal',
  'start': 11.639,
  'duration': 6.681}]

In [5]:
# Combine all transcript text into one string
transcript_text = " ".join([entry['text'].lower() for entry in transcript])

# Topics from CSV and its matching

In [6]:
import pandas as pd
# --- Step 1: Load Topics from CSV ---
topics_df = pd.read_csv("topics.csv")  # Assumes a column with topic names
topics = topics_df.iloc[:, 1].dropna().str.lower().tolist()  # Clean and lowercase

In [7]:
# --- Step 3: Match Topics in Transcript ---
matched_topics = [topic for topic in topics if topic in transcript_text]

In [8]:
matched_topics

['probability',
 'machine learning',
 'ai',
 'deep learning',
 'knowledge graphs',
 'r',
 'cities',
 'stem',
 'neural networks',
 'generative ai',
 'large language models',
 'semantic web',
 'space',
 'classes',
 'properties',
 'relationships',
 'semantics',
 'inference',
 'knowledge graph',
 'domain',
 'transitivity',
 'disjointness',
 'graph',
 'intersection',
 'axiom',
 'owl',
 'class',
 'object',
 'predicate',
 'property',
 'subject',
 'triple',
 'type',
 'construct',
 'explain',
 'schema',
 'uri']

# Stopword Removal Setup Code

In [9]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Make sure you download NLTK resources (only once)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# --- Step 3: Remove Stop Words ---
stop_words = set(stopwords.words('english'))

In [12]:
def clean_text(text):
    words = word_tokenize(text)
    filtered = [word for word in words if word.isalnum() and word not in stop_words]
    return " ".join(filtered)

# topic match from stop word removed Data

In [13]:
# Cleaned transcript and topics
cleaned_transcript = clean_text(transcript_text)
cleaned_topics = [clean_text(topic) for topic in topics if clean_text(topic)]
cleaned_matched_topics = [topic for topic in cleaned_topics if topic in cleaned_transcript]

In [14]:
cleaned_matched_topics[:10]

['probability',
 'machine learning',
 'ai',
 'deep learning',
 'knowledge graphs',
 'r',
 'cities',
 'stem',
 'neural networks',
 'generative ai']

# use KG directly to find cleaned matched topics

In [15]:
!pip install SPARQLWrapper

Defaulting to user installation because normal site-packages is not writeable
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [16]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [17]:
# --- SPARQL Query Setup ---
sparql = SPARQLWrapper("http://arsenal.cs.wright.edu:3030/currkg/sparql")
sparql.setQuery("""
PREFIX edu-ont: <https://edugate.cs.wright.edu/lod/ontology/>

SELECT ?topic ?topicTitle
WHERE {
  ?topic a edu-ont:Topic ;
         edu-ont:asString ?topicTitle;
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

In [18]:
# Extract topic labels
topics_raw_kg = [result['topicTitle']['value'].lower() for result in results["results"]["bindings"]]

In [19]:
topics_raw_kg[:10]

['coding',
 'python',
 'pandas',
 'data science careers',
 'probability',
 'case studies',
 'machine learning',
 'natural language processing (nlp)',
 'math fundamentals',
 'statistics']

In [20]:
kg_cleaned_topics = [clean_text(topic) for topic in topics_raw_kg if clean_text(topic)]
kg_cleaned_topics[:10]

['coding',
 'python',
 'pandas',
 'data science careers',
 'probability',
 'case studies',
 'machine learning',
 'natural language processing nlp',
 'math fundamentals',
 'statistics']

In [21]:
kg_matched_topics = [topic for topic in kg_cleaned_topics if topic in clean_text(transcript_text)]
kg_matched_topics[:10]

['probability',
 'machine learning',
 'ai',
 'deep learning',
 'knowledge graphs',
 'r',
 'cities',
 'stem',
 'neural networks',
 'generative ai']

## Get Exact transcript entitity to match it to temporal extent

In [22]:
import re
# Match against each transcript entry
kg_matched_transcripts_entries = []

for entry in transcript:
    entry_text_cleaned = clean_text(entry['text'])  # Cleaned version of transcript
    for original_topic, cleaned_topic in zip(topics, cleaned_topics):
        # Create a regex pattern for exact whole word or phrase match
        pattern = r'\b' + re.escape(cleaned_topic) + r'\b'
        if re.search(pattern, entry_text_cleaned):
            kg_matched_transcripts_entries.append({
                "matched_topic": original_topic,
                "transcript_text": entry['text'],
                "start": entry['start'],
                "duration": entry['duration']
            })

In [23]:
# Convert to DataFrame for easy viewing/export
import pandas as pd
matched_df = pd.DataFrame(kg_matched_transcripts_entries)

In [24]:
matched_df[:10]

Unnamed: 0,matched_topic,transcript_text,start,duration
0,knowledge graphs,knowledge graphs but that's just an,98.56,3.4
1,machine learning,of doing machine learning on symbolic,132.0,4.44
2,disjunction,graph in their abstract or title over,140.879,5.561
3,cities,companies and people and cities and,182.599,4.161
4,cities,companies that people work for cities,185.12,4.119
5,space,dimensional Vector space uh where,201.44,5.879
6,disjunction,somehow what is similar in the graph,204.799,4.681
7,space,space um and then the question is well,211.08,7.079
8,space,vectors in the space in this case a drug,248.76,4.0
9,disjunction,representation uh in this case a graph,291.36,4.72
