In [4]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
import warnings
import subprocess




# Suppress SSL warnings for testing
warnings.filterwarnings("ignore", message="Unverified HTTPS request")


In [6]:
class ContentProcessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def is_youtube_url(self, url):
        parsed = urlparse(url)
        return any(domain in parsed.netloc for domain in ['youtube.com', 'youtu.be'])

    def get_youtube_id(self, url):
        parsed = urlparse(url)
        if parsed.netloc == 'youtu.be':
            return parsed.path[1:]
        if 'youtube.com' in parsed.netloc:
            if 'v' in parse_qs(parsed.query):
                return parse_qs(parsed.query)['v'][0]
            if parsed.path.startswith('/embed/'):
                return parsed.path.split('/')[2]
            if parsed.path.startswith('/watch/'):
                return parsed.path.split('/')[2]
        return None

    def extract_youtube_content(self, url):
        try:
            video_id = self.get_youtube_id(url)
            if not video_id:
                return {"error": "Invalid YouTube URL", "url": url}

            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            content = ' '.join([entry['text'] for entry in transcript])
            
            # Get video title
            embed_url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
            response = requests.get(embed_url)
            title = response.json().get('title', f"Video {video_id}")

            return {
                "title": title,
                "content": content,
                "url": url
            }
        except Exception as e:
            return {"error": f"YouTube error: {str(e)}", "url": url}

    def extract_web_content(self, url):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept-Language': 'en-US,en;q=0.9'
            }
            
            response = requests.get(url, headers=headers, timeout=10, verify=False)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'lxml')
            
            # Clean page
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form', 'button', 'iframe', 'noscript']):
                element.decompose()

            # Extract content
            article = soup.find('article') or soup.find('main') or soup.find('div', role='main')
            if article:
                content = article.get_text(separator='\n', strip=True)
            else:
                body = soup.find('body')
                paragraphs = body.find_all(['p', 'h1', 'h2', 'h3']) if body else []
                content = '\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])

            content = re.sub(r'\n{3,}', '\n\n', content.strip())
            return {
                "title": soup.title.string.strip() if soup.title else "Untitled",
                "content": content,
                "url": url
            }
        except Exception as e:
            return {"error": f"Web extraction error: {str(e)}", "url": url}

    def extract_content_from_url(self, url):
        if self.is_youtube_url(url):
            return self.extract_youtube_content(url)
        return self.extract_web_content(url)

    def extract_key_sentences(self, content, max_sentences=20):
        sentences = sent_tokenize(content)
        if not sentences:
            return []

        word_freq = {}
        for sentence in sentences:
            words = sentence.lower().split()
            for word in words:
                if word not in self.stop_words and len(word) > 3:
                    word_freq[word] = word_freq.get(word, 0) + 1

        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            words = sentence.lower().split()
            score = sum(word_freq.get(word, 0) for word in words if word not in self.stop_words)
            if len(words) > 0:
                score /= len(words)
            sentence_scores[i] = score

        top_indices = sorted(sentence_scores.keys(), key=lambda i: sentence_scores[i], reverse=True)[:max_sentences]
        top_indices.sort()
        return [sentences[idx] for idx in top_indices]


In [7]:
processor = ContentProcessor()
url = "https://jasminbharadiya.medium.com/the-art-of-data-extraction-enhancing-web-content-analysis-with-large-language-models-65e6f3dfbf1d"
url = "https://www.youtube.com/watch?v=kRybqfQ9zCE&list=WL&index=16&t=64s&pp=gAQBiAQB"
content_data = processor.extract_content_from_url(url)

In [8]:
content_data

{'title': "2023's Most Beautiful Movie About Love",
 'content': 'This video is brought to you by AdamandEve.com.\xa0\nUse my code LSOO for 50% off 1 item + Free\xa0Shipping in US & Canada Who do you think they are to each other?\xa0 This is the question that opens Past Lives,\xa0\nthe first feature movie by Celine Song,\xa0\xa0 as we follow a voyeuristic gaze at three strangers\xa0\nsitting across the bar. They won’t, of course,\xa0\xa0 be strangers for long. Soon, we will learn\xa0\nexactly who they are and how their lives are\xa0\xa0 intertwined. And yet, the question remains\xa0\nsignificant. For it tells us that whatever\xa0\xa0 expectations we might have, things will not\xa0\nbe as they appear at first sight. Indeed,\xa0\xa0 Past Lives is a movie that goes on to quietly\xa0\nyet significantly subvert many tropes of the\xa0\xa0 romantic genre, but it also reconstructs them into\xa0\nsomething new, something that I wasn’t expecting,\xa0\xa0 and which to me, elevates it to one of, if

In [9]:
if "error" in content_data:
    error = content_data["error"]
    if "YouTube" in error:
        error += ". Ensure captions are enabled."
    print( {"error": error, "title": "Error", "url": url, "flashcards": []})

key_sentences = processor.extract_key_sentences(content_data["content"])

In [10]:
key_sentences[:2]

['This is the question that opens Past Lives,\xa0\nthe first feature movie by Celine Song,\xa0\xa0 as we follow a voyeuristic gaze at three strangers\xa0\nsitting across the bar.',
 'Indeed,\xa0\xa0 Past Lives is a movie that goes on to quietly\xa0\nyet significantly subvert many tropes of the\xa0\xa0 romantic genre, but it also reconstructs them into\xa0\nsomething new, something that I wasn’t expecting,\xa0\xa0 and which to me, elevates it to one of, if not\xa0\nthe most beautiful love story in a long time.']

In [11]:
if not key_sentences:
    print( {"error": "No meaningful content found", "title": content_data["title"], "url": url, "flashcards": []})


In [12]:
content_data["content"] = " ".join(key_sentences)


In [None]:
system_message = """You are an educational content expert. Create flashcards in this EXACT format:
[
    {"question": "...", "answer": "..."},
    // more items
]
ONLY output valid JSON array with question/answer pairs. Follow these rules:
1. Questions should test understanding of key concepts
2. Answers must be concise but complete
3. Never add explanations or markdown
4. Keep in mind of the outout {} brackets
4. Keep in mind of [,] brackets. Add [ at start of response and ] at end
5. Keep in mind of the amount of flashcards to generate"""

user_message = f"Content Title: {content_data['title']}\nContent Excerpt:\n{content_data['content']}\n\nGenerate exactly {2} flashcards:"


# Run Ollama LLM locally
prompt = f"{system_message}\n{user_message}"

In [93]:
prompt

'You are an educational content expert. Create flashcards in this EXACT format:\n[\n    {"question": "...", "answer": "..."},\n    // more items\n]\nONLY output valid JSON array with question/answer pairs. Follow these rules:\n1. Questions should test understanding of key concepts\n2. Answers must be concise but complete\n3. Never add explanations or markdown\n4. Keep in mind of the outout {} brackets\n4. Keep in mind of [,] brackets. Add [ at start of response and ] at end\n5. Keep in mind of the amount of flashcards to generate\nContent Title: 2023\'s Most Beautiful Movie About Love\nContent Excerpt:\nThis is the question that opens Past Lives,\xa0\nthe first feature movie by Celine Song,\xa0\xa0 as we follow a voyeuristic gaze at three strangers\xa0\nsitting across the bar. Indeed,\xa0\xa0 Past Lives is a movie that goes on to quietly\xa0\nyet significantly subvert many tropes of the\xa0\xa0 romantic genre, but it also reconstructs them into\xa0\nsomething new, something that I wasn

In [94]:
len(prompt)

3177

In [None]:
system_message = """You are an educational content expert. Create a summary of given content 
and return as string. """

user_message = f"Content Title: {content_data['title']}\nContent Excerpt:\n{content_data['content']}\n"


# Run Ollama LLM locally
prompt = f"{system_message}\n{user_message}"

In [99]:
import ollama

# Initialize the Ollama client
client = ollama.Client()

# Define the model and the input prompt
model = "mistral"  # Replace with your model name

# Send the query to the model
response = client.generate(model=model, prompt=prompt)

# Process JSON response
result = response.response
cleaned_text = result.strip() 

In [100]:
cleaned_text

'Title: 2023\'s Most Beautiful Movie About Love: Past Lives\n   Content Excerpt: "Past Lives" by Celine Song challenges and redefines romantic genre tropes, offering a beautiful and profound exploration of love. The movie can be divided into three chapters, each tracing the relationship between Nora and Hae Sung over decades. Despite the lingering questions about the choices made in life, "Past Lives" subtly avoids typical romantic resolution, instead delving into the complexities of human connection, the search for meaning, and the exploration of alternate lives and loves. The movie is as much about Nora\'s relationship with her husband Arthur as it is with Hae Sung. The unique aspect of "Past Lives" lies in its refusal to resolve between dreamy idealism and stark realism, making it a standout film in the romantic genre.'

In [98]:
import ollama

# Initialize the Ollama client
client = ollama.Client()

# Define the model and the input prompt
model = "mistral"  # Replace with your model name

# Send the query to the model
response = client.generate(model=model, prompt=prompt)

# Process JSON response
result = response.response
cleaned_text = result.strip() 
json_str = cleaned_text
flashcards = json.loads(json_str)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [96]:
flashcards

[{'question': "What is the title of Celine Song's first feature movie?",
  'answer': 'Past Lives'},
 {'question': 'How many chapters can Past Lives be roughly divided into?',
  'answer': 'Three'},
 {'question': 'In the first chapter of Past Lives, who are the main characters and where do they live?',
  'answer': 'Nora and Hae Sung. They live in Korea.'},
 {'question': "What is the setting of the second chapter of Past Lives' story?",
  'answer': 'New York'},
 {'question': 'Who is Nora married to in the second chapter of Past Lives?',
  'answer': 'Arthur'},
 {'question': 'What is the main theme that Past Lives subverts about romantic relationships?',
  'answer': "The expectation of making the 'right' choice in a relationship."},
 {'question': 'What feeling does Past Lives convey through its story and dialogues that is not quite captured by words like doubt, longing or regret?',
  'answer': 'A bittersweet melancholy'},
 {'question': 'What term is introduced in the third chapter of Past L

In [86]:
type(flashcards)

list

In [78]:
result = response.response

In [79]:
result


'[\n    {"question": "What is the name of the first feature movie by Celine Song?", "answer": "Past Lives"},\n    {"question": "What do the three strangers sitting across the bar in Past Lives represent?", "answer": "Their past lives or past connections"},\n    {"question": "How many chapters does Past Lives have?", "answer": "Three"},\n    {"question": "What is Nora\'s relationship with Hae Sung as an adolescent?", "answer": "Secret crush"},\n    {"question": "Where do Nora and Hae Sung live at different stages of their lives?", "answer": "Korea and New York"},\n    {"question": "How many times does the 12-year gap appear in Past Lives?", "answer": "Two"},\n    {"question": "What is Nora\'s occupation in the second chapter of Past Lives?", "answer": "Playscript writer or playwright"},\n    {"question": "Who does Hae Sung still live with after Nora marries Arthur?", "answer": "Himself, in Korea"},\n    {"question": "Is the love story between Nora and Hae Sung fully developed in Past Li

In [80]:
cleaned_text = result.strip() 

print(cleaned_text)

[
    {"question": "What is the name of the first feature movie by Celine Song?", "answer": "Past Lives"},
    {"question": "What do the three strangers sitting across the bar in Past Lives represent?", "answer": "Their past lives or past connections"},
    {"question": "How many chapters does Past Lives have?", "answer": "Three"},
    {"question": "What is Nora's relationship with Hae Sung as an adolescent?", "answer": "Secret crush"},
    {"question": "Where do Nora and Hae Sung live at different stages of their lives?", "answer": "Korea and New York"},
    {"question": "How many times does the 12-year gap appear in Past Lives?", "answer": "Two"},
    {"question": "What is Nora's occupation in the second chapter of Past Lives?", "answer": "Playscript writer or playwright"},
    {"question": "Who does Hae Sung still live with after Nora marries Arthur?", "answer": "Himself, in Korea"},
    {"question": "Is the love story between Nora and Hae Sung fully developed in Past Lives?", "answ

In [81]:
json_str = cleaned_text

In [82]:
flashcards = json.loads(json_str)

In [83]:
flashcards

[{'question': 'What is the name of the first feature movie by Celine Song?',
  'answer': 'Past Lives'},
 {'question': 'What do the three strangers sitting across the bar in Past Lives represent?',
  'answer': 'Their past lives or past connections'},
 {'question': 'How many chapters does Past Lives have?', 'answer': 'Three'},
 {'question': "What is Nora's relationship with Hae Sung as an adolescent?",
  'answer': 'Secret crush'},
 {'question': 'Where do Nora and Hae Sung live at different stages of their lives?',
  'answer': 'Korea and New York'},
 {'question': 'How many times does the 12-year gap appear in Past Lives?',
  'answer': 'Two'},
 {'question': "What is Nora's occupation in the second chapter of Past Lives?",
  'answer': 'Playscript writer or playwright'},
 {'question': 'Who does Hae Sung still live with after Nora marries Arthur?',
  'answer': 'Himself, in Korea'},
 {'question': 'Is the love story between Nora and Hae Sung fully developed in Past Lives?',
  'answer': 'No'},
 