In [1]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import re
from youtube_transcript_api import YouTubeTranscriptApi

## Get the list of resources

In [2]:
list_of_resources = pd.read_csv('/Users/kenlam/Desktop/Data science/ML projects/RAG_resume/model/list_resources.csv')
list_of_resources.head()

Unnamed: 0,name,link,website,form
0,10 Resume Writing Tips To Help You Land a Posi...,https://www.indeed.com/career-advice/resumes-c...,Indeed,article
1,How to Write a Résumé That Stands Out,https://hbr.org/2014/12/how-to-write-a-resume-...,Harvard Business Review,article
2,18 Résumé Writing Tips to Help You Stand Out,https://www.wsj.com/articles/18-resume-writing...,WSJ,article
3,Words To Avoid and Include on a Resume,https://www.indeed.com/career-advice/resumes-c...,Indeed,article
4,Everything You Need To Know About Job Applicat...,https://www.indeed.com/career-advice/finding-a...,Indeed,article


### Divide the list into articles and youtube

In [3]:
list_of_articles = list_of_resources[list_of_resources['form'] == 'article']
list_of_yt = list_of_resources[list_of_resources['form'] == 'video']

## Gather articles from articles giving resume advice

In [12]:
# Get a list of urls
list_of_urls = list(list_of_articles['link'])

In [39]:
def detect_block(url):
    """Detects access to the article's content"""
    warning_phrases = ["You have been blocked", "Page not found"]

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.get_text(strip = True).lower()
    
    for phrase in warning_phrases:
        if phrase.lower() in content:
            return True
    return False

In [40]:
def extract_content(url):
    """Extracts the raw content of an article if didn't get block"""
    try:
        # If there's no warning messages
        if detect_block(url):
            print(f"Warning detected for {url}")
            return None
        
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.get_text(strip = True)
        print(f"Successfully extract content for {url}")
        return content
    except requests.RequestException as e:
        print(f"Failed to scrape {url}: {e}")
        return None

In [41]:
def clean_text(content):
    """Cleans the text by removing unecessary characters like whitespaces"""
    cleaned_content = re.sub(r'\s+', ' ', content)  # Remove extra whitespace
    cleaned_content = re.sub(r'<.*?>', '', content)  # Remove HTML tags)
    return cleaned_content.strip()

In [44]:
url = list_of_urls[10]
content = extract_content(url)
cleaned_content = clean_text(content)
print(cleaned_content)

Successfully extract content for https://extendedstudies.ucsd.edu/news-and-events/division-of-extended-studies-blog/how-to-land-a-job-in-2024-resume-tips
12 Resume Best Practices from a Career Advisor: How to Make Your Resume Stand Out in 2024 | Continuing Education | UC San Diego Division of Extended StudiesSkip to Contentshopping_cartsearchtwitterinstagramfacebookgooglelinkedinyoutubemailthumbs-upphoneChat Bubbleenvelope iconLocation Pin IconDownload IconcalendarmobileNew Campus Location:As of June 17th, 2024, we have moved to our brand-new campus at 8980 Villa La Jolla Drive, directly across the street from UC San Diego. Student Services and International Programs are on-site and ready to assist students. For directions or general inquiries, please contact us at (858) 534-3400. We look forward to welcoming you as we usher in a new era of innovation to serve our vibrant San Diego community and beyond.View Additional InformationToggle main menu visibilityCollaboratePartner With UsCust

In [46]:
list_articles_content = []

for url in list_of_urls:
    content = extract_content(url)
    if content:
        cleaned_content = clean_text(content)
        list_articles_content.append(cleaned_content)

Successfully extract content for https://hbr.org/2014/12/how-to-write-a-resume-that-stands-out
Successfully extract content for https://www.linkedin.com/pulse/24-resume-tips-get-you-hired-2024-steph-cartwright-cprw-5z3kc/
Failed to scrape https://www.myperfectresume.com/career-center/resumes/how-to/6-resume-trends-you-should-follow/t3: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Successfully extract content for https://extendedstudies.ucsd.edu/news-and-events/division-of-extended-studies-blog/how-to-land-a-job-in-2024-resume-tips
Successfully extract content for https://www.reddit.com/r/cscareerquestions/comments/18wk5ul/resume_advice_thread_january_02_2024/
Successfully extract content for https://www.capitalonecareers.com/6-resume-tips-from-a-tech-recruiter-cdev-101
Successfully extract content for https://careerfoundry.com/en/blog/career-change/best-tech-resume-guide-tips-examples/
Successfully extract content for https://www.pathrise

In [None]:
# Save the outputs in a corpus.txt
with open("corpus.txt", 'w', encoding='utf-8') as f:
    for article in list_articles_content:
        f.write(article + '\n\n')

## Gather transcriptions from YouTube videos giving resume advice

In [4]:
list_of_yt.head()

Unnamed: 0,name,link,website,form
19,7 Must-Know Resume Tips From a Former J.P. Mor...,https://www.youtube.com/watch?v=Cm4YSClxAI4,YouTube,video
20,"This resume got me offers from Google, Microso...",https://www.youtube.com/watch?v=kCgn-7NBPSs,YouTube,video
21,Write an Incredible Resume: 5 Golden Rules!,https://www.youtube.com/watch?v=Tt08KmFfIYQ,YouTube,video
22,How to Drastically Improve Your RESUME with 3 ...,https://www.youtube.com/watch?v=rvKNhhhzkP8,YouTube,video
23,How to Make An Impressive Resume for FREE (in ...,https://www.youtube.com/watch?v=7apj4sVvbro,YouTube,video


In [5]:
# Get the YT link from each url
def filter_url(link):
    match = re.search(r'v=([^&]+)', link)
    if match:
        return match.group(1)
    return None

In [6]:
# Extract the transcript text
def extract_text(yt_id):
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(yt_id)
        transcript_text = ''.join(snippet['text'] for snippet in transcript_list)
        return transcript_text
    except Exception as e:
        print(f"Errors in fetching {yt_id}: {e}")
        return None

In [16]:
yt_links = list_of_yt['link']
corpus_text = ""
for link in yt_links:
    print("filtering the url")
    yt_id = filter_url(link)
    print("extracting the text")
    text = extract_text(yt_id)
    if text:
        corpus_text = corpus_text + text +'\n\n'
    else:
        print(f"Cannot fetch {yt_id}")

filtering the url
extracting the text
Errors in fetching Cm4YSClxAI4: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=Cm4YSClxAI4! This is most likely caused by:

No transcripts were found for any of the requested language codes: ('en',)

For this video (Cm4YSClxAI4) transcripts are available in the following languages:

(MANUALLY CREATED)
 - en-US ("English (United States)")[TRANSLATABLE]

(GENERATED)
None

(TRANSLATION LANGUAGES)
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Corsican")
 - hr ("Croatian")
 - cs ("Czech")
 - da ("Danish")
 - dv ("Divehi")
 - nl ("Dutch")
 - en ("English")
 - e

In [17]:
print(corpus_text)

hey pirates pika here today i'mrevealing the resume i used to apply andget offers from companies like googlemicrosoft amazon etc i've redesigned ita bit to fit into the context of thisvideo but it's almost identical to theone i used to apply to google and laterreceive the 360k offercreating a resume is not difficult butyou do need to incorporate the rightstrategy to craft an effective one forthat reason i'll share my 10 tips on howto write an effective softwareengineering resume by using mine as anexample just an fyi i also used to workas a software engineering mentor at afew educational startups includingspringboard for over a year and becausei've reviewed and edited countlessresumes throughout the mentorship i knowprecisely the common mistakes candidatesoften make trust me on this implementingmy techniques into your resume willsignificantly increase traction from therecruiters that's how i help dozens ofpeople get offers from companies likefang i also have a special announcementregar

In [22]:
# Remove whitespaces and other extra characters
def clean_text(text):
    # Remove leading and trailing whitespaces
    text = text.strip()
    
    # Replace multiple spaces with a single space but preserve double newlines
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    
    # Remove special characters
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    
    return text

In [24]:
cleaned_corpus_text = clean_text(corpus_text)

In [25]:
# Save the outputs in a corpus.txt
with open("corpus_video.txt", 'w', encoding='utf-8') as f:
        f.write(cleaned_corpus_text)