# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

# Load the announcements workbook (file too big to be exhibited raw on github).
df = pd.read_excel("corpo_announcements.xlsx")

# DATE is the first 10 characters of the string form of last_update
# (presumably the YYYY-MM-DD part of a timestamp -- confirm against the data).
df['DATE'] = df['last_update'].apply(lambda x: str(x)[:10])
# DATE is appended to the selection so the column derived above is not
# silently dropped; the pre-existing columns keep their original order.
relevant_columns = ['HEADLINE', 'NEWSSUB', 'MORE', 'NSURL', 'SOURCE', 'DATE']
df = df[relevant_columns]


# Identify links to transcripts and audio files using regular expressions.
# Both patterns are unanchored at the end, so they accept any http(s) URL
# that contains ".pdf" / ".mp3" somewhere after the scheme.
transcript_regex = r'https?://.*\.pdf'
audio_regex = r'https?://.*\.mp3'

def extract_links(row, timeout=30):
    """Classify a row's SOURCE url as a transcript (PDF) or audio link.

    The url is first tested against ``transcript_regex`` and ``audio_regex``.
    If neither matches, the url is requested and the response's
    ``Content-Type`` header decides the classification.

    Args:
        row: Mapping-like record with a 'SOURCE' entry (a DataFrame row when
            used with ``df.apply(extract_links, axis=1)``). It is modified
            in place.
        timeout: Seconds to wait for the server before giving up on the
            Content-Type probe.

    Returns:
        The same ``row``, with 'TRANSCRIPT_LINK' or 'AUDIO_LINK' set when the
        url could be classified; otherwise unchanged.
    """
    url = row['SOURCE']

    # Empty spreadsheet cells arrive as NaN/None rather than str; there is
    # nothing to classify and re.match would raise TypeError on them.
    if not isinstance(url, str) or not url.strip():
        return row

    if re.match(transcript_regex, url):
        row['TRANSCRIPT_LINK'] = url
        return row
    if re.match(audio_regex, url):
        row['AUDIO_LINK'] = url
        return row

    # The url itself is inconclusive: ask the server what it serves.
    # stream=True defers the body download, since only headers are needed.
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            status_code = response.status_code
            # The header may be absent; fall back to '' so the substring
            # checks below cannot fail on None.
            content_type = (response.headers.get('Content-Type') or '').lower()
    except requests.RequestException:
        # Unreachable or malformed url: leave the row unclassified instead
        # of aborting the whole DataFrame.apply run.
        return row

    if status_code == 200:
        if 'pdf' in content_type:
            row['TRANSCRIPT_LINK'] = url
        elif 'audio' in content_type:
            row['AUDIO_LINK'] = url
    return row


# Classify every row's SOURCE url (adds TRANSCRIPT_LINK / AUDIO_LINK where found).
df = df.apply(extract_links, axis=1)

# Compile the href patterns once instead of on every loop iteration.
transcript_href = re.compile(transcript_regex)
audio_href = re.compile(audio_regex)

links = []

# Iterate through each URL in the dataframe
for url in df['NSURL']:
    # Always append exactly one entry per row so links_df stays row-aligned
    # with df, even when the page cannot be fetched.
    entry = {'Transcript': None, 'Audio': None}
    links.append(entry)

    # Empty cells arrive as NaN/None; there is no page to fetch.
    if not isinstance(url, str) or not url.strip():
        continue

    # Send a GET request to the URL; a dead or malformed url must not abort
    # the whole run, so the row simply keeps its None placeholders.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        continue

    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.content, 'html.parser')
    # Look for an <a> tag whose href matches each pattern
    transcript_link = soup.find('a', href=transcript_href)
    audio_link = soup.find('a', href=audio_href)
    if transcript_link:
        entry['Transcript'] = transcript_link['href']
    if audio_link:
        entry['Audio'] = audio_link['href']


# Reuse df's index so the column-wise concat pairs rows by position rather
# than relying on df having a default RangeIndex; fixing the columns keeps
# the output schema stable even when there are no rows.
links_df = pd.DataFrame(links, index=df.index, columns=['Transcript', 'Audio'])


df = pd.concat([df, links_df], axis=1)


# Write the combined table without the index column.
df.to_excel("audio_ts.xlsx", index=False)