In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape data from a given URL
def scrape_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extracting company name
    company_name = soup.find('h1', class_='company-name').text if soup.find('h1', class_='company-name') else None

    # Extracting sector and industry
    sector = soup.select('div.company-info span.sector')[0].text.strip() if soup.select('div.company-info span.sector') else None
    industry = soup.select('div.company-info span.industry')[0].text.strip() if soup.select('div.company-info span.industry') else None

    # Extracting market capitalization
    market_cap = soup.find('span', class_='market-cap').text if soup.find('span', class_='market-cap') else None

    # Extracting revenue
    revenue = soup.find('span', class_='revenue').text if soup.find('span', class_='revenue') else None

    # Add more extraction logic for other relevant information

    return {
        'Company Name': company_name,
        'Sector': sector,
        'Industry': industry,
        'Market Cap': market_cap,
        'Revenue': revenue,
        # Add more keys as needed
    }


In [3]:
# Function to save data to CSV
def save_to_csv(data, filename='output.csv'):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f'Data saved to {filename}')

In [5]:
# List of query links
query_links = [
    'https://www.forbes.com/companies/canon/?sh=1b8152a5427b',
    'https://www.canoo.com/',
    'https://en.wikipedia.org/wiki/Canoo',
    'https://techcrunch.com/tag/canoo/',
    'https://prophet.com/case-studies/canoo-jump-starting-a-new-electric-vehicle-brand/',
    'https://xdinnovation.com/wp-content/uploads/2020/01/15249-1_DS_Canoo_Case_Study_Partner_Hi.pdf',
    'https://techcrunch.com/2021/08/16/ev-startup-canoo-is-gearing-up-for-production-in-oklahoma-factory/',
    'https://dcf.fm/blogs/blog/goev-history-mission-ownership',
    'https://rivian.com/',
    'https://arrival.com/',
    'https://ree.auto/',
    'https://sonomotors.com/',
    'https://www.gojoauto.com/used-inventory/index.htm',
    'https://www.ford.com/trucks/f150/f150-lightning/',
    'https://www.gm.com/commitments/electrification',
    'https://investors.canoo.com/',
    'https://www.greencarreports.com/',
    'https://www.globaldata.com/company-profile/canoo-inc/',
    'https://investors.canoo.com/',
    'https://investors.canoo.com/',
    'https://www.capitalone.com/auto-financing/',
    'https://mapandfire.com/brand-guidebook/',
    'https://www.secform4.com/insider-trading/1750153.htm',
    'https://prophet.com/case-studies/canoo-jump-starting-a-new-electric-vehicle-brand/',
    'https://www.fool.com/investing/2023/08/27/canoos-genius-strategy/',
    'https://investors.canoo.com/',
    'https://prophet.com/case-studies/canoo-jump-starting-a-new-electric-vehicle-brand/',
    'https://investors.canoo.com/news-presentations/press-releases/detail/113/canoo-has-entered-into-a-45-million-convertible-preferred',
]

In [6]:
# Dictionary to store scraped data
scraped_data = {}

# Loop through query links and scrape data
for i, link in enumerate(query_links, 1):
    print(f"Scraping data from link {i}/{len(query_links)}: {link}")
    scraped_data[link] = scrape_data(link)

# Save data to CSV 
save_to_csv(scraped_data, 'scraped_data.csv')

Scraping data from link 1/28: https://www.forbes.com/companies/canon/?sh=1b8152a5427b
Scraping data from link 2/28: https://www.canoo.com/
Scraping data from link 3/28: https://en.wikipedia.org/wiki/Canoo
Scraping data from link 4/28: https://techcrunch.com/tag/canoo/
Scraping data from link 5/28: https://prophet.com/case-studies/canoo-jump-starting-a-new-electric-vehicle-brand/
Scraping data from link 6/28: https://xdinnovation.com/wp-content/uploads/2020/01/15249-1_DS_Canoo_Case_Study_Partner_Hi.pdf
Scraping data from link 7/28: https://techcrunch.com/2021/08/16/ev-startup-canoo-is-gearing-up-for-production-in-oklahoma-factory/
Scraping data from link 8/28: https://dcf.fm/blogs/blog/goev-history-mission-ownership
Scraping data from link 9/28: https://rivian.com/
Scraping data from link 10/28: https://arrival.com/
Scraping data from link 11/28: https://ree.auto/
Scraping data from link 12/28: https://sonomotors.com/
Scraping data from link 13/28: https://www.gojoauto.com/used-inventor

In [9]:
# Specify the path to your CSV file
csv_file_path = 'scraped_data.csv'

# Load CSV data into a DataFrame
df = pd.read_csv(csv_file_path)

# Display the DataFrame
df.head()

Unnamed: 0,https://www.forbes.com/companies/canon/?sh=1b8152a5427b,https://www.canoo.com/,https://en.wikipedia.org/wiki/Canoo,https://techcrunch.com/tag/canoo/,https://prophet.com/case-studies/canoo-jump-starting-a-new-electric-vehicle-brand/,https://xdinnovation.com/wp-content/uploads/2020/01/15249-1_DS_Canoo_Case_Study_Partner_Hi.pdf,https://techcrunch.com/2021/08/16/ev-startup-canoo-is-gearing-up-for-production-in-oklahoma-factory/,https://dcf.fm/blogs/blog/goev-history-mission-ownership,https://rivian.com/,https://arrival.com/,...,https://www.ford.com/trucks/f150/f150-lightning/,https://www.gm.com/commitments/electrification,https://investors.canoo.com/,https://www.greencarreports.com/,https://www.globaldata.com/company-profile/canoo-inc/,https://www.capitalone.com/auto-financing/,https://mapandfire.com/brand-guidebook/,https://www.secform4.com/insider-trading/1750153.htm,https://www.fool.com/investing/2023/08/27/canoos-genius-strategy/,https://investors.canoo.com/news-presentations/press-releases/detail/113/canoo-has-entered-into-a-45-million-convertible-preferred
0,Canon | Company Overview & News,,Canoo - Wikipedia,Canoo | TechCrunch,403 Forbidden,,EV startup Canoo is gearing up for production ...,,Rivian - Electric Adventure Vehicles,Arrival | Zero-emission solutions,...,2023 Ford F-150Â® LightningÂ® | Electric Truck...,Committing to an All-Electric Future | General...,Investor Relations :: Canoo Inc. (GOEV),Green Car Reports - Hybrid and Electric Car Ne...,Canoo Inc Company Profile - Overview - GlobalData,Auto Financing,403 Forbidden,403 Forbidden,Canoo's Genius Strategy? | The Motley Fool,Canoo has entered into a $45 million Convertib...


In [13]:
import spacy

# Load spaCy model
nlp = spacy.load('en_core_web_md')

# Function to vectorize text using spaCy
def vectorize_text(text):
    doc = nlp(text)
    return doc.vector.tolist()

# Assuming 'Description' is a column containing text data in your DataFrame
df['https://rivian.com/'] = df['https://rivian.com/'].apply(vectorize_text)

# Save the DataFrame with vectors to a new CSV file
df.to_csv('canoo_financial_data_with_vectors.csv', index=False)
print('Data with vectors saved to canoo_financial_data_with_vectors.csv')


Data with vectors saved to canoo_financial_data_with_vectors.csv


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy model
nlp = spacy.load('en_core_web_md')

# Function to vectorize text using spaCy
def vectorize_text(text):
    doc = nlp(text)
    return doc.vector.reshape(1, -1)

# Assuming 'Description' is a column containing text data in your DataFrame
df['https://rivian.com/_Vector'] = df['https://rivian.com/'].apply(vectorize_text)

# Vectorize the query text
query_text = "Some text related to the query"
query_vector = vectorize_text(query_text)

# Apply cosine similarity to find similar vectors
df['Similarity'] = df['https://rivian.com/_Vector'].apply(lambda x: cosine_similarity(query_vector, x).item())

# Display the DataFrame with similarity scores
print(df[['https://rivian.com/', 'Similarity']])

# Save the DataFrame with similarity scores to a new CSV file
df.to_csv('canoo_financial_data_with_similarity.csv', index=False)
print('Data with similarity scores saved to canoo_financial_data_with_similarity.csv')


ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'list'>

In [None]:
from gensim.summarization import summarize

# Load spaCy model
nlp = spacy.load('en_core_web_md')

# Function to summarize text using Gensim
def summarize_text(text):
    return summarize(text)

# Assuming 'Description' is a column containing text data in your DataFrame
df['Summary'] = df['Description'].apply(summarize_text)

# Display the DataFrame with summaries
print(df[['Description', 'Summary']])

# Save the DataFrame with summaries to a new CSV file
df.to_csv('canoo_financial_data_with_summaries.csv', index=False)
print('Data with summaries saved to canoo_financial_data_with_summaries.csv')


In [None]:
import pandas as pd

# Load the DataFrame with summarized data
df = pd.read_csv('canoo_financial_data_with_summaries.csv')

# Create a report DataFrame
report_df = pd.DataFrame(columns=['Section', 'Content'])

# Add sections and content to the report
report_df = report_df.append({'Section': 'Company Overview', 'Content': f"Name: {df['Name'].iloc[0]}\nSector: {df['Sector'].iloc[0]}\nIndustry: {df['Industry'].iloc[0]}\n"}, 
ignore_index=True)
report_df = report_df.append({'Section': 'Financial Data', 'Content': f"Market Capitalization: {df['Market Capitalization'].iloc[0]}\nRevenue: {df['Revenue'].iloc[0]}\n"}, 
ignore_index=True)
report_df = report_df.append({'Section': 'Financial News Headlines', 'Content': '\n'.join(df['Summary'].tolist())}, ignore_index=True)

# Display the report
print(report_df)

# Save the report to a file (e.g., report.txt)
report_df.to_csv('report.txt', sep='\t', index=False)
print('Report saved to report.txt')