# Code by Yi Li, MQE UCLA

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import re
import numpy as np
from tkinter import _flatten
import time
import selenium
from selenium import webdriver

# 1. Web Crawling Tables. Create a list of links for all the Wikipedia pages for NYSE-traded companies A-Z and 0-9. Crawl through all the URLs and make one DataFrame with all the NYSE publicly traded companies. Show the percentages of companies that contain 1 letter, 2 letters, 3 letters, 4 letters and 5 letters in the ticker.

In [2]:
def lovely_soup(u, timeout=30):
    """Download the page at ``u`` and return it parsed with BeautifulSoup.

    Parameters
    ----------
    u : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.
        Without a timeout a stalled connection would block the crawl forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.
    """
    page = requests.get(u, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

# Start from the "0-9" listing page and collect the links to the per-letter
# listing pages that it points to.
base_url = "https://en.wikipedia.org"
page_url = "https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(0-9)"
soup = lovely_soup(page_url)

page_list = [page_url]

for a in soup.findAll("a", href = True):
    # Keep only links whose visible text is a single capital letter A-Z.
    # (The previous test, `<= ord("Z") or >= ord("Z")`, was true for every
    # character, so any one-character link was accepted.)
    if len(a.text) == 1 and "A" <= a.text <= "Z":
        page_list.append(base_url + a["href"])
page_list

['https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(0-9)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(A)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(B)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(C)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(D)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(E)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(F)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(G)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(H)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(I)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(J)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_

In [3]:
company_list = []
url_list = []

# For every listing page, look at the first cell of each table row: if it
# holds a link, record the link text and its absolute URL; otherwise record
# the cell text and a placeholder instead of a URL.
for listing_page in page_list:
    soup = lovely_soup(listing_page)
    for table in soup.find_all("table", style = "background:transparent;"):
        for row in table.find_all("tr"):
            first_cell = row.find("td")
            if first_cell is None:
                # No <td> in this row, so there is nothing to record.
                continue
            link = first_cell.find("a")
            if not link:
                company_list.append(first_cell.text)
                url_list.append("No Wikipedia Page")
            else:
                company_list.append(link.text)
                url_list.append(base_url + link["href"])

In [4]:
# Read one table from every listing page and stack them into one DataFrame.
# The frames are collected first and concatenated once; calling pd.concat
# inside the loop re-copies the accumulated rows on every iteration.
frames = [pd.DataFrame(columns = ["Stock name", "Symbol", "Country of origin"])]

for page in page_list:
    html = requests.get(page).content
    # Index 1 picks the second table pandas finds on the page
    # (presumably the company listing -- verify if the page layout changes).
    frames.append(pd.read_html(html)[1])

df_traded_comp = pd.concat(frames, axis = 0, ignore_index = True)

In [5]:
# Pair each stock name from the scraped tables with the link collected for it.
company_link_columns = {
    "Company": df_traded_comp["Stock name"],
    "Wikipedia Page Link": url_list,
}
df = pd.DataFrame(company_link_columns)
df

Unnamed: 0,Company,Wikipedia Page Link
0,3D Systems Corporation,https://en.wikipedia.org/wiki/3D_Systems
1,3M Company,https://en.wikipedia.org/wiki/3M
2,500.com,No Wikipedia Page
3,58.com Inc.,No Wikipedia Page
4,8x8 Inc.,https://en.wikipedia.org/wiki/8x8
...,...,...
2718,Zimmer Biomet,https://en.wikipedia.org/wiki/Zimmer_Biomet
2719,Zions Bancorporation,https://en.wikipedia.org/wiki/Zions_Bancorpora...
2720,Zuora,https://en.wikipedia.org/wiki/Zuora
2721,Zoetis,https://en.wikipedia.org/wiki/Zoetis


In [6]:
# For every company link, download the page and keep the concatenated text of
# all its <p> tags; entries without a page keep the placeholder string.
desc_list = []
for url in url_list:
    if url != "No Wikipedia Page":
        soup = lovely_soup(url)
        paragraphs = soup.findAll("p")
        desc_list.append("".join(str(par.text) for par in paragraphs))
    else:
        desc_list.append("No Wikipedia Page")
desc = pd.DataFrame({"Wikipedia Page Content":desc_list})

In [7]:
# Put the scraped table, the page links and the page text side by side.
df_final = pd.concat(
    [df_traded_comp, df[["Wikipedia Page Link"]], desc],
    axis = 1,
)
df_final

Unnamed: 0,Stock name,Symbol,Country of origin,Wikipedia Page Link,Wikipedia Page Content
0,3D Systems Corporation,DDD,US,https://en.wikipedia.org/wiki/3D_Systems,"3D Systems, headquartered in Rock Hill, South ..."
1,3M Company,MMM,US,https://en.wikipedia.org/wiki/3M,\n3M (originally the Minnesota Mining and Manu...
2,500.com,WBAI,China,No Wikipedia Page,No Wikipedia Page
3,58.com Inc.,WUBA,China,No Wikipedia Page,No Wikipedia Page
4,8x8 Inc.,EGHT,US,https://en.wikipedia.org/wiki/8x8,\n8x8 Inc. is an American provider of Voice ov...
...,...,...,...,...,...
2718,Zimmer Biomet,ZBH,United States,https://en.wikipedia.org/wiki/Zimmer_Biomet,Zimmer Biomet is a publicly traded medical dev...
2719,Zions Bancorporation,ZION,United States,https://en.wikipedia.org/wiki/Zions_Bancorpora...,Zions Bancorporation is a bank holding company...
2720,Zuora,ZUO,United States,https://en.wikipedia.org/wiki/Zuora,\nZuora is an American enterprise software com...
2721,Zoetis,ZTS,United States,https://en.wikipedia.org/wiki/Zoetis,\nZoetis Inc. (/zō-EH-tis/[3]) is an American ...


In [8]:
# Drop the dots from the tickers before measuring their length. regex=False
# makes "." a literal dot regardless of the pandas version's default.
Symbol = df_final.Symbol.str.replace(".", "", regex=False)

# Bucket the tickers by length 1..5; tickers of any other length are not
# counted in a bucket but still count towards the total below.
symbols_by_length = {n: [] for n in range(1, 6)}
for ticker in Symbol:
    bucket = symbols_by_length.get(len(ticker))
    if bucket is not None:
        bucket.append(ticker)
list_1, list_2, list_3, list_4, list_5 = (symbols_by_length[n] for n in range(1, 6))

# Percentages are taken over all rows of df_final.
total = len(df_final)
per_1, per_2, per_3, per_4, per_5 = (
    len(symbols_by_length[n]) / total * 100 for n in range(1, 6)
)
for n, pct in enumerate((per_1, per_2, per_3, per_4, per_5), start=1):
    noun = "letter" if n == 1 else "letters"
    print("The percetages of companies that contain {} {}: {}%".format(n, noun, pct))

The percetages of companies that contain 1 letter: 0.8813808299669483%
The percetages of companies that contain 2 letters: 6.9775982372383405%
The percetages of companies that contain 3 letters: 65.29562982005142%
The percetages of companies that contain 4 letters: 15.534337128167463%
The percetages of companies that contain 5 letters: 1.7260374586852736%


# 2. Web Scraping Using Beautiful Soup. Use Beautiful Soup's .findAll method to scrape the front page of Reddit. Get a list of all of the "timestamps". Using the functions findChild, descendants, etc., locate the post titles and store them in a list. Create a dataframe that has the associated title and post time for each post.

In [13]:
URL = "https://www.reddit.com"

def lovely_soup(u, timeout=30):
    """Download the page at ``u`` and return it parsed with BeautifulSoup.

    ``timeout`` is the number of seconds ``requests`` waits for the server;
    without it a stalled connection would block forever.
    """
    page = requests.get(u, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

soup = lovely_soup(URL)

In [14]:
# Text of every <span data-click-id="timestamp"> on the page, in page order.
timestamp_spans = soup.findAll("span", attrs = {"data-click-id":"timestamp"})
list_timestamps = [span.text for span in timestamp_spans]
list_timestamps

['5 hours ago',
 '15 hours ago',
 '19 hours ago',
 '5 hours ago',
 '9 hours ago',
 '6 hours ago',
 '17 hours ago',
 '11 hours ago']

In [15]:
# For each <div>, take the text of the <h3> that findChild returns, if any.
list_post_title = []
for tag in soup.findAll("div"):
    child = tag.findChild("h3")
    if child is not None:
        list_post_title.append(child.text)
# The same title is collected more than once, so drop repeats while keeping
# first-seen order. dict.fromkeys also copes with an empty list, where
# indexing column 0 of an empty DataFrame would raise KeyError.
list_post_title = list(dict.fromkeys(list_post_title))
list_post_title

['[Highlight] Harden drills the classic stepback over Horford to put the Sixers up with 8.4 left',
 'Richard Feynman said, “Never confuse education with intelligence, you can have a PhD and still be an idiot.” What are some real life examples of this?',
 '‘Godfather of AI’ quits Google with regrets and fears about his life’s work',
 "TIL that the invention of bicycles was fundamental to the early women's liberation movement. Bicycles promised freedom to women long accustomed to relying on men for transportation. It was also the main reason corsets and long skirts fell out of fashion in the early 20th century.",
 'just wow',
 'Just look at the smug smiles on these people who showed up first thing in the morning to try to obstruct Zooey Zypher from doing her work today. Same smug smiles that have always opposed the progress of civil rights throughout history.',
 'During the 2018 wildfires, this man captured his drive to work in the morning',
 'Texas, where they\'re banning books in schoo

In [16]:
# One row per post: its timestamp next to its title (paired by position).
post_columns = {
    "Post Time": list_timestamps,
    "Post Title": list_post_title,
}
df = pd.DataFrame(post_columns)
df

Unnamed: 0,Post Time,Post Title
0,5 hours ago,[Highlight] Harden drills the classic stepback...
1,15 hours ago,"Richard Feynman said, “Never confuse education..."
2,19 hours ago,‘Godfather of AI’ quits Google with regrets an...
3,5 hours ago,TIL that the invention of bicycles was fundame...
4,9 hours ago,just wow
5,6 hours ago,Just look at the smug smiles on these people w...
6,17 hours ago,"During the 2018 wildfires, this man captured h..."
7,11 hours ago,"Texas, where they're banning books in schools ..."


# 3. RegEx. Using RegEx, get all the urls of ladder faculty profiles for UCLA Economics. Webcrawl the links from A and use RegEx to get all the emails and phone numbers of ladder faculty profiles. 

In [117]:
URL = "https://economics.ucla.edu/faculty/ladder"

def lovely_soup(u, timeout=30):
    """Download the page at ``u`` and return it parsed with BeautifulSoup.

    ``timeout`` is the number of seconds ``requests`` waits for the server;
    without it a stalled connection would block forever.
    """
    page = requests.get(u, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

soup = lovely_soup(URL)
# Keep the page as one string so the profile URLs can be found with a regex.
text = str(soup)

In [118]:
# Profile URLs look like .../person/<first>-<last>/ with up to two further
# name parts. The dots in the host name are escaped so they only match a
# literal ".", and the optional groups are non-capturing because only the
# whole match is used.
pattern1 = re.compile(r'https://economics\.ucla\.edu/person/\w+-\w+-?(?:\w+)?-?(?:\w+)?/')
matches1 = pattern1.finditer(text)
# Drop duplicate URLs while keeping page order; going through set() would
# give a different ordering from one run to the next.
urlslist = list(dict.fromkeys(match[0] for match in matches1))
urlslist

['https://economics.ucla.edu/person/andres-santos/',
 'https://economics.ucla.edu/person/jay-lu/',
 'https://economics.ucla.edu/person/tomasz-sadzik/',
 'https://economics.ucla.edu/person/saki-bigio/',
 'https://economics.ucla.edu/person/daniel-haanwinckel/',
 'https://economics.ucla.edu/person/michael-rubens/',
 'https://economics.ucla.edu/person/william-r-zame/',
 'https://economics.ucla.edu/person/rodrigo-pinto/',
 'https://economics.ucla.edu/person/jonathan-vogel/',
 'https://economics.ucla.edu/person/jinyong-hahn/',
 'https://economics.ucla.edu/person/simon-board/',
 'https://economics.ucla.edu/person/martha-bailey/',
 'https://economics.ucla.edu/person/shuyang-sheng/',
 'https://economics.ucla.edu/person/kathleen-mcgarry/',
 'https://economics.ucla.edu/person/michela-giorcelli/',
 'https://economics.ucla.edu/person/francois-geerolf/',
 'https://economics.ucla.edu/person/rosa-liliana-matzkin/',
 'https://economics.ucla.edu/person/pablo-fajgelbaum/',
 'https://economics.ucla.edu/pe

In [119]:
# Collect the text of every <h1> found on each faculty profile page.
name_list = []

for url in urlslist:
    soup = lovely_soup(url)
    for h in soup.findAll("h1"):
        name_list.append(h.text)

# NOTE(review): this loop removes items from name_list while iterating over
# it. After each removal the remaining items shift left, so the iterator
# skips the element that followed the removed one. The names shown in the
# final table still contain spaces, so the result presumably relies on that
# skipping (e.g. another heading with a space preceding each name) --
# confirm against the page markup before replacing this with a plain filter.
for n in name_list:
    if " " in n:
        name_list.remove(n)

In [120]:
# One entry per profile URL: a single address as a string, several addresses
# as a list, or the placeholder when the page lists none.
emails = []
for url in urlslist:
    soup = lovely_soup(url)
    # Text of every link whose href contains "@", leaving out the
    # "Contact Webmaster" link. Filtering here avoids removing items from a
    # list while iterating over it, which can skip a repeated entry.
    addresses = [
        link.text
        for link in soup.findAll("a", href = True)
        if "@" in link["href"] and link.text != "Contact Webmaster"
    ]
    if not addresses:
        emails.append("NO EMAIL LISTED")
    elif len(addresses) == 1:
        emails.append(addresses[0])
    else:
        emails.append(addresses)

In [121]:
# phone_list holds, per profile URL, the list of phone strings found there
# (or the placeholder); phonenumber is the same data flattened into one list.
phone_list = []
for url in urlslist:
    soup = lovely_soup(url)
    phonenum = []
    for h in soup.findAll("h4"):
        if h.text != "Phone:":
            continue
        # A separate name is used here so the page source stored in the
        # module-level `text` is not overwritten.
        section_text = str(h.parent.text)
        # The number is whatever sits between "Phone: " and the next label:
        # try "Office" first, then fall back to the first match before "Email".
        numbers = re.findall(r"Phone: (.*?)Office", section_text)
        if not numbers:
            # Slicing (rather than indexing [0]) avoids an IndexError when
            # neither label follows the phone number.
            numbers = re.findall(r"Phone: (.*?)Email", section_text)[:1]
        phonenum.extend(numbers)
    if not phonenum:
        phonenum.append("NO PHONE NUMBER LISTED")
    phone_list.append(phonenum)
# Flatten with a plain comprehension instead of tkinter's private _flatten.
phonenumber = [number for numbers in phone_list for number in numbers]

In [128]:
# One row per faculty profile; all four lists are paired by position.
faculty_columns = {
    "Name": name_list,
    "Email Address": emails,
    "Phone Number": phonenumber,
    "URL": urlslist,
}
df = pd.DataFrame(faculty_columns)

In [129]:
df

Unnamed: 0,Name,Email Address,Phone Number,URL
0,Andres Santos,andres@econ.ucla.edu,NO PHONE NUMBER LISTED,https://economics.ucla.edu/person/andres-santos/
1,Jay Lu,jay@econ.ucla.edu,(310) 825-7380,https://economics.ucla.edu/person/jay-lu/
2,Tomasz Sadzik,tsadzik@econ.ucla.edu,(310) 206-2833,https://economics.ucla.edu/person/tomasz-sadzik/
3,Saki Bigio,sbigio@econ.ucla.edu,(310) 825-9397,https://economics.ucla.edu/person/saki-bigio/
4,Daniel Haanwinckel,haanwinckel@econ.ucla.edu,NO PHONE NUMBER LISTED,https://economics.ucla.edu/person/daniel-haanw...
5,Michael Rubens,NO EMAIL LISTED,NO PHONE NUMBER LISTED,https://economics.ucla.edu/person/michael-rubens/
6,William R. Zame,zame@econ.ucla.edu,(310) 206-9463,https://economics.ucla.edu/person/william-r-zame/
7,Rodrigo Pinto,rodrig@econ.ucla.edu,(310) 825-0849,https://economics.ucla.edu/person/rodrigo-pinto/
8,Jonathan Vogel,jvogel@econ.ucla.edu,NO PHONE NUMBER LISTED,https://economics.ucla.edu/person/jonathan-vogel/
9,Jinyong Hahn,"[hahn@econ.ucla.edu, chair@econ.ucla.edu]",(310) 825-2523,https://economics.ucla.edu/person/jinyong-hahn/


# 4. WebScraping using Selenium.

The World Economic Outlook (WEO) is a report published by the International Monetary Fund (IMF) that provides an assessment of the global economic situation and outlook. It includes analysis and projections of key economic indicators such as economic growth, inflation, trade, and financial markets, as well as commentary on policy challenges and risks to the global economy. The report covers both developed and developing countries, and it often includes special features on specific topics of interest. The WEO is widely regarded as a reliable source of information and analysis on the global economy and is used by policymakers, investors, and researchers around the world to inform their decisions. 

We use Selenium to scrape information about the outlook reports from the most recent 7 years, including the title, publication date, description, and link of each report. This information is valuable in the following two situations:

(1) It could be used in natural language processing (NLP) to assess the current macroeconomic and microeconomic situation and to forecast trends. NLP is a set of techniques that can extract and analyze information from the text of the reports. For example, we could use NLP techniques such as text parsing and entity recognition to extract relevant data from the WEO report information. This can include key macroeconomic indicators such as GDP growth, inflation, and trade balances, as well as microeconomic trends in specific industries or regions.

(2) It could also be used for reference retrieval: helping to retrieve relevant documents or information sources that are cited or referenced in a given text or document, and to identify or locate additional resources that may be useful to the user.

In [17]:
# Raw string: the backslashes in the Windows path are kept literally instead
# of relying on "\c" being an unrecognised (and deprecated) escape sequence.
PATH = r"D:\chromedriver_win32\chromedriver"
URL = "https://www.imf.org"
# Start Chrome through the driver binary at PATH and open the IMF home page.
driver = webdriver.Chrome(PATH)
driver.get(URL)
driver.maximize_window()

In [18]:
# Click through to the Publications page and then to the WEO page, pausing
# after each click so the next page can load.
# find_element("css selector", ...) is the locator form available in both
# Selenium 3 and 4; the find_element_by_* helpers were removed in Selenium 4.3.
# "css selector" is the value of selenium's By.CSS_SELECTOR constant.
driver.find_element("css selector", 'a[href = "/en/Publications"]').click()
time.sleep(5)
driver.find_element("css selector", 'a[href = "https://www.imf.org/en/publications/weo"]').click()
time.sleep(5)

In [19]:
url = driver.current_url

def get_df(url, timeout=30):
    """Scrape the publication entries listed on one results page.

    Downloads ``url`` with a browser-like User-Agent header and reads every
    ``<div class="result-row pub-row">`` on the page.

    Parameters
    ----------
    url : str
        Address of the results page to scrape.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    tuple of four lists
        ``(title_list, date_list, desc_list, link_list)``, aligned by
        position: the text of each entry's first link, the text of its first
        ``<p>``, the text of its first ``<span>``, and the absolute URL of
        the link.
    """
    from urllib.parse import urljoin

    page = requests.get(url, headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"}, timeout = timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    title_list = []
    date_list = []
    desc_list = []
    link_list = []
    # A single pass over the entries keeps the four lists aligned.
    for div in soup.findAll("div", attrs = {"class": "result-row pub-row"}):
        title = div.find("a", href = True)
        # split()/join collapses line breaks, indentation and non-breaking
        # spaces into single spaces, whatever the page's indentation width.
        title_list.append(" ".join(title.text.split()))
        # Resolve the href against the page URL rather than a global base,
        # so the result does not depend on what URL currently holds.
        link_list.append(urljoin(url, title["href"]))
        # strip() removes the padding around the date without depending on
        # an exact number of spaces.
        date_list.append(div.find("p").text.strip())
        desc_list.append(div.find("span").text)

    return(title_list, date_list, desc_list, link_list)

In [20]:
# Scrape the current results page and turn the four lists into one table.
scraped = get_df(url)
title_list, date_list, desc_list, link_list = scraped
df = pd.DataFrame(dict(zip(("Title", "Date", "Description", "Link"), scraped)))

In [21]:
# Go to the next results page, scrape it, and append its rows to df.
# find_element("css selector", ...) is the locator form available in both
# Selenium 3 and 4; the find_element_by_* helpers were removed in Selenium 4.3.
driver.find_element("css selector", 'a[class = "next"]').click()
time.sleep(5)  # give the next page time to load before reading its URL

url = driver.current_url
title_list, date_list, desc_list, link_list = get_df(url)
df2 = pd.DataFrame({"Title": title_list, "Date": date_list, "Description": desc_list, 
                   "Link": link_list})

df = pd.concat([df, df2], axis = 0, ignore_index = True)

In [22]:
# Go to the following results page, scrape it, and append its rows to df.
# find_element("css selector", ...) is the locator form available in both
# Selenium 3 and 4; the find_element_by_* helpers were removed in Selenium 4.3.
driver.find_element("css selector", 'a[class = "next"]').click()
time.sleep(5)  # give the next page time to load before reading its URL

url = driver.current_url
title_list, date_list, desc_list, link_list = get_df(url)
df3 = pd.DataFrame({"Title": title_list, "Date": date_list, "Description": desc_list, 
                   "Link": link_list})

df = pd.concat([df, df3], axis = 0, ignore_index = True)

In [23]:
df

Unnamed: 0,Title,Date,Description,Link
0,"World Economic Outlook, April 2023: A Rocky Re...","April 11, 2023",The baseline forecast is for growth to fall fr...,https://www.imf.org/en/Publications/WEO/Issues...
1,Inflation Peaking amid Low Growth,"January 30, 2023",The January 2023 World Economic Outlook Update...,https://www.imf.org/en/Publications/WEO/Issues...
2,"World Economic Outlook, October 2022: Counteri...","October 11, 2022",Global economic activity is experiencing a bro...,https://www.imf.org/en/Publications/WEO/Issues...
3,"World Economic Outlook Update, July 2022: Gloo...","July 26, 2022",A tentative recovery in 2021 has been followed...,https://www.imf.org/en/Publications/WEO/Issues...
4,"World Economic Outlook, April 2022: War Sets B...","April 19, 2022",The war in Ukraine has triggered a costly huma...,https://www.imf.org/en/Publications/WEO/Issues...
5,"World Economic Outlook Update, January 2022: R...","January 25, 2022",Global growth is expected to moderate from 5.9...,https://www.imf.org/en/Publications/WEO/Issues...
6,"World Economic Outlook, October 2021: Recovery...","October 12, 2021",This report shows that the global economic rec...,https://www.imf.org/en/Publications/WEO/Issues...
7,"World Economic Outlook Update, July 2021: Faul...","July 27, 2021",The global economy is projected to grow 6.0 pe...,https://www.imf.org/en/Publications/WEO/Issues...
8,"World Economic Outlook, April 2021: Managing D...","March 23, 2021","Global growth is projected at 6% in 2021, mode...",https://www.imf.org/en/Publications/WEO/Issues...
9,"World Economic Outlook Update, January 2021: P...","January 20, 2021",Although recent vaccine approvals have raised ...,https://www.imf.org/en/Publications/WEO/Issues...
