## Web Scraping Job from Upwork

### Objective:

- Collect the following data from the website: https://industryexpert.net/expert-directory/
    - "Name (First Last)", "Title", "Company Name", "Categories", "Website", "E-Mail"
- Export the data to an Excel workbook, with one worksheet containing all categories plus one worksheet per category

In [None]:
import pandas as pd
import numpy as np
import re
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from pprint import pprint

In [None]:
# Scrape the directory landing page for its category menu and build a mapping
# of category title -> URL slug ("subdomain") for every category link found.

url = "https://industryexpert.net/expert-directory/"
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Close the HTTP response deterministically once the body has been read.
with urlopen(req) as response:
    webpage = response.read()
soup = BeautifulSoup(webpage, "html.parser")

cat_urls = soup.find_all("ul", "cn-cat-tree dropdown-menu")
subdomain_list = []
category_list = []

# `subdomain` and `cat` are kept as plain module-level loop names because
# other notebook cells may reference them after this cell has run.
for menu in cat_urls:
    for link in menu.find_all("a"):
        href = link.get("href")
        if not href:
            # An anchor without a target carries no slug to extract.
            continue
        pieces = href.rsplit("/", 2)
        if len(pieces) < 2:
            # No "/" in the href at all, so there is no path segment to take.
            continue
        # The segment between the last two slashes; for an href that ends
        # with "/" this is the final path segment.
        subdomain = pieces[-2]
        cat = link.get("title")
        category_list.append(cat)
        subdomain_list.append(subdomain)

# Later duplicates of a category title overwrite earlier ones.
cat_subdom_dict = dict(zip(category_list, subdomain_list))

In [None]:
# Create list of lists of experts information

def _text_or_na(tag):
    """Return the text of *tag*, or "n/a" when the element was not found."""
    return tag.text if tag is not None else "n/a"


def _expert_name(expert):
    """Join whichever of the given/additional/family name spans are present."""
    spans = (
        expert.find("span", css_class)
        for css_class in ("given-name", "additional-name", "family-name")
    )
    return " ".join(span.text for span in spans if span is not None) or "n/a"


def _expert_categories(expert):
    """Return the expert's categories as one comma-separated string."""
    cats = expert.find_all("span", "cn-category-name")
    if not cats:
        return "n/a"
    if len(cats) == 1:
        return cats[0].text
    # With several categories, any ", " inside a span's text is removed
    # (presumably an inline separator rendered by the site -- confirm) and the
    # cleaned names are re-joined, so every row holds a plain string.
    return ", ".join(cat.text.replace(", ", "") for cat in cats)


def _expert_website(expert):
    """Return the href of the last website link, or "n/a" if there is none."""
    websites = expert.find_all("a", "url")
    if not websites:
        return "n/a"
    # When several links are listed, the last one is the one reported.
    return websites[-1].get("href") or "n/a"


def _expert_emails(expert):
    """Return all e-mail addresses as one comma-separated string."""
    emails = expert.find_all("span", "email-address")
    # Join each address's own text (not the first address repeated).
    return ", ".join(email.text for email in emails) or "n/a"


def create_experts_info_list(url):
    """Download one directory page and extract a row per listed expert.

    Args:
        url: Address of a directory page (an overall page or a category page).

    Returns:
        A list with one 6-item list per expert, in the order
        [name, title, company, categories, website, e-mails]. Every item is a
        string; fields that are missing on the page are reported as "n/a".
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically once the body has been read.
    with urlopen(req) as response:
        webpage = response.read()
    soup = BeautifulSoup(webpage, "html.parser")

    experts_info = [
        [
            _expert_name(expert),
            _text_or_na(expert.find("span", "title notranslate")),
            _text_or_na(expert.find("span", "org")),
            _expert_categories(expert),
            _expert_website(expert),
            _expert_emails(expert),
        ]
        for expert in soup.find_all("div", "cn-entry expert")
    ]

    print("----")

    return experts_info

In [None]:
# Create dataframe with the list of lists experts info

def create_df(subdomain, cat="overall", num_pages=23):
    """Scrape the experts of one category (or of all pages) into a DataFrame.

    Args:
        subdomain: URL slug of the category page. Ignored when *cat* is
            "overall".
        cat: Category title, used for progress output. The special value
            "overall" walks the paginated full directory instead of a single
            category page.
        num_pages: Number of paginated directory pages to fetch when *cat* is
            "overall" (pages 1..num_pages).

    Returns:
        A DataFrame with one row per expert, or None when no expert was found.
    """
    keys = ["Name (First Last)", "Title", "Company Name", "Categories", "Website", "E-Mail"]
    base_url = "https://industryexpert.net/expert-directory/"

    print(f"Creating {cat} dataframe")

    # URLs are assembled with plain string formatting: os.path.join uses the
    # OS path separator, which would put backslashes into the URL on Windows.
    if cat == "overall":
        experts_info = []
        for i in range(1, num_pages + 1):
            print(f"----PAGE {i}-----")
            print("\n")
            url = f"{base_url}?cn-pg={i}"
            print(url)
            new_list = create_experts_info_list(url)
            print(new_list)
            experts_info.extend(new_list)
    else:
        url = f"{base_url}cat/{subdomain}"
        print(url)
        experts_info = create_experts_info_list(url)

    pprint(experts_info)
    print(f"Length: {len(experts_info)}")

    # Test for "no rows" explicitly rather than inferring it from a
    # ValueError, which can have unrelated causes.
    if not experts_info:
        print("-- No data in this category. --")
        print("--------------------------")
        return None

    # A cell may arrive as a list (e.g. several categories); flatten it to a
    # comma-separated string so every cell is a scalar that Excel can store.
    rows = [
        [
            ", ".join(map(str, value)) if isinstance(value, (list, tuple)) else value
            for value in row
        ]
        for row in experts_info
    ]
    df = pd.DataFrame(rows, columns=keys)
    print("--- Data Frame created successfully ---")
    print("--------------------------")
    return df

In [None]:
# Build the per-category lookup of dataframes:
# each category title maps to the dataframe scraped from its category page.

df_dict = {}
for c, subdom in cat_subdom_dict.items():
    df_c = df_dict[c] = create_df(subdom, cat=c)

In [None]:
# Create a dataframe with all values.
# The subdomain argument is ignored when cat="overall", so pass None instead
# of relying on a loop variable left over from an earlier cell.
df_dict['OVERALL'] = create_df(None, cat="overall")

In [None]:
# Work on an independent copy of the overall dataframe so changes to
# df_overall do not affect the frame stored in df_dict.
df_overall = df_dict['OVERALL'].copy()

In [None]:
# Write to excel: one 'OVERALL' sheet followed by one sheet per category.

def _sheet_name(cat, used):
    """Turn a category title into a valid, not-yet-used Excel sheet name.

    Args:
        cat: Category title.
        used: Set of lower-cased sheet names already taken; the returned name
            is added to it.

    Returns:
        A sheet name of at most 31 characters without the characters
        [ ] : * ? / \\ that is unique (case-insensitively) within *used*.
    """
    name = cat
    if len(name) > 31:
        # Long titles are abbreviated to their first two words, run together.
        name = "".join(name.split(" ")[:2])
    name = re.sub(r'[\[\]\:\*\?\/\\]', '', name)
    # The abbreviation can still be too long, and stripping can leave nothing.
    name = name[:31] or "Sheet"

    candidate = name
    counter = 1
    while candidate.lower() in used:
        # Disambiguate colliding names with a numeric suffix, trimming the
        # base so the result still fits the 31-character limit.
        counter += 1
        suffix = f"_{counter}"
        candidate = name[:31 - len(suffix)] + suffix
    used.add(candidate.lower())
    return candidate


keys = ["Name (First Last)", "Title", "Company Name", "Categories", "Website", "E-Mail"]

with pd.ExcelWriter('experts.xlsx') as writer:
    df_overall.to_excel(writer, sheet_name='OVERALL')
    used_names = {'overall'}
    for cat, df in df_dict.items():
        if cat == 'OVERALL':
            # Already written above from df_overall; do not write it twice.
            continue
        if df is None:
            # Categories without any expert still get a header-only sheet.
            df = pd.DataFrame(columns=keys)
        df.to_excel(writer, sheet_name=_sheet_name(cat, used_names))