In [1]:
from bs4 import BeautifulSoup
import requests

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import *

import pandas as pd
import numpy as np
from collections import defaultdict
import re
import os
import glob
import time
import pickle

In [2]:
class scraper(object):
    
    def __init__(self, homepage, login_page, filename):
        self.homepage = homepage
        self.login_page = login_page
        self.filename = filename
        self.driver = self.login()
        self.data_dir = os.getcwd() + '/data/'
    
    def load_file(self, filename):
        if os.path.isfile(filename):
            val = input('{0} exists. Run and overwrite the existed file(Y/N)? Your Answer: '.format(filename))
            if val.lower() == 'y':
                result = False
            else:
                print('Keep File')
                result = True
        else:
            result = False
        return result
    
    def login(self):
        Options().add_argument("--headless")
        driver = webdriver.Chrome('/usr/local/bin/chromedriver')
        driver.get(self.login_page)
        u = driver.find_element_by_name('email')
        u.send_keys('kim@fantasticimport.com')
        p = driver.find_element_by_name('pwd')
        p.send_keys('Aa875104')
        driver.find_element_by_name('btn_submit_login').click()
        return driver
    
    def gen_soup(self, url):
        page = requests.get(url, headers={"User-Agent": "XY"}).text
        soup = BeautifulSoup(page, 'html.parser')
        return soup
    
    def dump_file(self, content, filename):
        with open(filename, 'wb') as file:
            pickle.dump(content, file)
    
    def gen_prodcat(self, return_ = False):
        
        cat_list = []
        
        soup = self.gen_soup(self.homepage)
        section_link = [l for l in [s.a.get('href') for s in soup.find_all('li', class_ = 'dropdown-submenu')] if 'category' in l]
        section_name = [t.get_text().strip() for t in soup.find_all('li', class_ = 'dropdown-submenu') if 'category' in t.a.get('href')]
        section_list = [self.homepage + u for u in section_link]
        
        category_list = []
        for name,url in zip(section_name, section_list):
            soup = self.gen_soup(url)
            for tag in [t for t in soup.find_all('span', class_='panel-title')]:
                page = tag.a.get('href')
                text = tag.a.get_text()
                pl = self.homepage + page
                soup = self.gen_soup(pl)
                
                sublist = soup.find_all('span', class_='panel-title')
                if not sublist:
                    if soup.find('p', text = "Sorry, we're all sold out!"):
                        pass
                    else:
                        cat_str = name + ' > ' + text
                        category_list.append((cat_str, pl))
                else:
                    for sub_p in [t.a for t in sublist]:
                        sub_pl = self.homepage + sub_p.get('href')
                        cat_str = name + ' > ' + text + ' > ' + sub_p.get_text()
                        category_list.append((cat_str, sub_pl))
                        
        product_list = []
        for cat in category_list:
            self.driver.get(cat[1])
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            for prod in soup.find_all('div', class_='item-image'):
                link = self.homepage + prod.a.get('href')
                product_list.append(link)
                cat_list.append(cat[0])
            page_no = int(self.driver.find_element_by_class_name('total-page').text.split(' ')[-1])
            for i in range(page_no - 1):
                try:
                    self.driver.find_element_by_xpath("//img[@src='images/next-page.png']").click()
                    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    for prod in soup.find_all('div', class_='item-image'):
                        link = self.homepage + prod.a.get('href')
                        product_list.append(link)
                        cat_list.append(cat[0])
                except NoSuchElementException:
                       break
        
        for file, name in zip([section_list, category_list, cat_list, product_list], ['section_list.pkl', 'category_list.pkl', 'category.pkl', 'product_list.pkl']):
            self.dump_file(file, name)    
    
    def check_textExist(self, soup_output):
        if soup_output:
            return soup_output.text.strip()
        else:
            return np.nan
            
    def gen_batch_prodInfo(self, restart_no = 0):
        with open('product_list.pkl', 'rb') as file:
            product_list = pickle.load(file)
            
        batch_size = 100
        
        for batch in range(restart_no, len(product_list)//batch_size + 1):
            product_info = defaultdict(list)
            
            start = batch*batch_size
            end = (batch+1)*batch_size if batch < len(product_list)//batch_size else len(product_list)
            
            for link in product_list[start : end]:
                self.driver.get(link)
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                ## description
                product_info['Title'].append(self.check_textExist(soup.find('div', class_="product-name fontMontserrat")))
                ## product number
                product_info['Product Number'].append(self.check_textExist(soup.find('span', {'id':'pr_number'})))
                # inner price
                product_info['Inner Price'].append(self.check_textExist(soup.find('span', {'id':'pr_inner_price'})))
                ## case price
                product_info['Case Price'].append(self.check_textExist(soup.find('span', {'id':'pr_case_price'})))
                ## upc
                product_info['UPC'].append(self.check_textExist(soup.find('span', {'id':'pr_upc'})))
                ## availability
                product_info['Availability'].append(self.check_textExist(soup.find('span', {'id':'pr_availability'})))
                ## image link
                if soup.find('img', {'id':'product-image-file'}):
                    product_info['Image Link'].append(soup.find('img', {'id':'product-image-file'}).get('src'))
                elif not soup.find('img', {'id':'product-image-file'}):
                    product_info['Image Link'].append(np.nan)
                ## size
                if soup.find('span', text = 'SIZE:'):
                    product_info['Size'].append(soup.find('span', text = 'SIZE:').next_sibling.strip())
                elif not soup.find('span', text = 'SIZE:'):
                    product_info['Size'].append(np.nan)
                ## packaging
                if soup.find('span', text = 'PACKAGING:'):
                    product_info['Packaging'].append(soup.find('span', text = 'PACKAGING:').next_sibling.strip())
                elif not soup.find('span', text = 'PACKAGING:'):
                    product_info['Packaging'].append(np.nan)
                ## description
                if soup.find('div', class_ = 'description-text').find_all('p'):
                    product_info['Description'].append(' '.join([p.text.strip() for p in soup.find('div', class_ = 'description-text').find_all('p')]).strip())
                elif not soup.find('div', class_ = 'description-text').find_all('p'):
                    product_info['Description'].append(np.nan)
                ## features
                if soup.find('div', class_ = 'description-text').find('ol'):
                    product_info['Features'].append(str([b.text for b in soup.find('div', class_ = 'description-text').find('ol').find_all('li')]))
                elif not soup.find('div', class_ = 'description-text').find('ol'):
                    product_info['Features'].append(np.nan)
            
            with open(self.data_dir + 'product_info-' + repr(batch) + '.pkl', 'wb') as prod_info:
                pickle.dump(product_info, prod_info)
            time.sleep(15)
    
    def gen_df(self, filename):
        with open(filename, 'rb') as file:
            prod_info = pickle.load(file)   
        df = pd.DataFrame(prod_info)
        return df
    
    def df2csv(self):
        prod_info_list = glob.glob(self.data_dir + '*.pkl')
        prod_info_list.sort(key = lambda x:int(x.split('-')[-1].split('.')[0]))
        with open(prod_info_list[0],'rb') as file:
            prod_sample = pickle.load(file)
        cols = list(prod_sample.keys())
        df = pd.DataFrame(columns = cols)

        for prod_info in prod_info_list:
            df = df.append(self.gen_df(prod_info))
            
        with open('category.pkl', 'rb') as file:
            cat_list = pickle.load(file)
        df['Category'] = cat_list

        if os.path.isfile(self.data_dir + self.filename):
            val = input('File exists. Run and overwrite the existed file(Y/N)? Your Answer: ')
            if val.lower() == 'y':
                df.to_csv(self.data_dir + self.filename, index=False)
            else:
                print('Keep File')
        else:
            df.to_csv(self.data_dir + self.filename, index=False)
    
    def load_csv(self, file_name):
        file = self.data_dir + file_name
        if os.path.isfile(file):
            df = pd.read_csv(file)
            return df
        else:
            print('File Does Not Exist!')

In [3]:
homepage = 'https://www.portofinointl.com/'
filename = 'portofinointl.csv'

In [4]:
login_link = 'https://www.portofinointl.com/signup.php'

In [5]:
scraper = scraper(homepage, login_link, filename)

In [6]:
# scraper.gen_prodcat()

In [7]:
# scraper.gen_batch_prodInfo()

In [8]:
# scraper.gen_batch_prodInfo(47)

In [9]:
scraper.df2csv()

In [10]:
df = scraper.load_csv(filename)

In [11]:
df.head()

Unnamed: 0,Title,Product Number,Inner Price,Case Price,UPC,Availability,Image Link,Size,Packaging,Description,Features,Category
0,Artificial Grand Arch Tree 7½ ft - Green,25-0667GR,$995.00 EA,$995.00 EA,745910735160,8 Cases and 0 Inner,https://portofino.azureedge.net/25-0667GR.jpg,7½ ft,1EA/INNER / 1EA/CASE,,,Event Decor > Artificial Flowers and Greenery ...
1,Artificial Triple Ball Boxwood Plant In Pot - 5ft,25-0704,$99.95 EA,$99.95 EA,745910745442,20 Cases and 0 Inner,https://portofino.azureedge.net/25-0704pic.jpg,"13"" x 13' x 60""",1EA/INNER / 1EA/CASE,A proper English garden would not be complete ...,,Event Decor > Artificial Flowers and Greenery ...
2,Artificial Grand Heart Tree 11½ ft - White,25-0664WH,$950.00 EA,$950.00 EA,745910734248,6 Cases and 0 Inner,https://portofino.azureedge.net/25-0664WH.jpg,11½ ft,1EA/INNER / 1EA/CASE,,,Event Decor > Artificial Flowers and Greenery ...
3,Artificial Grand Arch Tree 8½ ft - Green,25-0668GR,$895.00 EA,$895.00 EA,745910735177,4 Cases and 0 Inner,https://portofino.azureedge.net/25-0668GR.jpg,8½ ft,1EA/INNER / 1EA/CASE,,,Event Decor > Artificial Flowers and Greenery ...
4,Artificial Grand Arch Tree 10½ ft - Green,25-0661GR,$795.00 EA,$795.00 EA,745910734217,10 Cases and 0 Inner,https://portofino.azureedge.net/25-0661GR.jpg,10½ ft,1EA/INNER / 1EA/CASE,,,Event Decor > Artificial Flowers and Greenery ...


In [12]:
df.shape

(5440, 12)

In [15]:
df[df['Product Number'] == '75-1529BK']

Unnamed: 0,Title,Product Number,Inner Price,Case Price,UPC,Availability,Image Link,Size,Packaging,Description,Features,Category
5146,Metallic Foil Table Skirt 14ft - Black,75-1529BK,$2.95 EA,$2.75 EA,745910729886,Please contact us for availability.,https://portofino.azureedge.net/75-1529BK.jpg,"30"" x 14ft",12EA/INNER / 24EA/CASE,,,Party & Craft > Plastic Table Covers > Table S...
