# Import & generic

In [1]:
import configparser
import logging
import tempfile
import time
from datetime import  datetime
import dateparser
import hashlib

import requests
from bs4 import BeautifulSoup


import pandas as pd
import numpy as np
import glob


from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextBoxHorizontal, \
        LTTextLine, LTTextLineHorizontal, LTChar

def timeit(method):
    """ Get method execution time """
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
          name = kw.get('log_name', method.__name__.upper())
          kw['log_time'][name] = int((te - ts))
        else:
            print('%r - %d heures %d minutes %2.2f secondes.'\
                  % (method.__name__, (te - ts) / 3600, (te - ts)%3600 / 60, (te - ts) % 60))
        return result
    return timed


# Get Urls

In [2]:
class UrlsManager:
    def __init__(self):
        self.config = configparser.ConfigParser()
        self.config.read('encans.ini')
        self.deals_home_urls = self.config.get('site', 'deals_home_urls')
        self.deals_histored_urls = self.config.get('site', 'deals_histored_urls')
        self.deals_caming_urls = self.config.get('site', 'deals_caming_urls')
        self.logger = logging.getLogger('encans')
        
        logfile = self.config.get('dir', 'datadir') + 'logs\encans_urls_' +\
        str(datetime.now().date()) + '.log'
        hdlr = logging.FileHandler(logfile)
        formatter = logging.Formatter('%(asctime)s -- %(funcName)s -- %(levelname)s '\
                                      '-- %(message)s')
        hdlr.setFormatter(formatter)
        self.logger.addHandler(hdlr) 
        self.logger.setLevel(logging.INFO)
        
        self.logger.info('*********     *********     *********     *********     *********')
        self.logger.info('*********     UrlsManager Started     *********')
        
      
    def processUrls(self):
        #Extract all histored urls
        hUrls = self.extactHistoredUrls(self.deals_home_urls, self.deals_histored_urls)
        col_names = ["id", "city", "address", "url", "date"]
        hdf = pd.DataFrame(hUrls, columns=col_names)
        hdf["status"] = "waiting"
        hdf["scraping_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        #Extract all news urls and merge them with histored urls
        nUrls = self.extactNewUrls(self.deals_home_urls, self.deals_caming_urls)
        ndf = pd.DataFrame(nUrls, columns=col_names)
        ndf["status"] = "new"
        ndf["scraping_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        urls_df = hdf.append(ndf, sort=True)
        
        urls_repo_path = self.config.get('dir', 'datadir') + \
        'urls_to_scrape\encan_urls_repo.csv'
        
        #Merge all extracted urls with the repo
        old_urls_df = pd.read_csv(urls_repo_path, sep=self.config.get('conf', 'default_sep'), \
                              delimiter=None, header='infer')
        self.logger.info('%d urls in the old Dataframe Repo' %len(old_urls_df.index))
        new_urls_df = urls_df[~urls_df.id.isin(old_urls_df.id)]
        self.logger.info('%d news urls to add to the Dataframe Repo' %len(new_urls_df.index))
        urls_df = old_urls_df.append(new_urls_df, sort=True)
        
        urls_df.to_csv(urls_repo_path, sep=self.config.get('conf', 'default_sep'),\
                       index = False, encoding='utf-8')
        self.logger.info('%d urls now in the Dataframe Repo' %len(urls_df.index))
        print("Processing urls done !")
        return urls_df
    
    def extactHistoredUrls(self, homeUrl, historedUrl):
        """Extract histored deals urls"""
        self.logger.info('Extracting deals histred urls form ' + historedUrl)
        page = requests.get(historedUrl)
        soup = BeautifulSoup(page.content, 'html.parser')
        urls = []
        htmlList = soup.find("table", "lots").find_all("tr")
        
        for p in htmlList:
            if p and p.th.get_text('strip=True') != 'Lieu':
                city_adr = p.th.get_text('strip=True')
                city = city_adr.split("-")[0].strip()
                address = "" if len(city_adr.split("-")) == 1 else \
                city_adr.split("-")[1].strip()          
                date_txt = p.td.get_text('strip=True')[:-6].strip()
                date = dateparser.parse(date_txt)

                url = homeUrl + p.select("td a")[0]["href"]
                id = hashlib.sha1(url.encode('utf-8')).hexdigest()
                urls.append([id, city, address, url, date])
                
        self.logger.info("%d histored urls extracted." %len(urls) )
          
        return urls
    
    def extactNewUrls(self, homeUrl, comingUrls):
        """Extract coming urls deals urls"""
        self.logger.info('Extract coming urls deals urls ' + comingUrls)
        page = requests.get(comingUrls)
        soup = BeautifulSoup(page.content, 'html.parser')
        newUrls = []
        htmlTRList = soup.find("table", "lots").find_all("tr")
        
        for tr in htmlTRList:
            if tr:
                if tr.th:
                    th = tr.th
                    if th.has_attr('class'):
                        city_adr = th.a.get_text('strip=True')
                        city = city_adr.split("-")[0].strip()
                        address = "" if len(city_adr.split("-")) == 1 else\
                        city_adr.split("-")[1].strip()     
                        date_td = th.find_next_sibling("td")
                        date_txt = date_td.get_text('strip=True')[:-6].strip()
                        date = dateparser.parse(date_txt)
                        nextTDs = date_td.find_next_siblings("td")
                        for td in nextTDs:
                            if td.a:
                                if "Catalogue_internet" in td.select("a")[0]["href"]:
                                    url = homeUrl + td.select("a")[0]["href"]
                                    id = hashlib.sha1(url.encode('utf-8')).hexdigest()
                                    newUrls.append([id, city, address, url, date])
          
        self.logger.info("%d coming urls extracted." %len(newUrls) )
        return newUrls
    
    

# Extract Data from pdfs to text

In [3]:
class PdfExtractor:
    def __init__(self):
        self.config = configparser.ConfigParser()
        self.config.read('encans.ini')
        self.deals_home_urls = self.config.get('site', 'deals_home_urls')
        self.deals_histored_urls = self.config.get('site', 'deals_histored_urls')
        self.deals_caming_urls = self.config.get('site', 'deals_caming_urls')
        self.logger = logging.getLogger('encans')
        
        logfile = self.config.get('dir', 'datadir') + 'logs\encans_pdf_'\
        + str(datetime.now().date()) + '.log'
        hdlr = logging.FileHandler(logfile)
        formatter = logging.Formatter('%(asctime)s -- %(funcName)s -- %(levelname)s '\
                                      '-- %(message)s')
        hdlr.setFormatter(formatter)
        self.logger.addHandler(hdlr) 
        self.logger.setLevel(logging.INFO)
        
        self.logger.info('*********     *********     *********     *********     *********')
        self.logger.info('*********     PdfExtractor Started     *********')
    
    
    def flatten(self, lst):
        """Flattens a list of lists"""
        return [subelem for elem in lst for subelem in elem]


    def extract_characters(self, element):
        """
        Recursively extracts individual characters from 
        text elements. 
        """
        TEXT_ELEMENTS = [LTTextBox, LTTextBoxHorizontal, LTTextLine, LTTextLineHorizontal]

        if isinstance(element, LTChar):
            return [element]

        if any(isinstance(element, i) for i in TEXT_ELEMENTS):
            return self.flatten([self.extract_characters(e) for e in element])

        if isinstance(element, list):
            return self.flatten([self.extract_characters(l) for l in element])

        return []
    
    @timeit
    def extract_layout_by_page(self, pdf_url):
        """
        Extracts LTPage objects from a pdf file.
        Slightly modified from
        https://euske.github.io/pdfminer/programming.html
        """
        laparams = LAParams()
        req =  requests.get(pdf_url, stream = True) #Request the pdf content        
        temp = tempfile.TemporaryFile() #Create a temporary file
        temp.write(req.raw.data)
        temp.seek(0)
        parser = PDFParser(temp)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
            
        # Close object
        temp.close()
        device.close()

        return layouts
    
    
    def  getPageContent(self, page_number, page, pdf_id):
        """
            Extact a pdf page to a DataFrame
        """
        self.logger.info("Processing pdf %s - page %d ." %(pdf_id, page_number))
        texts = []
        rects = []

        #Get page's text and rectangles. Seperate text and rectangle elements in 2 lists
        for e in page:
            if isinstance(e, LTTextBoxHorizontal):
                texts.append(e)
           
            if isinstance(e, LTRect):
                rects.append([e.x0, e.y0, e.x1, e.y1])
                
        recsDF =  pd.DataFrame(rects, columns=["x0", "y0", "x1", "y1"])
        recsDF["page_number"] = page_number
        recsDF["pdf_id"] = pdf_id
        
        characters = self.extract_characters(texts)

        charactersList = list(map(lambda x : [x.x0, x.y0, x.x1, x.y1, x.get_text(),\
                                              x.fontname, \
                                              x.adv, x.matrix], characters))
        
        charactersDF =  pd.DataFrame(charactersList, \
                                       columns=["x0", "y0", "x1", "y1", "text",\
                                                "fontname", "adv", "matrix"])
        charactersDF["page_number"] = page_number
        charactersDF["pdf_id"] = pdf_id
        
        return [charactersDF, recsDF]

    
    @timeit
    def  downloadPDFData(self, url_id, pdf_url):
        """ Download a single pdf and store the result to csv files """
        self.logger.info("Downloading %s" %pdf_url)

        page_layouts = self.extract_layout_by_page(pdf_url)
        self.logger.info('%s pages found in this pdf file' %str(len(page_layouts)))
        
        textDF = pd.DataFrame(columns=["x0", "y0", "x1", "y1", "text", "fontname",\
                                       "adv", "matrix"])
        rectsDF = pd.DataFrame(columns=["x0", "y0", "x1", "y1"])
        for page_num, current_page in enumerate(page_layouts, 1):
            page_data = self.getPageContent(page_num, current_page, url_id)
            textDF = textDF.append(page_data[0], sort=True)
            rectsDF = rectsDF.append(page_data[1], sort=True)
            
        textFilesDir = self.config.get('dir', 'datadir') + \
            'scraping\encans_text__' + url_id + '__' + str(datetime.now().date()) + '.csv'
        rectsFilesDir = self.config.get('dir', 'datadir') + \
            'scraping\encans_rects__' + url_id + '__' + str(datetime.now().date()) + '.csv'
        
        textDF.to_csv(textFilesDir, sep = self.config.get('conf', 'default_sep'), \
                      index = False, encoding='utf-8')
        rectsDF.to_csv(rectsFilesDir, sep = self.config.get('conf', 'default_sep'), \
                       index = False, encoding='utf-8')
        
        return True

      
    @timeit
    def  downloadDeals(self):
        print('Downloading pdf files ...')
        """ List all pdf to download and orchestrate the downloading processs """
        urls_file = self.config.get('dir', 'datadir')\
            +'/urls_to_scrape/encan_urls_repo.csv'

        urls_repo_df = None
        urls_to_scrape_df = None
        
        self.logger.info("Preparing to download pdf from file %s" %urls_file)
        
        try:
            urls_repo_df = pd.read_csv(urls_file, sep=self.config.get('conf', 'default_sep'), \
                              delimiter=None, header='infer')
        except Exception  as e:
            self.logger.error("ERROR when reading urls file")
            self.logger.error("Error: %s " %str(e.args))
            exc_type, exc_value, exc_traceback = sys.exc_info()          
            self.logger.error(traceback.print_exception(exc_type, \
                                                        exc_value, exc_traceback, \
                                                        limit=2, file=sys.stdout))
            return False
            
        urls_to_scrape_df = urls_repo_df[urls_repo_df['status'] != 'scraped']
        self.logger.info("%d pdf to download." %len(urls_to_scrape_df.index))
        
        for index, row in urls_to_scrape_df.iterrows():
            result_ = 'scraped' if self.downloadPDFData(row['id'],\
                                                        row['url']) else 'failed_download'
            urls_repo_df.loc[urls_repo_df['id'] == row['id'], 'status'] = result_
            urls_repo_df.loc[urls_repo_df['id'] == row['id'], 'scraping_date'] = \
                datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            urls_to_scrape_df.loc[urls_to_scrape_df['id'] == row['id'],\
                                  'status'] = result_
            
            
            self.logger.info("   --------------------------------------------   ")
            self.logger.info("File %d over %d" %(index + 1, \
                                                 len(urls_to_scrape_df.index)))
            self.logger.info("   --------------------------------------------   ")
            print("File %d over %d" %(index + 1, \
                                                 len(urls_to_scrape_df.index)))
            
        
        urls_repo_df.to_csv(urls_file, sep=self.config.get('conf', 'default_sep'),\
                       index = False, encoding='utf-8')
        
        self.logger.info(". . . . . . . . S T A T S . . . . . . . .")
        self.logger.info(str(urls_to_scrape_df.groupby('status').size()))
        self.logger.info(" . . . . END . . . . ")
        print('Extracting pdf done')


# Process pdfs

In [4]:
class EncansProcessor:
    def __init__(self):
        self.config = configparser.ConfigParser()
        self.config.read('encans.ini')
        self.deals_home_urls = self.config.get('site', 'deals_home_urls')
        self.deals_histored_urls = self.config.get('site', 'deals_histored_urls')
        self.deals_caming_urls = self.config.get('site', 'deals_caming_urls')
        self.logger = logging.getLogger('encans')
        
        logfile = self.config.get('dir', 'datadir') + 'logs\encans_proc__' \
        + str(datetime.now().date()) + '.log'
        hdlr = logging.FileHandler(logfile)
        formatter = logging.Formatter('%(asctime)s -- %(funcName)s -- '\
                                      '%(levelname)s -- %(message)s')
        hdlr.setFormatter(formatter)
        self.logger.addHandler(hdlr) 
        self.logger.setLevel(logging.INFO)
        
        self.logger.info('*********     *********     *********     *********     *********')
        self.logger.info('*********     '+ str(self.__class__) +' Started     *********')
    
    @timeit
    def processDeals(self):
        print("Processing ...")
        scraping_dir = self.config.get('dir', 'datadir') + '/scraping/'
        #scraping_dir = "C:/Baawngal/Data Analysis/encans/Temps/"
        ### Read all .csv (rects and texts) to one pandans Dataframe
        text_files_wildcard = glob.glob(scraping_dir + "encans_text__*.csv")
        text_files_list = []
        
        rect_files_wildcard = glob.glob(scraping_dir + "encans_rects__*.csv")
        rect_files_list = []

        for file_ in text_files_wildcard:
            t_df = pd.read_csv(file_,index_col=None, header=0, \
                             sep=self.config.get('conf', 'default_sep'))
            text_files_list.append(t_df)
            
        self.logger.info(str(len(text_files_list)) + ' text files to process')
            
        for f in rect_files_wildcard:
            r_df = pd.read_csv(f, index_col=None, header=0, \
                             sep=self.config.get('conf', 'default_sep'))
            rect_files_list.append(r_df)
            
        self.logger.info(str(len(rect_files_list)) + ' rect files to process')
            
        """
        """""" Processing Texts
        """
        df_text = pd.concat(text_files_list, axis = 0, ignore_index = True)
        
        # Separate pdf right columns to left columns
        df_text_agg = df_text.groupby(['pdf_id', 'page_number'], as_index=False)\
                             .agg({'x0':min, 'x1':max})
        df_text_agg = df_text_agg.rename(columns={'x0':'page_min_x0', 'x1':'page_max_x1'})
        df_text = pd.merge(df_text, df_text_agg, on=['pdf_id', 'page_number'], how='left')
        df_text = df_text.assign(mid_x = lambda x: (x.page_min_x0 + x.page_max_x1)/2)
        df_text["col_type"] = np.where(df_text['x0']<= df_text['mid_x'], 'left', 'right')
        
        # Group text by lines        
        df_lines = df_text.groupby(['pdf_id', 'page_number', 'y0', 'y1', 'col_type'], \
                                   as_index=False)\
                          .agg({'text': sum, 'x0':min, 'x1':max, 'fontname':max})
        df_text = None
        
        """
        """""" Processing Rects
        """
        
        df_rects = pd.concat(rect_files_list, axis = 0, ignore_index = True)
        # Calculate width, hight, min - max - dix x and column type (roght or left)
        df_rects = df_rects.assign(height = lambda x: x.y1 - x.y0) \
                           .assign(width = lambda x: x.x1 - x.x0) 
        df_rects_agg = df_rects.groupby(['pdf_id', 'page_number'], as_index=False)\
                               .agg({'x0':min, 'x1':max})
        df_rects_agg = df_rects_agg.rename(columns={"x0":"rect_page_min_x0",\
                                                    "x1":"rect_page_max_x1"})
        
        df_rects = pd.merge(df_rects, df_rects_agg, on=['pdf_id', 'page_number'], how='left')
        df_rects = df_rects.assign(rect_mid_x = lambda x: \
                                   (x.rect_page_min_x0 + x.rect_page_max_x1)/2)
        df_rects["col_type"] = \
            np.where(df_rects['x0']<= df_rects['rect_mid_x'], 'left', 'right')
        
        df_rects_agg = df_rects.groupby(['pdf_id','page_number','col_type'],as_index=False)\
                               .agg({'height':max, 'width':max})
        df_rects_agg = df_rects_agg.rename(columns={"height":"rect_page_max_height", \
                                                    "width":"rect_page_max_width"})
        df_rects = pd.merge(df_rects, df_rects_agg, on=['pdf_id','page_number','col_type'],\
                            how='left')
        """
        """""" Keep only interesting rects
        """
        # Keep rects with width enought large
        df_rects = df_rects[df_rects.width >= df_rects.rect_page_max_width * 0.7]
        
        # Sort rects and get the following rect.y0 for each rect, calculate the gap between
        # this rect and it's following. Then get the max gap by page in order to eliminate
        # non useful rects
        df_rects = df_rects.sort_values(['pdf_id','page_number','col_type', 'y0'],\
                                            ascending=[True, True, True, False])
        df_rects['rect_next_y0'] = df_rects['y0'].shift(-1)
        df_rects['rect_gap_y0'] = df_rects['y0'] - df_rects['rect_next_y0']
        
        df_rects_agg = df_rects.groupby(['pdf_id','page_number','col_type'],as_index=False)\
                               .agg({'rect_gap_y0':max})
        df_rects_agg = df_rects_agg.rename(columns={"rect_gap_y0":"page_max_gap_y0"})
        df_rects = pd.merge(df_rects, df_rects_agg, on=['pdf_id','page_number','col_type'],\
                            how='left')
        
        # Keep rects with enought large gap (ignore rects wich are too close to
        # their follower)
        df_rects = df_rects[df_rects.rect_gap_y0 >= df_rects.page_max_gap_y0 * 0.7]

        # Join texts and rects on pdf_id, page_number, col_type and text y0 between 
        # rect rect y0 and next rect y0
        df_lines = pd.merge(df_lines, df_rects, on=['pdf_id', 'page_number', 'col_type'],\
                            how='left', suffixes=('', '_rect'))
        
        df_lines = df_lines[(df_lines.y0 <= df_lines.y0_rect) & \
                            (df_lines.y0 >= df_lines.rect_next_y0)] 
        
        #Pivot data
        df_lines['col_id'] =\
            df_lines.groupby(['pdf_id','page_number','col_type', 'y0_rect'])\
                    .cumcount(ascending=False)+1
        
        df_lines = df_lines.sort_values(['pdf_id','page_number','col_type', 'y0'],\
                                            ascending=[True, True, True, False])
        df_lines = pd.pivot_table(df_lines,\
                                  index=['pdf_id','page_number','col_type', 'y0_rect'],\
                                  values=['text'], columns=['col_id'], aggfunc=np.max)
        df_lines.reset_index()
        df_lines = pd.DataFrame(df_lines.to_records())
        df_lines.columns = [name.replace("('text', ", "col").replace(")", "")\
                            for name in df_lines.columns]
        
        self.logger.info('Dataframe generated =======>> ')
        
        self.logger.info(str(df_lines.info()))
        
        dataFile = self.config.get('dir', 'datadir') + \
            'scraping\encans_data_1__' + str(datetime.now().date()) + '.csv'
        df_lines.to_csv(dataFile, sep = self.config.get('conf', 'default_sep'), \
                      index = False, encoding='utf-8')
        self.logger.info('Data saved to ' + dataFile)
        
        print('Processing done!')

        return df_lines

In [5]:
#Process urls
um = UrlsManager()
df = um.processUrls()

pe = PdfExtractor()
pe.downloadDeals()
ep = EncansProcessor()
df = ep.processDeals()

Processing urls done !
Downloading pdf files ...
'extract_layout_by_page' - 0 heures 0 minutes 8.03 secondes.
'downloadPDFData' - 0 heures 0 minutes 9.31 secondes.
File 68 over 1
Extracting pdf done
'downloadDeals' - 0 heures 0 minutes 9.38 secondes.
Processing ...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 15 columns):
pdf_id         1456 non-null object
page_number    1456 non-null float64
col_type       1456 non-null object
y0_rect        1456 non-null float64
col1           1456 non-null object
col2           1456 non-null object
col3           1456 non-null object
col4           1455 non-null object
col5           1445 non-null object
col6           1434 non-null object
col7           1403 non-null object
col8           1363 non-null object
col9           1200 non-null object
col10          663 non-null object
col11          77 non-null object
dtypes: float64(2), object(13)
memory usage: 170.7+ KB
Processing done!
'processDeals' - 0 heures 0 minutes 5.41 secondes.
