# Συλλογή δεδομένων

Επικεντρωθήκαμε στην κοινότητα "Φυτά, Βότανα Μυστικά" του Facebook, όπου οι διαχειριστές κοινοποιούν άρθρα δημοσιευμένα σε σελίδες και blogs. Η συντριπτική πλειοψηφία των αναρτήσεων προέρχεται από τη σελίδα ["Προϊόντα της Φύσης"](https://www.proionta-tis-fisis.com/). Επίσης, υπάρχουν λίγες αναρτήσεις από τις εξής πηγές: [medialabnews.gr IATRIKANEA](https://medlabgr.blogspot.com/) και [itrofi](https://www.itrofi.gr/), οι οποίες έχουν ειδικές κατηγορίες για βότανα και διατροφή, αλλά και από τις [Εναλλακτική Δράση](https://enallaktikidrasi.com/), [awakengr.com](https://www.awakengr.com/), [EarthShareMe.com](https://earthshareme.com/) και [flowmag](https://www.flowmagazine.gr/), στις οποίες αναρτώνται κάποια σχετικά άρθρα ανάμεσα σε άλλα. Σκεφτήκαμε να συλλέξουμε τα δεδομένα κατευθείαν από τις σελίδες/blogs.

In [1]:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import numpy as np
import pandas as pd
import requests
import re        



def get_embedded_links_in_a_webpage(webpage, url_prefix):
    """
    Gets the links that are embedded in a webpage.

    Every ``<a>`` tag that carries an ``href`` attribute contributes one
    entry. No filtering or de-duplication is done here.

    Parameters
    ----------
    webpage : str
        the url of the webpage that we want to scrape

    url_prefix : str
        the beginning of the links that is not included
        in the 'href' attribute if any; it is prepended only to
        hrefs that are not already absolute

    Returns
    -------
    list
        the list of the embedded links in the given webpage
    """
    # Local import: keeps this function self-contained without touching
    # the module-level import block.
    from urllib.parse import urlsplit

    def _with_prefix(href):
        # An href that already has a scheme (http:, mailto:, javascript:)
        # or a host (//cdn.example.com/x) is complete; gluing the prefix
        # in front of it would produce an invalid URL.
        try:
            parts = urlsplit(href)
        except ValueError:
            # Unparseable href: fall back to plain concatenation.
            return url_prefix + href
        if parts.scheme or parts.netloc:
            return href
        return url_prefix + href

    http = httplib2.Http()
    _headers, content = http.request(webpage)
    # Name the parser explicitly so results do not depend on which optional
    # parsers are installed; 'html.parser' honours parse_only.
    anchors = BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a'))
    return [_with_prefix(anchor['href']) for anchor in anchors.find_all('a', href=True)]



def get_embedded_links_in_multiple_pages(homepage, start_page_num, end_page_num, slash_at_the_end, url_prefix):
    """
    Walks over a range of numbered pages of a website and gathers
    the links embedded in each of them.

    The url of every page is built as
    ``homepage + <page number> + ('/' if slash_at_the_end else '')``.

    Parameters
    ----------
    homepage : str
        the fixed part of the url that precedes the page number

    start_page_num : int
        the number of the first page to visit

    end_page_num : int
        the page number at which to stop; this page itself is NOT
        visited (the bound is exclusive, as in ``range``)

    slash_at_the_end : boolean
        True if a slash has to follow the page number in the url
        else False

    url_prefix : str
        the beginning of the retrieved links that is not
        included in the 'href' attribute if any

    Returns
    -------
    numpy.ndarray
        the distinct links found in the visited pages, sorted
        (the result of ``np.unique``)
    """
    suffix = '/' if slash_at_the_end else ''
    collected = []
    for page_num in range(start_page_num, end_page_num):
        page_url = homepage + str(page_num) + suffix
        collected.extend(get_embedded_links_in_a_webpage(page_url, url_prefix))
    # np.unique both removes duplicates and sorts the links.
    return np.unique(collected)



def remove_irrelevant_links(links):
    """
    Filters out embedded links that are not articles (ads, menu
    options, social buttons etc.).

    A link is kept when either of the following holds:

    * it contains the itrofi herb-article path, or
    * it belongs to proionta-tis-fisis.com, contains none of the
      banned fragments and is not one of the explicitly banned pages.

    Parameters
    ----------
    links : list
        the initial list of links that probably contains
        irrelevant links

    Returns
    -------
    list
        the kept links, in their original order
    """
    site = 'https://www.proionta-tis-fisis.com'
    itrofi_article_path = 'https://www.itrofi.gr/fytika/votana/article/'
    # Substrings that mark navigation, social or utility links.
    banned_fragments = (
        'category', '#', 'epikoinonia', 'shop', 'twitter', 'facebook',
        'author', 'javascript', 'mailto', 'oroi-xrisis-istotopou',
    )
    # Pages excluded by exact match only.
    banned_pages = (
        'https://www.proionta-tis-fisis.com/prosoxi',
        'https://www.proionta-tis-fisis.com/',
        'https://www.proionta-tis-fisis.com',
        'https://www.proionta-tis-fisis.com/i-zoi-einai-mikri-gia-na-zoume-kleismenoi-sto-asfales-koutaki-mas/',
        'https://www.proionta-tis-fisis.com/synaisthimatiki-yperfagia-aisthima-katoterotitas-kai-antimetopisi/',
        'https://www.proionta-tis-fisis.com/to-synoliko-oikologiko-apotypoma-ton-ananeosimon-pigon-energeias-den-ehei-dierevnithei-eparkos/',
        'https://www.proionta-tis-fisis.com/tropoi-syntirisis-ton-trofimon-pro-psygeiou/',
        'https://www.proionta-tis-fisis.com/yparhei-zoi-prin-ton-thanato/',
    )

    def _is_relevant(link):
        if itrofi_article_path in link:
            return True
        if site not in link:
            return False
        if any(fragment in link for fragment in banned_fragments):
            return False
        return all(link != page for page in banned_pages)

    return [link for link in links if _is_relevant(link)]



def get_article_in_url(url, timeout=30):
    """
    Retrieves text in the page of the given url.

    The page is downloaded and parsed with html5lib; ``<script>``,
    ``<style>`` and ``<aside>`` elements are removed, and every run of
    newlines/tabs in the remaining text is collapsed to one space.

    Parameters
    ----------
    url : str
        the url of the webpage that is about to be scraped

    timeout : float or tuple, optional
        seconds to wait for the server, passed to ``requests.get``;
        without it a single stalled server would block the whole
        scrape indefinitely

    Returns
    -------
    str
        the text displayed in the given url

    Raises
    ------
    requests.exceptions.Timeout
        if the server does not answer within ``timeout``
    """
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the visible article text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))



def get_articles(links):
    """
    Downloads the text of every given url, one request per link.

    Parameters
    ----------
    links : list
        the urls of the webpages to scrape

    Returns
    -------
    list
        the page texts, in the same order as ``links``
    """
    return [get_article_in_url(url) for url in links]



def clean_articles(articles, start_phrase, end_phrase):
    """
    Removes irrelevant content in the beginning and at the end
    of an article.

    For each article, everything up to and including the first
    occurrence of ``start_phrase`` is dropped, then everything from the
    first occurrence of ``end_phrase`` onwards is dropped, and the result
    is stripped of surrounding whitespace. A phrase that is empty or does
    not occur in an article leaves that side of the article untouched, so
    the output always has one entry per input article.

    Parameters
    ----------
    articles : list
        the list of the given articles

    start_phrase : str
        the first breaking point at which the text will be split

    end_phrase : str
        the second breaking point at which the text will be split

    Returns
    -------
    list
        the list of the cleaned articles after removing the unnecessary
        content in the beginning and at the end of each one of them
    """
    cleaned = []
    for article in articles:
        if start_phrase:
            _before, found, after = article.partition(start_phrase)
            # Only cut when the marker is really there; otherwise keep the
            # whole text instead of failing on a missing split part.
            if found:
                article = after
        if end_phrase:
            article = article.partition(end_phrase)[0]
        cleaned.append(article.strip())
    return cleaned



def create_dataset(homepage, start_page_num, end_page_num, slash_at_the_end, url_prefix, output_csv_filename, start_phrase, end_phrase):
    """
    Builds the final dataset, one row per article with its link and
    its text, and stores it as a csv file.

    The steps are: collect the links of the listing pages, drop the
    irrelevant ones, download the text behind each remaining link, trim
    the texts between the two phrases and write the result to disk.

    Parameters
    ----------
    homepage : str
        the fixed part of the url that precedes the page number

    start_page_num : int
        the first page number, forwarded to
        get_embedded_links_in_multiple_pages

    end_page_num : int
        the end page number, forwarded to
        get_embedded_links_in_multiple_pages

    slash_at_the_end : boolean
        True if there is a slash at the end of the page url
        else False

    url_prefix : str
        the beginning of the retrieved links that is not
        included in the 'href' attribute if any

    output_csv_filename : str
        the path of the output csv file where the dataset
        will be stored

    start_phrase : str
        the phrase that marks where the useful text begins

    end_phrase : str
        the phrase that marks where the useful text ends
    """
    article_links = remove_irrelevant_links(
        get_embedded_links_in_multiple_pages(homepage, start_page_num, end_page_num, slash_at_the_end, url_prefix)
    )
    texts = clean_articles(get_articles(article_links), start_phrase, end_phrase)
    rows = list(zip(article_links, texts))
    # index=False: the csv holds only the 'link' and 'text' columns.
    pd.DataFrame(rows, columns=['link', 'text']).to_csv(output_csv_filename, index=False)

In [2]:
# Scrape the itrofi herb listing (page numbers 0 up to, not including, 10).
create_dataset(
    homepage='https://www.itrofi.gr/fytika/votana?page=',
    start_page_num=0,
    end_page_num=10,
    slash_at_the_end=False,
    url_prefix='https://www.itrofi.gr',
    output_csv_filename='../data/itrofi_votana.csv',
    start_phrase='ΒΟΤΑΝΑ          ',
    end_phrase='Tags:',
)

In [3]:
# Scrape the proionta-tis-fisis herb category (page numbers 1 up to, not including, 68).
create_dataset(
    homepage='https://www.proionta-tis-fisis.com/category/votana/page/',
    start_page_num=1,
    end_page_num=68,
    slash_at_the_end=True,
    url_prefix='',
    output_csv_filename='../data/proionta_tis_fisis_votana.csv',
    start_phrase='ΠερισσοτεραΑναζήτησηΑρχική',
    end_phrase='Προηγούμενο άρθρο',
)