# **1. Install Libraries**

In [None]:
!pip install facebook-scraper

Collecting facebook-scraper
  Downloading https://files.pythonhosted.org/packages/64/86/481a196ae84941a34a2172be709c61f12c528ae952995cdedf2d348f47ab/facebook_scraper-0.2.3-py3-none-any.whl
Collecting html2text<2021.0.0,>=2020.1.16
  Downloading https://files.pythonhosted.org/packages/ae/88/14655f727f66b3e3199f4467bafcc88283e6c31b562686bf606264e09181/html2text-2020.1.16-py3-none-any.whl
Collecting requests-html<0.11.0,>=0.10.0
  Downloading https://files.pythonhosted.org/packages/24/bc/a4380f09bab3a776182578ce6b2771e57259d0d4dbce178205779abdc347/requests_html-0.10.0-py3-none-any.whl
Collecting fake-useragent
  Downloading https://files.pythonhosted.org/packages/d1/79/af647635d6968e2deb57a208d309f6069d31cb138066d7e821e575112a80/fake-useragent-0.1.11.tar.gz
Collecting pyquery
  Downloading https://files.pythonhosted.org/packages/78/43/95d42e386c61cb639d1a0b94f0c0b9f0b7d6b981ad3c043a836c8b5bc68b/pyquery-1.4.1-py2.py3-none-any.whl
Collecting pyppeteer>=0.0.14
[?25l  Downloading https://fil

In [None]:
!pip install xlsxwriter

Collecting xlsxwriter
[?25l  Downloading https://files.pythonhosted.org/packages/c8/a2/e5f28b67b7d883c9a6585c0ef32b4bb002bff0292b3008f3d6d3fc7eee59/XlsxWriter-1.2.9-py2.py3-none-any.whl (141kB)
[K     |██▎                             | 10kB 17.7MB/s eta 0:00:01[K     |████▋                           | 20kB 1.7MB/s eta 0:00:01[K     |███████                         | 30kB 2.2MB/s eta 0:00:01[K     |█████████▎                      | 40kB 2.5MB/s eta 0:00:01[K     |███████████▋                    | 51kB 2.0MB/s eta 0:00:01[K     |█████████████▉                  | 61kB 2.3MB/s eta 0:00:01[K     |████████████████▏               | 71kB 2.5MB/s eta 0:00:01[K     |██████████████████▌             | 81kB 2.7MB/s eta 0:00:01[K     |████████████████████▉           | 92kB 2.9MB/s eta 0:00:01[K     |███████████████████████▏        | 102kB 2.7MB/s eta 0:00:01[K     |█████████████████████████▌      | 112kB 2.7MB/s eta 0:00:01[K     |███████████████████████████▊    | 122kB 2.7M

# **2. Import Libraries**

In [None]:
from facebook_scraper import get_posts
import pandas as pd
import time

# **3. Define Helper Functions**

In [None]:
# Units used by display_time, ordered from largest to smallest.
# Each entry is (plural unit name, number of seconds in one unit).
intervals = (
    ('weeks', 604800),  # 60 * 60 * 24 * 7
    ('days', 86400),    # 60 * 60 * 24
    ('hours', 3600),    # 60 * 60
    ('minutes', 60),
    ('seconds', 1),
    )

def display_time(seconds, granularity=2):
    """Format a duration in seconds as human-readable text.

    The duration is broken down into the units listed in ``intervals``;
    units whose count is zero are skipped, and a unit whose count is
    exactly 1 is printed in singular form.

    Args:
        seconds: Duration in seconds. Floats (for example the difference of
            two ``time.time()`` readings) are truncated to whole seconds so
            the output reads "1 second" rather than "1.0 second". Negative
            values are treated as zero.
        granularity: Maximum number of non-zero units to include, counted
            from the largest unit downwards.

    Returns:
        A string such as "1 hour, 1 minute". Durations shorter than one
        second yield "0 seconds" instead of an empty string.
    """
    # Work on whole, non-negative seconds: float input would otherwise leak
    # ".0" into the text and negative input would floor-divide to -1 weeks.
    remaining = max(int(seconds), 0)
    result = []

    for name, count in intervals:
        value, remaining = divmod(remaining, count)
        if value:
            if value == 1:
                name = name.rstrip('s')
            result.append("{} {}".format(value, name))

    if not result:
        return "0 seconds"
    return ', '.join(result[:granularity])

In [None]:
import re
from bs4 import BeautifulSoup
import lxml
from nltk.tokenize import WordPunctTokenizer

def text_cleaner(text):
  """Reduce raw post text to lowercase alphanumeric words.

  Processing steps, in order:
    1. Extract the text content with BeautifulSoup (lxml parser).
    2. Remove @mentions, URLs with a scheme, and any token containing
       "twitter.com".
    3. Replace every character that is not an ASCII letter or digit with a
       space.
    4. Lowercase, tokenize with WordPunctTokenizer, and re-join the tokens
       with single spaces.

  Args:
    text: The post text. Values that are neither ``str`` nor ``bytes``
      (for example ``None`` or NaN for a missing post body) produce an
      empty string instead of raising.

  Returns:
    The cleaned text as a single space-separated string.
  """
  # Missing values cannot be parsed as markup; treat them as empty text.
  if not isinstance(text, (str, bytes)):
    return ""

  mention_pat = r'@[A-Za-z0-9]+'
  url_pat = r'\w+:\/\/\S+'
  # The dot is escaped so that only a literal "twitter.com" matches.
  twitter_pat = r'\S*twitter\.com\S*'
  combined_pat = r'|'.join((mention_pat, url_pat, twitter_pat))

  souped = BeautifulSoup(text, 'lxml').get_text()
  stripped = re.sub(combined_pat, '', souped)

  # A former try/except here called stripped.decode(...), which Python 3
  # strings do not have, so it always fell through to the unchanged text.
  # The "?" it meant to insert would be blanked by the substitution below
  # anyway, so the step is dropped rather than hidden behind a bare except.
  letters_only = re.sub("[^a-zA-Z0-9]", " ", stripped)
  words = WordPunctTokenizer().tokenize(letters_only.lower())
  return " ".join(words).strip()

# **4. Define Crawling Functions**

In [None]:
def generate_excel(generator, id, clean, path="/content/"):
  """Collect scraped posts into a DataFrame and save them as an Excel file.

  Args:
    generator: Iterable of post dictionaries, each expected to carry a
      'text' entry. Posts whose text is empty or missing are skipped.
    id: Identifier of the crawled group/page; used as the file name
      (``<id>.xlsx``).
    clean: When true, every post text is passed through ``text_cleaner``.
    path: Directory the Excel file is written to. Defaults to "/content/",
      the directory this notebook originally wrote to.

  Returns:
    The DataFrame that was written to disk.
  """
  import os  # local import: only needed to build the output path

  # Keep only posts that actually have text ('' and None are both skipped).
  data = [post for post in generator if post.get('text')]
  df = pd.DataFrame(data)

  # errors='ignore' keeps an empty crawl (no columns at all) from raising.
  df = df.drop(columns=['post_text'], errors='ignore')

  if clean and 'text' in df.columns:
    df['text'] = df['text'].apply(text_cleaner)

  output_file = os.path.join(path, f"{id}.xlsx")
  # The context manager writes and closes the workbook exactly once; the
  # previous save() + close() pair finalised it twice.
  with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False)

  print(f"Crawling is complete! get: {df.shape[0]} datas")
  return df

In [None]:
def get_group_post(group_id, pages=1, clean=False):
  """Crawl the posts of a Facebook group and export them to Excel.

  Args:
    group_id: The group id taken from the group's URL. For example, for
      https://www.facebook.com/groups/101812157281468 the group id is
      101812157281468.
    pages: Number of pages of the group to crawl.
    clean: False by default; set to True to clean the text of each post.

  Returns:
    The DataFrame produced by ``generate_excel``, so the caller can keep
    working with the crawled posts (previously nothing was returned).
  """
  start_time = time.time()
  post_result = get_posts(group=group_id, pages=pages)

  # get_posts hands back the posts to iterate; they are consumed here.
  df = generate_excel(post_result, group_id, clean)
  print("Exe Time : ", display_time(time.time() - start_time))
  return df

In [None]:
def get_page_post(page_id, pages=1, clean=False):
  """Crawl the posts of a Facebook page and export them to Excel.

  Args:
    page_id: The page id taken from the page's URL. For example, for
      https://www.facebook.com/BRIofficialpage/ the page id is
      BRIofficialpage.
    pages: Number of pages of the Facebook page to crawl.
    clean: False by default; set to True to clean the text of each post.

  Returns:
    The DataFrame produced by ``generate_excel``, so the caller can keep
    working with the crawled posts (previously nothing was returned).
  """
  start_time = time.time()
  post_result = get_posts(page_id, pages=pages)

  # get_posts hands back the posts to iterate; they are consumed here.
  df = generate_excel(post_result, page_id, clean)
  print("Exe Time : ", display_time(time.time() - start_time))
  return df

# **5. Run Code!!!**

In [None]:
# Crawl two pages of the example group and clean the text of each post.
df = get_group_post(group_id='101812157281468', pages=2, clean=True)

Crawling is complete! get: 31 datas
Exe Time :  1.0 second


In [None]:
# Crawl two pages of the example Facebook page, keeping the raw post text.
df = get_page_post(page_id='BRIofficialpage', pages=2)

Crawling is complete! get: 6 datas
Exe Time :  1.0 second
