# Scraping NextDoor NL

The script is used to scrape Nextdoor feed.

In [70]:
import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import itertools

# Scraping on NextDoor

It is not possible to use scrapy and bs4 alone. Selenium will be used to simulate scrolling.

In [311]:
import selenium
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

#initiate chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) #install webdriver

driver.get("https://nextdoor.nl/news_feed/") #open nextdoor on chrome driver

#Manual login and go to neighbourhood page

In [320]:
#get oldest date for scrolling
import time

def find_min_date(target_date):
    list_of_byline = driver.find_elements(By.CLASS_NAME, "post-byline-redesign") #find by-line (containing neighbourhood name and date)
    list_of_date = []

    target = datetime.datetime.strptime(target_date,"%d/%m/%Y") #pass target date to target 

    for i in range(1, len(list_of_byline), 2): #for loop with 2 as increment interval (index of odd numbers are dates)
        temp_date = list_of_byline[i].text #pass date to temp_date 

        temp_date = dutch_to_eng(temp_date) #change dutch date to eng date
        temp_date = changedate(temp_date) #change date format

        list_of_date.append(temp_date) #append date to list of dates

    min_date = min(list_of_date) #find oldest date 

    min_date = datetime.datetime.strptime(min_date,"%d/%m/%Y") #pass formatted oldest date to min_date
    days_diff = min_date - target #calculate the date difference between oldest date and target date 

    return days_diff

#scroll by date
def scroll_by_date(target_date): #user input has to be %d/%m/%Y
    days_diff = find_min_date(target_date) #find date difference

    while days_diff.days > 0: #if there is still date difference (i.e., diff >= 0), keep scrolling
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)") #scroll
        time.sleep(7) #wait 7 seconds for the page to load

        days_diff = find_min_date(target_date) #find again the oldest date in the updated list of dates 

In [321]:
scroll_by_date("02/02/2022") #

180 days, 0:00:00


In [219]:
#Format weird date to normal date
import re 
def format_weird_date(weird_date):
    today = datetime.datetime.today()

    days_from_now = int(re.sub(r"[A-Za-z]","",weird_date).strip()) #replace all letters with spaces and trim trailing whitespace
    
    pretty_date = today - datetime.timedelta(days_from_now) #find date with day difference from Nextdoor format (x dag geleden)
    
    pretty_date = datetime.datetime.strftime(pretty_date,"%d/%m/%Y") #format dates into dmY format 

    return pretty_date

In [220]:
#change Dutch spelling into english
import datetime

def replace_multiple(text, replace):
    for org, subs in replace.items(): 
        text = text.replace(org, subs) 
    return text

def dutch_to_eng(date):
    global eng_date

    replace_words = {'mrt':'mar','mei':'may','okt':'oct'} #dict for replacement 

    if bool(re.match(r".*geleden$", date)) == False: #simply replace words if the format is regular
        eng_date = replace_multiple(date,replace_words)
    else:
        eng_date = format_weird_date(date) #use format_weird_date function if the format is unstandard
    
    return eng_date

In [222]:
#parse date and time of post 
def changedate(date):
    try:
        changed_date = datetime.datetime.strptime(date, '%d %b.').strftime('%d/%m/2022') #format date 

        return changed_date

    except ValueError:
        return date
    except TypeError:
        pass

In [223]:
#get reaction
def get_reaction():
    try:
        temp_reaction_count = post_grid[i].find_element(By.CLASS_NAME, "_3n6Fnd22").text
        return temp_reaction_count
    except NoSuchElementException:
        temp_reaction_count = 0
        return temp_reaction_count
        

In [224]:
#get links from result lists
from bs4 import element
from pandas import DataFrame
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

post_grid = driver.find_elements(By.CLASS_NAME, "css-15wtqd7") #pass list of individual posts to post_grid

post = []
text = []
comment_count = []
post_date = []
reaction_count = []


for i in range(len(post_grid)):
    try:
        while i == 0: #click on all "more comments" button for the first iteration
            post_grid[i].find_element(By.CLASS_NAME, "css-adjc8x").click() #class css-adjc8x is the class name for more comments

        post_id = post_grid[i].get_attribute("id") #get the post id 

        #get post text
        post_text = post_grid[i].find_element(By.CLASS_NAME, "content-body").text #get body text from post[i]

        #get post comment counts 
        comment = post_grid[i].find_element(By.CLASS_NAME, "comment-container").find_elements(By.CLASS_NAME, "Linkify") #return list of comment (if any)

        temp_comment_count = len(comment)
            
        #get post date
        byline = post_grid[i].find_elements(By.CLASS_NAME, "post-byline-redesign") #getting two elements from byline

        date = byline[1].text #pass the second element to date 

        eng_date = dutch_to_eng(date) #change dutch date to eng date

        changed_date = changedate(eng_date) #reformat date 
        
        #get reactions, including likes and other emojis
        temp_reaction_count = get_reaction()

        post.append(post_id) 
        text.append(post_text)
        comment_count.append(temp_comment_count)
        reaction_count.append(temp_reaction_count)
        post_date.append(changed_date)

        print(f"Processed post: {i}")
        

    except NoSuchElementException:
        pass

Processed post: 1
Processed post: 2
Processed post: 3
Processed post: 4
Processed post: 5
Processed post: 6
Processed post: 7
Processed post: 8
Processed post: 9
Processed post: 10
Processed post: 11
Processed post: 12
Processed post: 13
Processed post: 14
Processed post: 15
Processed post: 16
Processed post: 17
Processed post: 18
Processed post: 19
Processed post: 20
Processed post: 21
Processed post: 22
Processed post: 23
Processed post: 24
Processed post: 25
Processed post: 26
Processed post: 27
Processed post: 28
Processed post: 29
Processed post: 30
Processed post: 31
Processed post: 32
Processed post: 33
Processed post: 34
Processed post: 35
Processed post: 36
Processed post: 37
Processed post: 38
Processed post: 39


In [225]:
#Create data frame
all_post = pd.DataFrame(columns=['ID','Text','Comment Count'])

all_post['ID'] = post
all_post['Text'] = text
all_post['Comment Count'] = comment_count
all_post['Date'] = post_date
all_post['Reaction Count'] = reaction_count

all_post.head(20)

Unnamed: 0,ID,Text,Comment Count,Date,Reaction Count
0,s_17592212454729,Douvle Snak NC is zojuist lid geworden van Da...,0,15/10/2022,0
1,s_17592212390557,Hallo allemaal. 👋 Ik ben Marieke. Ik woon in d...,1,12/10/2022,0
2,s_17592212424616,"Ik ben op zoek naar behang, liefst een lichte ...",1,14/10/2022,0
3,s_17592212409493,"Ik ben Sofi,\nWij bieden u onze schoonmaakser...",0,13/10/2022,1
4,s_17592212390716,Onze kat Jopie is vermist sinds 4 oktober. We ...,1,12/10/2022,5
5,s_17592212155269,Hi! \nWe live in Dapperbuurt. We are a music t...,1,29/09/2022,4
6,s_17592212265779,"Hallo buren, hi neighbours,\nMijn dochter heef...",1,05/10/2022,4
7,s_17592212246236,Heb je hulp nodig bij het schoonmaken van je h...,0,04/10/2022,1
8,s_17592211940332,"Beste buren,\nWe are a musician couple who liv...",1,17/09/2022,18
9,s_17592212129961,"Morgen om 18u at Nieuwland, Pieter Nieuwlandst...",0,28/09/2022,1


In [226]:
#Specific function for actual comment as it requires a set of different data structure 

comment = []
comment_post_id = []

for i in range(len(post_grid)):
    try:
        post_id = post_grid[i].get_attribute("id") #get post id

        temp_comment_count = post_grid[i].find_element(By.CLASS_NAME, "comment-container").find_elements(By.CLASS_NAME, "Linkify") #return list of comment (if any)
        
        if temp_comment_count != 0:
            for comment_num in range(len(temp_comment_count)):
                comment_text = post_grid[i].find_element(By.CLASS_NAME, "comment-container").find_elements(By.CLASS_NAME, "Linkify")[comment_num].text

                comment.append(comment_text)
                comment_post_id.append(post_id)
    except NoSuchElementException:
        pass

In [227]:
#Create comment texts dataframe
detailed_comment = pd.DataFrame(columns=["ID","Comment"])

detailed_comment['ID'] = comment_post_id
detailed_comment['Comment'] = comment

#Merge dataframe
all_post = all_post.merge(detailed_comment, on = "ID", how = "left")

all_post.head(20)

Unnamed: 0,ID,Text,Comment Count,Date,Reaction Count,Comment
0,s_17592212454729,Douvle Snak NC is zojuist lid geworden van Da...,0,15/10/2022,0,
1,s_17592212390557,Hallo allemaal. 👋 Ik ben Marieke. Ik woon in d...,1,12/10/2022,0,Ok ik houd het in de gaten voor je. Sterkte!
2,s_17592212424616,"Ik ben op zoek naar behang, liefst een lichte ...",1,14/10/2022,0,Tip: Ik zag wat rolletjes behang liggen in de ...
3,s_17592212409493,"Ik ben Sofi,\nWij bieden u onze schoonmaakser...",0,13/10/2022,1,
4,s_17592212390716,Onze kat Jopie is vermist sinds 4 oktober. We ...,1,12/10/2022,5,Enorm zielig. Wat n mooierdje.
5,s_17592212155269,Hi! \nWe live in Dapperbuurt. We are a music t...,1,29/09/2022,4,Heb je all imand ik heb wel tijd
6,s_17592212265779,"Hallo buren, hi neighbours,\nMijn dochter heef...",1,05/10/2022,4,Dank voor uw bericht!
7,s_17592212246236,Heb je hulp nodig bij het schoonmaken van je h...,0,04/10/2022,1,
8,s_17592211940332,"Beste buren,\nWe are a musician couple who liv...",1,17/09/2022,18,Hi Ela. Please add me for a PM - interested in...
9,s_17592212129961,"Morgen om 18u at Nieuwland, Pieter Nieuwlandst...",0,28/09/2022,1,
