# Scraping NextDoor NL

The script is used to scrape Nextdoor feed.

In [57]:
import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import itertools
import selenium
import datetime #for general date manipulation
import dateutil #for calculating year differences 

# Scraping on NextDoor

It is not possible to use scrapy and bs4 alone. Selenium will be used to simulate user clicks.

In [293]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import element
from pandas import DataFrame
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dateutil.relativedelta import relativedelta #for calculating year differences 

#initiate chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) #install webdriver

driver.get("https://nextdoor.nl/news_feed/") #open nextdoor on chrome driver

#Manual login and go to neighbourhood page

In [216]:
#Parse across-year date
def same_year_check(date):
    year_now = datetime.datetime.today().strftime("%Y") #get year now
    year_now = datetime.datetime.strptime(year_now, "%Y")
    
    today = datetime.datetime.today() #pass today date to var. today 
    
    diff_date = today - date
    diff_date = diff_date.days

    return diff_date

In [257]:
#Format weird date to normal date
def format_weird_date(weird_date):

    today = datetime.datetime.today() #set today

    #check if uur / minuten included
    uur_minute_check = ['minuten','uur'] #to list 

    if any(word in weird_date for word in uur_minute_check) == True: #check whether uur / minute is included
        pretty_date = today.strftime("%d/%m/%Y") # pass today date to pretty_date if uur/ minute is returned 
    else:
        days_from_now = int(re.sub(r"[A-Za-z]","",weird_date).strip()) #replace all letters with spaces and trim trailing whitespace
        
        pretty_date = today - datetime.timedelta(days_from_now) #find date with day difference from Nextdoor format (x dag geleden)
        
        pretty_date = datetime.datetime.strftime(pretty_date,"%d/%m/%Y") #format dates into dmY format 

    return pretty_date

In [258]:
#change Dutch spelling into english
def replace_multiple(text, replace):
    for org, subs in replace.items(): 
        text = text.replace(org, subs) 
    return text

def dutch_to_eng(date):
    global eng_date

    replace_words = {'mrt':'mar','mei':'may.','okt':'oct'} #dict for replacement 

    if bool(re.match(r".*geleden$", date)) == False: #simply replace words if the format is regular
        eng_date = replace_multiple(date,replace_words)
    else:
        eng_date = format_weird_date(date) #use format_weird_date function if the format is unstandard
    
    return eng_date

In [259]:
#parse date and time of post 
def changedate(date):
    year_now = datetime.datetime.today().strftime("%Y") #get year now (str)
    year_now_dtob = datetime.datetime.strptime(year_now, "%Y") #get year now (datetime object)

    try:
        changed_date = datetime.datetime.strptime(date, '%d %b.').strftime(f'%d/%m/{year_now}') #format date 

        changed_date_dtob = datetime.datetime.strptime(changed_date, "%d/%m/%Y")

        if same_year_check(changed_date_dtob) > 0:
            return changed_date
        else:
            last_year = year_now_dtob - relativedelta(years=1) #get last year 
            last_year = datetime.datetime.strftime(last_year, "%Y") #format last_year into %Y format

            changed_date = datetime.datetime.strptime(changed_date, "%d/%m/%Y").strftime(f'%d/%m/{last_year}') #format date 
            return changed_date

    except ValueError:
        try:
            changed_date = datetime.datetime.strptime(date, '%d %b. %y').strftime('%d/%m/%Y')
            return changed_date
        except ValueError:
            return date
    except TypeError:
        pass

In [264]:
#get oldest date for scrolling
import time

def find_min_date(target_date):
    list_of_byline = driver.find_elements(By.CLASS_NAME, "post-byline-redesign") #find by-line (containing neighbourhood name and date)
    list_of_date = []

    target = datetime.datetime.strptime(target_date,"%d/%m/%Y") #pass target date to target 

    for i in range(1, len(list_of_byline), 2): #for loop with 2 as increment interval (index of odd numbers are dates)
        temp_date = list_of_byline[i].text #pass date to temp_date 

        temp_date = dutch_to_eng(temp_date) #change dutch date to eng date
        temp_date = changedate(temp_date) #change date format 

        list_of_date.append(temp_date) #append date to list of dates

    list_of_date = [datetime.datetime.strptime(x, "%d/%m/%Y") for x in list_of_date] #use datetime object for min() function

    min_date = min(list_of_date) #find oldest date 

    days_diff = min_date - target #calculate the date difference between oldest date and target date 

    print(f"Days from target date: {days_diff}")

    return days_diff

#scroll by date
def scroll_by_date(target_date): #user input has to be %d/%m/%Y
    days_diff = find_min_date(target_date) #find date difference

    while days_diff.days > 0: #if there is still date difference (i.e., diff >= 0), keep scrolling
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)") #scroll
        time.sleep(7) #wait 7 seconds for the page to load

        days_diff = find_min_date(target_date) #find again the oldest date in the updated list of dates 

In [265]:
#get reaction
def get_reaction():
    try:
        temp_reaction_count = post_grid[i].find_element(By.CLASS_NAME, "_3n6Fnd22").text
        return temp_reaction_count
    except NoSuchElementException:
        temp_reaction_count = 0
        return temp_reaction_count
        

In [279]:
scroll_by_date("01/01/2021") 

Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 248 days, 0:00:00
Days from target date: 182 days, 0:00:00
Days from target date: 163 days, 0:00:00
Days from target date: 111 days, 0:00:00
Days from target date: 49 days, 0:00:00
Days from target date: -29 days, 0:00:00


In [294]:
#get links from result lists
post_grid = driver.find_elements(By.CLASS_NAME, "css-15wtqd7") #pass list of individual posts to post_grid

post = []
text = []
post_username = []
comment_count = []
post_date = []
reaction_count = []


for i in range(len(post_grid)):
    try:
        while i == 0: #click on all "more comments" button for the first iteration
            post_grid[i].find_element(By.CLASS_NAME, "css-adjc8x").click() #class css-adjc8x is the class name for more comments

        post_id = post_grid[i].get_attribute("id") #get the post id 

        #get post user
        post_user = post_grid[i].find_element(By.CLASS_NAME, "E7NPJ3WK").text

        #get post text
        post_text = post_grid[i].find_element(By.CLASS_NAME, "content-body").text #get body text from post[i]

        #get post comment counts 
        comment = post_grid[i].find_element(By.CLASS_NAME, "comment-container").find_elements(By.CLASS_NAME, "Linkify") #return list of comment (if any)

        temp_comment_count = len(comment)
            
        #get post date
        byline = post_grid[i].find_elements(By.CLASS_NAME, "post-byline-redesign") #getting two elements from byline

        date = byline[1].text #pass the second element to date 

        eng_date = dutch_to_eng(date) #change dutch date to eng date

        changed_date = changedate(eng_date) #reformat date 
        
        #get reactions, including likes and other emojis
        temp_reaction_count = get_reaction()

        post.append(post_id) 
        text.append(post_text)
        post_username.append(post_user)
        comment_count.append(temp_comment_count)
        reaction_count.append(temp_reaction_count)
        post_date.append(changed_date)

        print(f"Processed post: {i}")
        

    except NoSuchElementException:
        pass

Processed post: 1
Processed post: 2
Processed post: 3
Processed post: 4
Processed post: 5
Processed post: 6
Processed post: 7
Processed post: 8
Processed post: 9
Processed post: 10
Processed post: 11
Processed post: 12
Processed post: 13
Processed post: 14
Processed post: 15
Processed post: 16
Processed post: 17
Processed post: 18
Processed post: 19
Processed post: 20
Processed post: 21
Processed post: 22
Processed post: 23
Processed post: 24
Processed post: 25
Processed post: 26
Processed post: 27
Processed post: 28
Processed post: 29
Processed post: 30
Processed post: 31
Processed post: 32
Processed post: 33
Processed post: 34
Processed post: 35
Processed post: 36
Processed post: 37
Processed post: 38
Processed post: 39


In [331]:
#Create data frame
all_post = pd.DataFrame(columns=['ID','Text','Comment Count','Post Author'])

all_post['ID'] = post
all_post['Text'] = text
all_post['Post Author'] = post_username
all_post['Comment Count'] = comment_count
all_post['Date'] = post_date
all_post['Reaction Count'] = reaction_count

all_post.tail(50)

Unnamed: 0,ID,Text,Comment Count,Post Author,Date,Reaction Count
0,s_17592212603205,Oppassen!\n\nAangeboden 2 oppassers in de buur...,0,Hidde Hovemann,22/10/2022,3
1,s_17592212602916,Aangeboden 2 oppassers in de buurt van Amsterd...,0,Mathilda Putten,22/10/2022,5
2,s_17592212602764,Hidde Hovemann is zojuist lid geworden van Da...,0,Hidde Hovemann,22/10/2022,0
3,s_17592212602755,Isabelle Van straaten is zojuist lid geworden...,0,Isabelle Van straaten,22/10/2022,0
4,s_17592212465315,Zijn er schoonmaakwerkzaamheden waar je niet a...,0,Sofi Sovero,15/10/2022,0
5,s_17592212454729,Douvle Snak NC is zojuist lid geworden van Da...,0,Douvle Snak NC,14/10/2022,0
6,s_17592212390716,Onze kat Jopie is vermist sinds 4 oktober. We ...,1,Marieke Voeten,11/10/2022,6
7,s_17592212390557,Hallo allemaal. 👋 Ik ben Marieke. Ik woon in d...,9,Marieke Voeten,11/10/2022,0
8,s_17592212424616,"Ik ben op zoek naar behang, liefst een lichte ...",1,Désirée Venghaus,13/10/2022,0
9,s_17592212409493,"Ik ben Sofi,\nWij bieden u onze schoonmaakser...",0,Sofi Sovero,12/10/2022,1


In [320]:
#Specific function for actual comment as it requires a set of different data structure 

comment = []
comment_author = []
comment_post_id = []
comment_IDs = []

for i in range(len(post_grid)):
    try:
        post_id = post_grid[i].get_attribute("id") #get post id

        temp_comment_count = post_grid[i].find_element(By.CLASS_NAME, "comment-container").find_elements(By.CLASS_NAME, "Linkify") #return list of comment (if any)
        
        if temp_comment_count != 0:
            for comment_num in range(len(temp_comment_count)):
                comment_ID = post_grid[i].find_elements(By.CLASS_NAME, "js-media-comment")[comment_num].get_attribute("id")

                comment_text = post_grid[i].find_elements(By.CLASS_NAME, "js-media-comment")[comment_num].find_element(By.CLASS_NAME, "Linkify").text

                comment_user = post_grid[i].find_elements(By.CLASS_NAME, "js-media-comment")[comment_num].find_element(By.CLASS_NAME, "comment-detail-author-name").text

                comment.append(comment_text)
                comment_post_id.append(post_id)
                comment_author.append(comment_user)
                comment_IDs.append(comment_ID)
                
    except NoSuchElementException:
        pass

In [332]:
#Create comment texts dataframe
detailed_comment = pd.DataFrame(columns=["ID","Comment","Comment Author","Comment ID"])

detailed_comment['ID'] = comment_post_id
detailed_comment['Comment'] = comment
detailed_comment['Comment Author'] = comment_author
detailed_comment['Comment ID'] = comment_IDs

#Merge dataframe
all_post = all_post.merge(detailed_comment, on = "ID", how = "left")

all_post

Unnamed: 0,ID,Text,Comment Count,Post Author,Date,Reaction Count,Comment,Comment Author,Comment ID
0,s_17592212603205,Oppassen!\n\nAangeboden 2 oppassers in de buur...,0,Hidde Hovemann,22/10/2022,3,,,
1,s_17592212602916,Aangeboden 2 oppassers in de buurt van Amsterd...,0,Mathilda Putten,22/10/2022,5,,,
2,s_17592212602764,Hidde Hovemann is zojuist lid geworden van Da...,0,Hidde Hovemann,22/10/2022,0,,,
3,s_17592212602755,Isabelle Van straaten is zojuist lid geworden...,0,Isabelle Van straaten,22/10/2022,0,,,
4,s_17592212465315,Zijn er schoonmaakwerkzaamheden waar je niet a...,0,Sofi Sovero,15/10/2022,0,,,
5,s_17592212454729,Douvle Snak NC is zojuist lid geworden van Da...,0,Douvle Snak NC,14/10/2022,0,,,
6,s_17592212390716,Onze kat Jopie is vermist sinds 4 oktober. We ...,1,Marieke Voeten,11/10/2022,6,Enorm zielig. Wat n mooierdje.,Cathy Du Plessis,comment_17592250589962
7,s_17592212390557,Hallo allemaal. 👋 Ik ben Marieke. Ik woon in d...,9,Marieke Voeten,11/10/2022,0,"Hi Marieke, is dat de kat waar een foto van op...",Désirée Venghaus,comment_17592250515315
8,s_17592212390557,Hallo allemaal. 👋 Ik ben Marieke. Ik woon in d...,9,Marieke Voeten,11/10/2022,0,"Ja, dat klopt. Dat is Jopie. Heb je hem gezien?",Marieke Voeten,comment_17592250538862
9,s_17592212390557,Hallo allemaal. 👋 Ik ben Marieke. Ik woon in d...,9,Marieke Voeten,11/10/2022,0,Ik loop iedere ochtend rond 7 uur wat rondes ...,Désirée Venghaus,comment_17592250539728
