# Scraping NextDoor NL

This notebook scrapes posts and their comments from the Nextdoor (NL) news feed and saves them to a CSV file.

In [3]:
import pandas as pd
import requests
import re
import itertools
import selenium
import datetime #for general date manipulation
import dateutil #for calculating year differences 

# Scraping on NextDoor

The feed cannot be scraped with scrapy or bs4, so Selenium is used to drive a real browser and collect the data.

In [1]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from pandas import DataFrame
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dateutil.relativedelta import relativedelta #for calculating year differences 

#initiate chrome driver
chrome_service = Service(ChromeDriverManager().install()) #install webdriver
driver = webdriver.Chrome(service=chrome_service)

#open nextdoor on chrome driver
driver.get("https://nextdoor.nl/news_feed/")

#Log in manually in the opened browser window and navigate to the neighbourhood page before running the cells below

In [4]:
#Helper for parsing dates that may belong to the previous year
def same_year_check(date):
    """Return the number of whole days between ``date`` and the current moment.

    Args:
        date: a ``datetime.datetime`` to compare against now.

    Returns:
        int: ``(now - date).days``. This is 0 when ``date`` is less than 24
        hours in the past, positive when it is older, and negative when
        ``date`` lies in the future.
    """
    elapsed = datetime.datetime.today() - date
    return elapsed.days

In [5]:
#Format weird date to normal date
def format_weird_date(weird_date):
    """Turn a relative Dutch timestamp into an absolute ``dd/mm/YYYY`` string.

    Args:
        weird_date: relative timestamp text, e.g. "3 uur geleden" or
            "2 dagen geleden".

    Returns:
        str: today's date when the text mentions "minuten" or "uur";
        otherwise today's date minus the number found in the text, which is
        always interpreted as a number of days (whatever unit word follows it).

    Raises:
        ValueError: if the text mentions neither "minuten" nor "uur" and what
            remains after removing ASCII letters is not a valid integer.
    """
    now = datetime.datetime.today()

    # Minutes or hours ago: the post was made today.
    for unit in ('minuten', 'uur'):
        if unit in weird_date:
            return now.strftime("%d/%m/%Y")

    # Strip every ASCII letter and surrounding whitespace; what is left is the day count.
    digits_only = re.sub(r"[A-Za-z]", "", weird_date).strip()
    posted_on = now - datetime.timedelta(days=int(digits_only))

    return posted_on.strftime("%d/%m/%Y")

In [6]:
#change Dutch spelling into english
def replace_multiple(text, replace):
    """Apply several substring replacements to ``text``, one after another.

    Args:
        text: the string to rewrite.
        replace: mapping of ``original -> substitute``. Replacements are
            applied in the mapping's iteration order, each one operating on
            the result of the previous one.

    Returns:
        str: the text after all replacements.
    """
    result = text
    for needle in replace:
        result = result.replace(needle, replace[needle])
    return result

def dutch_to_eng(date):
    """Translate a Dutch Nextdoor timestamp into a form the date parser accepts.

    Relative timestamps (anything matching ``.*geleden``) are handed to
    ``format_weird_date``. For every other input the Dutch month abbreviations
    that differ from English are replaced and the " bewerkt" suffix is removed.

    The result is returned only; no module-level variable is written, so
    calling this helper cannot overwrite a caller's own ``eng_date``.

    Args:
        date: timestamp text as shown on the post byline.

    Returns:
        str: the rewritten timestamp.
    """
    # dict for replacement (ONLY DUTCH NOW).
    # 'mei' gains a trailing period, presumably so it matches the '%d %b.'
    # pattern used when the date is parsed later -- verify against changedate.
    replace_words = {'mrt':'mar','mei':'may.','okt':'oct', ' bewerkt':''}

    if re.match(r".*geleden", date):
        # Non-standard, relative format ("x dagen geleden").
        return format_weird_date(date)

    # Regular format: a plain word replacement is enough.
    return replace_multiple(date, replace_words)

In [7]:
#parse date and time of post 
def changedate(date):
    """Normalise a post date to a ``dd/mm/YYYY`` string.

    Three input shapes are handled:

    * ``"%d %b."`` (day and abbreviated month, no year): resolved to the most
      recent occurrence of that day that is not in the future. A post dated
      today therefore stays in the current year, and "29 feb." resolves to the
      latest leap year instead of failing to parse.
    * ``"%d %b. %y"`` (two-digit year): reformatted directly.
    * anything else (e.g. a string that is already ``dd/mm/YYYY``): returned
      unchanged.

    NOTE(review): ``%b`` matches month abbreviations of the active locale;
    this assumes an English/C locale -- confirm for the runtime environment.

    Args:
        date: the date text, or a non-string value.

    Returns:
        str | None: the normalised date, the input unchanged when no pattern
        matches, or ``None`` when ``date`` is not a string.
    """
    if not isinstance(date, str):
        # strptime would raise TypeError here; callers get None as before.
        return None

    today = datetime.date.today()

    try:
        # Parse against a leap year so that 29 February is accepted; only the
        # month and day of the result are used.
        month_day = datetime.datetime.strptime(f"{date} 2000", "%d %b. %Y")
    except ValueError:
        month_day = None

    if month_day is not None:
        # Walk back from the current year. Nine years is enough to always hit
        # a year in which the day exists (covers the 8-year leap-year gap
        # around non-leap century years).
        for year in range(today.year, today.year - 9, -1):
            try:
                candidate = datetime.date(year, month_day.month, month_day.day)
            except ValueError:
                continue  # 29 February in a non-leap year
            if candidate <= today:
                return candidate.strftime("%d/%m/%Y")

    try:
        # Alternative format that carries a two-digit year.
        return datetime.datetime.strptime(date, '%d %b. %y').strftime('%d/%m/%Y')
    except ValueError:
        return date  # already formatted (or unrecognised): pass through

In [8]:
#get oldest date for scrolling
import time

def find_min_date(target_date):
    """Report how far the oldest currently loaded post is from ``target_date``.

    Reads every "post-byline-redesign" element from the module-level
    ``driver``, treats the elements at odd positions as dates, normalises them
    with ``dutch_to_eng`` and ``changedate`` and compares the oldest one with
    the target. The difference is also printed.

    Args:
        target_date: target date as a ``%d/%m/%Y`` string.

    Returns:
        datetime.timedelta: oldest loaded date minus target date.

    Raises:
        ValueError: if no date could be collected, or a byline or
            ``target_date`` is not in ``%d/%m/%Y`` form after normalisation.
    """
    bylines = driver.find_elements(By.CLASS_NAME, "post-byline-redesign") #by-lines hold neighbourhood name and date
    target = datetime.datetime.strptime(target_date, "%d/%m/%Y")

    #every second element, starting at index 1, is taken as a date
    date_strings = [changedate(dutch_to_eng(byline.text)) for byline in bylines[1::2]]

    #datetime objects are needed so that min() compares chronologically
    parsed_dates = [datetime.datetime.strptime(entry, "%d/%m/%Y") for entry in date_strings]
    oldest = min(parsed_dates)

    days_diff = oldest - target
    print(f"Days from target date: {days_diff}") #print how far we are from the target date

    return days_diff

#scroll by date
def scroll_by_date(target_date, pause_seconds=7, max_scrolls=None): #user input has to be %d/%m/%Y
    """Scroll the feed until the oldest loaded post is on or before ``target_date``.

    After each scroll the function waits ``pause_seconds`` and re-evaluates the
    oldest loaded date via ``find_min_date``.

    Args:
        target_date: target date as a ``%d/%m/%Y`` string.
        pause_seconds: seconds to wait after each scroll for the page to load
            (default 7, the previously hard-coded value).
        max_scrolls: optional upper bound on the number of scrolls. ``None``
            (default) keeps scrolling until the target is reached; note that
            this never terminates if the feed stops loading older posts.

    Returns:
        None
    """
    days_diff = find_min_date(target_date) #find date difference
    scrolls_done = 0

    while days_diff.days > 0: #oldest post is still newer than the target (diff > 0): keep scrolling
        if max_scrolls is not None and scrolls_done >= max_scrolls:
            print(f"Stopped after {scrolls_done} scrolls without reaching the target date")
            break

        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)") #scroll to the bottom of the page
        time.sleep(pause_seconds) #wait for the page to load
        scrolls_done += 1

        days_diff = find_min_date(target_date) #find again the oldest date in the updated list of dates

In [9]:
#get reaction
def get_reaction(post=None):
    """Return the reaction count shown on a post.

    Args:
        post: the post element to inspect. When omitted, the function falls
            back to the module-level ``post_grid[i]`` so existing calls
            without arguments keep working.

    Returns:
        The text of the "_3n6Fnd22" element (a string), or the integer ``0``
        when the post has no such element.
    """
    if post is None:
        post = post_grid[i] #legacy behaviour: rely on the scraping loop's globals

    try:
        return post.find_element(By.CLASS_NAME, "_3n6Fnd22").text
    except NoSuchElementException: #no reaction element on this post
        return 0
        

In [10]:
#Scroll the feed until the oldest loaded post is on or before the target date (format: %d/%m/%Y)
scroll_by_date("01/01/2023") 

Days from target date: -4 days, 0:00:00


In [46]:
#get links from result lists
post_grid = driver.find_elements(By.CLASS_NAME, "css-n4lbsh") #pass list of individual posts to post_grid

#Parallel lists: entry k of every list describes the same post.
post = []
text = []
neighbourhood_name = []
post_username = []
comment_count = []
post_date = []
reaction_count = []

#`i` is kept as the loop index because get_reaction() reads post_grid[i] from the globals
for i, post_element in enumerate(post_grid):
    print(f"Current post: {i}") #Tell us which post the script is processing :)

    #Click the "more comments" and "more content" buttons.
    #Each button is looked up on its own so a missing one does not prevent the other from being clicked.
    clicked_any = False
    for container_class in ("css-mn71l5", "css-zglld0"): #more-comments container, expand-text container
        try:
            button = post_element.find_element(By.CLASS_NAME, container_class).find_element(By.CSS_SELECTOR, 'button')
        except NoSuchElementException:
            continue
        driver.execute_script("arguments[0].click();", button) #use javascript to click the button
        clicked_any = True

    if clicked_any:
        time.sleep(1) #wait one second for the expanded content to load in

    try:
        #get post comment counts
        temp_comment_count = post_element.find_element(By.CLASS_NAME, "css-13yklgh").find_elements(By.CLASS_NAME, "css-1qica3u") #return list of comment (if any)

        post_id = post_element.get_attribute("id") #get the post id

        #get post user
        post_user = post_element.find_element(By.CLASS_NAME, "E7NPJ3WK").text

        #get post text
        post_text = post_element.find_element(By.CLASS_NAME, "content-body").text

        #get post date and neighbourhood name
        byline = post_element.find_elements(By.CLASS_NAME, "post-byline-redesign") #getting two elements from byline
        neighbourhood = byline[0].text #the first element is the neighbourhood name
        date = byline[1].text #the second element is the date

        eng_date = dutch_to_eng(date) #change dutch date to eng date
        changed_date = changedate(eng_date) #reformat date

        #get reactions, including likes and other emojis
        temp_reaction_count = get_reaction()

    except NoSuchElementException: #a required element is missing: skip the whole post
        print(f"Skipping post {i}: a required element was not found")
        continue

    #Append only after every field was extracted, so that all lists keep the same length.
    post.append(post_id)
    text.append(post_text)
    neighbourhood_name.append(neighbourhood)
    post_username.append(post_user)
    comment_count.append(len(temp_comment_count))
    reaction_count.append(temp_reaction_count)
    post_date.append(changed_date)

Current post: 0
Current post: 1
Current post: 2
Current post: 3
Current post: 4
Current post: 5
Current post: 6
Current post: 7
Current post: 8
Current post: 9
Current post: 10
Current post: 11
Current post: 12
Current post: 13
Current post: 14
Current post: 15
Current post: 16
Current post: 17
Current post: 18
Current post: 19
Current post: 20
Current post: 21
Current post: 22
Current post: 23
Current post: 24
Current post: 25
Current post: 26
Current post: 27
Current post: 28
Current post: 29
Current post: 30
Current post: 31
Current post: 32
Current post: 33
Current post: 34
Current post: 35
Current post: 36
Current post: 37
Current post: 38
Current post: 39
Current post: 40
Current post: 41
Current post: 42
Current post: 43
Current post: 44
Current post: 45
Current post: 46
Current post: 47
Current post: 48
Current post: 49
Current post: 50
Current post: 51
Current post: 52
Current post: 53
Current post: 54
Current post: 55
Current post: 56
Current post: 57
Current post: 58
Current

In [47]:
#Create data frame
#One row per scraped post, built from the parallel lists filled by the scraping loop.
all_post = pd.DataFrame({
    'ID': post,
    'Text': text,
    'Comment Count': comment_count,
    'Post Author': post_username,
    'Neighbourhood': neighbourhood_name,
    'Date': post_date,
    'Reaction Count': reaction_count,
})

all_post

Unnamed: 0,ID,Text,Comment Count,Post Author,Neighbourhood,Date,Reaction Count
0,s_17592213863608,Deze week komt de Amsterdam krant uit en wij s...,0,Lotty Liefting,Indische Buurt Zuid West,27/12/2022,3
1,s_17592214199382,Hello neighbours! \n\nCurrently I'm following ...,15,Chris M.,Czaar Peter,11/01/2023,9
2,s_17592214201335,"Beste buren en Friends,\n\nIk ben Eerste monte...",5,Mojtaba Gholami,Czaar Peter,12/01/2023,3
3,s_17592214203869,"Hoi, ik ben op zoek naar een lieve oppas in de...",3,Esther Polak,Linnaeusbuurt,11/01/2023,2
4,s_17592213910952,Fijne feestdagen allemaal! Is er iets in de bu...,0,Simona Lettieri,Dapperbuurt,30/12/2022,0
...,...,...,...,...,...,...,...
116,s_17592213496654,Kerst & vintage liefhebbers opgelet! \n\nNa ge...,4,Maria Eijnde,Kadijken,06/12/2022,17
117,s_17592213544458,Deze winkel bestaat al lang niet meer.,1,Cees Walraven,Kadijken,09/12/2022,1
118,s_17592201720673,Weet iemand hoe je een cv ketel bijvult? Bij v...,9,Catrien Meijering,Transvaalbuurt,10/07/2021,2
119,s_17592213566858,Ik kan je eenvoudige voedzame maaltijden aanle...,0,Katrien van de Laar,Kadijken,10/12/2022,2


In [48]:
#Comments are collected separately because they need a different data structure (one row per comment)

#Parallel lists: entry k of every list describes the same comment.
comment = []
comment_author = []
comment_post_id = []
comment_IDs = []
comment_neighbourhoods = []

for post_element in post_grid:
    try:
        post_id = post_element.get_attribute("id") #get post id
        comment_markers = post_element.find_element(By.CLASS_NAME, "css-13yklgh").find_elements(By.CLASS_NAME, "_2kP4d1Rw") #return list of comment (if any)
    except NoSuchElementException: #post has no comment section: nothing to collect
        continue

    if not comment_markers:
        continue

    #Expand truncated comment texts once per post.
    #find_elements returns an empty list when nothing matches, so no exception handling is needed here.
    expand_links = post_element.find_elements(By.LINK_TEXT, "Meer bekijken")
    for link in expand_links:
        driver.execute_script("arguments[0].click();", link) #use javascript to click the link
    if expand_links:
        time.sleep(1) #wait one second for the expanded text to load in

    #Query the comment elements once, after expanding, and never read more of them than were counted above.
    comment_elements = post_element.find_elements(By.CLASS_NAME, "js-media-comment")[:len(comment_markers)]

    for comment_element in comment_elements:
        try:
            comment_ID = comment_element.get_attribute("id") #get comment ID
            comment_text = comment_element.find_element(By.CLASS_NAME, "Linkify").text #get comment main text
            comment_user = comment_element.find_element(By.CLASS_NAME, "comment-detail-author-name").text #get comment's username
            comment_neighbourhood = comment_element.find_element(By.CLASS_NAME, "css-19hrmqv").text #get comment's author's located neighbourhood
        except NoSuchElementException:
            #Skip only this comment; the remaining comments of the post are still collected.
            continue

        #Append only after every field was extracted, so that all lists keep the same length.
        comment.append(comment_text)
        comment_post_id.append(post_id)
        comment_author.append(comment_user)
        comment_IDs.append(comment_ID)
        comment_neighbourhoods.append(comment_neighbourhood)

In [49]:
#Create comment texts dataframe
#One row per scraped comment; "ID" holds the id of the post the comment belongs to.
detailed_comment = pd.DataFrame({
    "ID": comment_post_id,
    "Comment": comment,
    "Comment Author": comment_author,
    "Comment Author Neighbourhood": comment_neighbourhoods,
    "Comment ID": comment_IDs,
})

#Merge dataframe
#Left join on the post id: posts without comments are kept, posts with several comments are repeated.
#NOTE: all_post is overwritten, so re-running this cell merges the comment columns a second time.
all_post = all_post.merge(detailed_comment, how = "left", on = "ID")

all_post #display the dataframe

Unnamed: 0,ID,Text,Comment Count,Post Author,Neighbourhood,Date,Reaction Count,Comment,Comment Author,Comment Author Neighbourhood,Comment ID
0,s_17592213863608,Deze week komt de Amsterdam krant uit en wij s...,0,Lotty Liefting,Indische Buurt Zuid West,27/12/2022,3,,,,
1,s_17592214199382,Hello neighbours! \n\nCurrently I'm following ...,15,Chris M.,Czaar Peter,11/01/2023,9,"Hoi Chris,\nGeïnteresseerd \nGroetjes Marein",Marein marein1968@gmail.com,KNSM-Eiland,comment_17592255145484
2,s_17592214199382,Hello neighbours! \n\nCurrently I'm following ...,15,Chris M.,Czaar Peter,11/01/2023,9,Hierboven is mijn emailadres,Marein marein1968@gmail.com,KNSM-Eiland,comment_17592255145503
3,s_17592214199382,Hello neighbours! \n\nCurrently I'm following ...,15,Chris M.,Czaar Peter,11/01/2023,9,"hi chris, i would be happy to help - in conjun...",Kayce Maisel,Indische Buurt West,comment_17592255148328
4,s_17592214199382,Hello neighbours! \n\nCurrently I'm following ...,15,Chris M.,Czaar Peter,11/01/2023,9,Als je ook ouder hoofd nodig hebt...,Hanneke Stasse,Kadijken,comment_17592255158481
...,...,...,...,...,...,...,...,...,...,...,...
496,s_17592213445598,Weet iemand hoe het vuurwerkverbod dit jaar ge...,23,Roelien Reinders,Transvaalbuurt,03/12/2022,3,maar een mening kan ook enorm kwetsen en dat ...,Godelieve Van Ewijk,Nieuwe Pijp,comment_17592253550084
497,s_17592213445598,Weet iemand hoe het vuurwerkverbod dit jaar ge...,23,Roelien Reinders,Transvaalbuurt,03/12/2022,3,Handhaven zal moeilijk worden wegens te weini...,Palma van Westrhenen,Betondorp,comment_17592253319671
498,s_17592213445598,Weet iemand hoe het vuurwerkverbod dit jaar ge...,23,Roelien Reinders,Transvaalbuurt,03/12/2022,3,voor illegaal vuurwerk graag! Vorig jaar tril...,Godelieve Van Ewijk,Nieuwe Pijp,comment_17592253550134
499,s_17592213445598,Weet iemand hoe het vuurwerkverbod dit jaar ge...,23,Roelien Reinders,Transvaalbuurt,03/12/2022,3,"Er is een algeheel afsteekverbod op vuurwerk, ...",Bertie Kaal,Grachtengordel-West,comment_17592253353773


In [50]:
#Save df
#Write the merged posts/comments table to a CSV file in the working directory.
all_post.to_csv("all_post_ver2.csv")