In [None]:
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.3.0-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.0-py3-none-any.whl (12 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl (17 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl (13 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.1.0-py3-none-any.whl (12 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.1.2-py3-non

In [None]:
import csv

import scrapy
from itemloaders.processors import MapCompose, TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader

class CommunityItem(Item):
    """One scraped post together with the community it was found in.

    Every field uses the same pair of processors:

    * input: ``MapCompose(str.strip)`` trims the layout whitespace (newlines,
      tabs, trailing space runs) that surrounds the text in the page markup;
    * output: ``TakeFirst()`` collapses the collected values to the first
      non-empty one, so each field ends up as a single string.
    """

    community_title = Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    post_title = Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    post_author = Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    post_date = Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    post_content = Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())

class CommunitySpider(scrapy.Spider):
    """Three-level crawl: community directory -> post listings -> post pages.

    ``parse`` walks the directory, ``parse_posts`` walks one community's
    listing, and ``parse_post_details`` turns a post page into a
    ``CommunityItem``. Titles found on the listing pages travel to the next
    callback through ``Request.meta``.
    """

    name = "community_spider"
    start_urls = ['https://cafedread.net/discover']  # directory page the crawl starts from

    def parse(self, response):
        """Schedule one listing request per community link, then the next directory page."""
        for anchor in response.xpath('//div[contains(@class, "media")]//h5/a'):
            title = anchor.xpath('.//text()').get()
            href = anchor.xpath('.//@href').get()
            yield Request(
                response.urljoin(href),
                callback=self.parse_posts,
                meta={'community_title': title},
            )

        # Directory pagination: stop when there is no rel="next" link.
        next_href = response.xpath('//li[@class="page-item"]/a[@rel="next"]/@href').get()
        if not next_href:
            return
        yield response.follow(next_href, callback=self.parse)

    def parse_posts(self, response):
        """Schedule one detail request per post link, then the next listing page."""
        community = response.meta['community_title']

        for anchor in response.xpath('//h5[contains(@class, "mb-2")]/a'):
            href = anchor.xpath('.//@href').get()
            title = anchor.xpath('.//text()').get()
            # The detail page is parsed separately; hand over what is already known.
            carried = {
                'community_title': community,
                'post_title': title,
            }
            yield Request(response.urljoin(href), callback=self.parse_post_details, meta=carried)

        # Listing pagination: keep the community title attached to the next page.
        next_href = response.xpath('//li[@class="page-item"]/a[@rel="next"]/@href').get()
        if not next_href:
            return
        yield response.follow(
            next_href,
            callback=self.parse_posts,
            meta={'community_title': community},
        )

    def parse_post_details(self, response):
        """Build a ``CommunityItem`` from the carried-over titles and the post page."""
        loader = ItemLoader(item=CommunityItem(), response=response)

        # Values collected on the listing pages.
        for field in ('community_title', 'post_title'):
            loader.add_value(field, response.meta[field])

        # Values read from the post page itself.
        page_fields = (
            ('post_author', '//div[contains(@class, "card-body")]//p/a/text()'),
            ('post_date', '//div[contains(@class, "card-body")]//span/@title'),
            ('post_content', 'string(//div[contains(@class, "post-body")]/p)'),
        )
        for field, xpath in page_fields:
            loader.add_xpath(field, xpath)

        yield loader.load_item()

class CsvPipeline:
    """Item pipeline that appends every scraped item as one row of a CSV file.

    The file is created (truncating any previous one) when the spider opens
    and closed when the spider closes. Column order is defined once in
    ``columns`` and shared by the header row and the data rows.
    """

    # Destination file, resolved against the current working directory.
    output_path = 'community_posts_details.csv'

    # (CSV header, item key) pairs in output column order.
    columns = (
        ('Community Title', 'community_title'),
        ('Post Title', 'post_title'),
        ('Post Author', 'post_author'),
        ('Date/Time', 'post_date'),
        ('Post Content', 'post_content'),
    )

    def open_spider(self, spider):
        """Open the output file and write the header row."""
        # newline='' leaves line-ending handling to the csv module; UTF-8 is set
        # explicitly so non-ASCII post text does not depend on the platform locale.
        self.file = open(self.output_path, 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)
        self.writer.writerow([header for header, _ in self.columns])

    def close_spider(self, spider):
        """Flush and close the output file."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as one row and pass it on unchanged.

        A field that was never populated (e.g. its XPath matched nothing) is
        written as an empty cell instead of raising ``KeyError``, so a partly
        scraped post is still recorded.
        """
        self.writer.writerow([item.get(key, '') for _, key in self.columns])
        return item

if __name__ == "__main__":
    # Run the spider in-process with the CSV pipeline enabled.
    # NOTE(review): the Twisted reactor behind process.start() is not expected to be
    # restartable, so re-running this cell likely needs a fresh kernel — confirm.
    process = CrawlerProcess(settings={
        'ITEM_PIPELINES': {'__main__.CsvPipeline': 1},
        # INFO instead of the default DEBUG: DEBUG logs every crawled URL and every
        # scraped item, which floods the notebook output.
        'LOG_LEVEL': 'INFO',
    })
    process.crawl(CommunitySpider)
    process.start()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2024-06-02 18:51:28 [scrapy.core.scraper] DEBUG: Scraped from <200 https://cafedread.net/post/8f4f383b1d3f962e4796>
{'community_title': '/d/DNMSourcing',
 'post_author': '/u/drugman234',
 'post_content': '\n'
                 '\t\t\t\t\t\t\tSaw this guy on Super Market, though he '
                 "doesn't have much sale but got some good prices. He claims "
                 'to be active on other markets, so I was wondering if someone '
                 'in here can vouch for him that he is '
                 'legit?                        ',
 'post_date': '2024-01-26 08:44:00',
 'post_title': 'can anyone vouch for vendor '
               'venspyrou?                                    '}
DEBUG:scrapy.core.engine:Crawled (200) <GET https://cafedread.net/post/e881e2c5f3f1ff1263fd> (referer: https://cafedread.net/d/DNMSourcing?page=104)
2024-06-02 18:51:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://cafedread.ne

In [None]:
import pandas as pd

# Load the CSV written by the crawl into a DataFrame for inspection.
df = pd.read_csv('community_posts_details.csv')

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
2024-06-02 18:57:38 [numexpr.utils] INFO: NumExpr defaulting to 2 threads.


In [None]:
# Write the loaded frame back to the CSV without pandas' index column.
df.to_csv('community_posts_details.csv', index=False)

In [None]:
# Peek at the last rows of the scraped data.
df.tail()

Unnamed: 0,Community Title,Post Title,Post Author,Date/Time,Post Content
22977,/d/DarkNetMarkets,DrLysergic report thread ...,/u/HugBunter,"22nd March, 2019 12:50","\n\t\t\t\t\t\t\tHi,\n\nThere have been a tonne..."
22978,/d/DarkNetMarkets,Incognito 100% Exit Scam - AVOID NOW ...,/u/HugBunter,2024-03-05 12:10:00,"\n\t\t\t\t\t\t\tUnfortunately, I can 100% conf..."
22979,/d/DarkNetMarkets,Nemesis Market Got Seized ...,/u/Francis_Nemesis,"20th March, 2024 21:24",\n\t\t\t\t\t\t\t[removed]
22980,/d/DarkNetMarkets,Is Pelican actually LEO or just being investia...,/u/PitViper,"27th August, 2018 14:37",\n\t\t\t\t\t\t\tIt seems no one is aware he is...
22981,/d/DarkNetMarkets,Looking for a trusted market place to vend on ...,/u/Preview-Of-Freedom,"12th August, 2019 07:18",\n\t\t\t\t\t\t\tsince my return from the wsm f...


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(22982, 5)