In [1]:
#!Python3
# Credit to Rumperuu - https://github.com/Rumperuu/

import scrapy, os
from enum import Enum
from scrapy.crawler import CrawlerProcess

In [2]:
# Sections of a transcript page, in the order they appear (values 1-4).
Stage = Enum('Stage', 'preamble execs analysts body')
# Month abbreviations used to find where the date starts in a combined
# "title + date" line. All twelve months are listed so that calls held in
# October-December are recognised as well.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Module-level dictionary; nothing in this cell writes to it.
transcripts = {}

class AlphaSpider(scrapy.Spider):
    """Crawl the Seeking Alpha earnings-call index and scrape every transcript.

    ``parse`` walks the paginated index; ``parse_transcript`` turns one
    transcript page into a single ``{"entry": {...}}`` item.
    """

    name = "transcripts"
    start_urls = ["http://seekingalpha.com/earnings/earnings-call-transcripts/1"]

    def parse(self, response):
        """Yield one request per transcript link, then one for the next index page."""
        # Follow each transcript page link from the index page
        for href in response.css('.dashboard-article-link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript)

        # Follows pagination links at the bottom of index page
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_transcript(self, response):
        """Scrape one transcript page.

        Yields a single dict ``{"entry": details}`` where ``details`` holds
        whichever of 'company', 'exchange', 'ticker', 'title' and 'date' were
        found, plus 'exec' (list of (name, position, company) tuples),
        'analysts' (list of (name, company) tuples) and 'transcript' (the
        body as a string of <p>/<h1> elements).
        """
        details = {}
        execs = []
        analysts = []
        script = []
        stage = Stage.preamble

        # Pages are represented by a series of <p> elements, all with the same
        # '.p1' class and no unique identifiers, so we have to break the page
        # into chunks and iterate over them.
        chunks = response.css('div#a-body p.p1')
        for i, chunk in enumerate(chunks):
            is_heading = bool(chunk.css('strong::text').extract())
            # A heading outside the transcript body (where headings represent
            # speakers) only marks the start of the next section.
            if is_heading and stage is not Stage.body:
                stage = Stage(stage.value + 1)
                continue

            if stage is Stage.preamble:
                self._parse_preamble(chunks, i, details)
            elif stage is Stage.execs:
                # Executives are stored with their position and the company
                # name taken from the preamble (None if it was never found).
                text = chunk.css('p::text').extract_first()
                if text is not None:
                    person, position = self._split_on_dash(text)
                    execs.append((person, position, details.get('company')))
            elif stage is Stage.analysts:
                text = chunk.css('p::text').extract_first()
                if text is not None:
                    analysts.append(self._split_on_dash(text))
            else:
                # Transcript body: keep only simple HTML. Paragraphs with
                # direct text become <p>; paragraphs whose text lives only in
                # <strong> (speaker names) become <h1>.
                line = chunk.css('p::text').extract_first()
                tag = "p"
                if line is None:
                    line = chunk.css('strong::text').extract_first()
                    tag = "h1"
                if line is not None:
                    script.append("<" + tag + ">" + line + "</" + tag + ">")

        # Adds the various arrays to the dictionary for the transcript
        details['exec'] = execs
        details['analysts'] = analysts
        details['transcript'] = ''.join(script)

        # Wrap the scraped details and yield them for the output
        yield {"entry": details}

    @staticmethod
    def _split_on_dash(text):
        """Split 'Name - Role' on a spaced hyphen or en dash; role is '' if absent."""
        for separator in (" - ", " – "):
            parts = text.split(separator)
            if len(parts) > 1:
                return parts[0], parts[1]
        return text, ""

    @staticmethod
    def _parse_preamble(chunks, i, details):
        """Fill ``details`` from preamble chunk ``i`` (company, title or date line)."""
        chunk = chunks[i]
        if i == 0:
            # Company line, e.g. "Company Name (EXCH:TICK)".
            company = chunk.css('p::text').extract_first() or ""
            details['company'] = company.split(' (')[0]
            # if a specific stock exchange is not listed, default to NYSE
            details['exchange'] = "NYSE"
            ticker = chunks.css('a::text').extract_first()
            details['ticker'] = ticker
            if ticker and ":" in ticker:
                parts = ticker.split(':')
                details['exchange'] = parts[0]
                details['ticker'] = parts[1]

            # If the very next chunk is already a heading, the whole preamble
            # sits in this one paragraph and the title and date share its
            # second text node.
            one_line = (len(chunks) > 1
                        and bool(chunks[1].css('strong::text').extract()))
            if one_line:
                texts = chunk.css('p::text').extract()
                if len(texts) > 1:
                    title_and_date = texts[1]
                    for month in months:
                        if month in title_and_date:
                            title, _, rest = title_and_date.partition(month)
                            details['title'] = title
                            details['date'] = month + rest
        # Otherwise, we're onto the title line.
        elif i == 1:
            details['title'] = chunk.css('p::text').extract_first() or "NO TITLE"
        # Or the date line.
        elif i == 2:
            details['date'] = chunk.css('p::text').extract_first()

# Create the crawler process; no settings are passed to the constructor.
process = CrawlerProcess()

# Starting the crawler with our spider.
process.crawl(AlphaSpider)
# Run the scheduled crawl; 'Done' is printed once start() returns.
process.start()
print('Done')

2020-01-06 19:00:27 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2020-01-06 19:00:27 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.4 (default, Aug 13 2019, 15:17:50) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.7, Platform Darwin-19.0.0-x86_64-i386-64bit
2020-01-06 19:00:27 [scrapy.crawler] INFO: Overridden settings: {}
2020-01-06 19:00:27 [scrapy.extensions.telnet] INFO: Telnet Password: 0dbd2865642ae611
2020-01-06 19:00:27 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-01-06 19:00:27 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.Do

Done


In [None]:
import pandas as pd
# Load 'PythonLinks.json' from the working directory into a DataFrame.
# NOTE(review): presumably a feed exported by an earlier crawl -- confirm.
df = pd.read_json('PythonLinks.json')

# Last expression in the cell, so the notebook displays the first rows.
df.head()

In [None]:
# Spider that logs in to Seeking Alpha and pulls earnings-call transcript pages
import scrapy, pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.http import FormRequest

class AlphaSpider(scrapy.Spider):
    """Log in to Seeking Alpha, then save and crawl the transcript index page.

    NOTE(review): the login field names, the CSRF selector and the link and
    attribute selectors are unverified against the live site -- confirm
    before relying on the output. The credentials are placeholders.
    """
    name = 'AS'
    allowed_domains = ["seekingalpha.com"]

    start_urls = [
        'https://seekingalpha.com/earnings/earnings-call-transcripts'
    ]

    BASE_URL = 'https://seekingalpha.com/'

    def parse(self, response):
        """Submit the login form found on the start page.

        Returns a ``FormRequest`` whose response is handled by
        ``scrape_pages``. ``FormRequest.from_response`` raises ``ValueError``
        when the page contains no ``<form>`` element.
        """
        token = response.xpath('//*[@name="csrf-token"]/@value').extract_first()

        form_data = {
            'password': 'foobar',
            'user_name': 'foobar',
        }
        # Only send the token when the page actually provides one.
        if token is not None:
            form_data['csrf_token'] = token

        return FormRequest.from_response(response,
                                         formdata=form_data,
                                         callback=self.scrape_pages)

    def scrape_pages(self, response):
        """Save the post-login page to disk and follow its listing links."""
        # Keep a copy of the raw page for offline inspection.
        with open('transcripts.html', 'wb') as file:
            file.write(response.body)

        # urljoin copes with both relative and absolute hrefs.
        for link in response.xpath('//a[@class="hdrlnk"]/@href').extract():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_attr)

    def parse_attr(self, response):
        """Return the page URL and the concatenated 'attrgroup' paragraph text."""
        # A plain dict is a valid Scrapy item and keeps this cell
        # independent of item classes defined in other cells.
        return {
            "link": response.url,
            "attr": "".join(response.xpath("//p[@class='attrgroup']//text()").extract()),
        }
    
# Create the crawler process; no settings are passed to the constructor.
process = CrawlerProcess()

# Starting the crawler with our spider.
process.crawl(AlphaSpider)
# Run the scheduled crawl; 'Done' is printed once start() returns.
process.start()
print('Done')


In [None]:
# -*- coding: utf-8 -*-
import scrapy


# item class included here 
class DmozItem(scrapy.Item):
    """Item holding a scraped page's URL and its attribute text."""
    # Fields populated by the spider's parse_attr callback:
    link = scrapy.Field()  # URL of the scraped page (response.url)
    attr = scrapy.Field()  # joined text of the page's 'attrgroup' paragraphs


class DmozSpider(scrapy.Spider):
    """Scrape attribute text from each listing on a Chicago Craigslist search page."""
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
    "http://chicago.craigslist.org/search/emd?"
    ]

    BASE_URL = 'http://chicago.craigslist.org/'

    def parse(self, response):
        """Yield one request per listing link found on the search results page."""
        from urllib.parse import urljoin

        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            # urljoin resolves relative hrefs against BASE_URL and leaves
            # absolute hrefs untouched; plain concatenation produced a
            # double slash or a broken URL in those cases.
            absolute_url = urljoin(self.BASE_URL, link)
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        """Return a DmozItem with the listing URL and its 'attrgroup' text."""
        item = DmozItem()
        item["link"] = response.url
        item["attr"] = "".join(response.xpath("//p[@class='attrgroup']//text()").extract())
        return item
    
    