goCommonCrawl

goCommonCrawl extracts web data from the Common Crawl web archive, which is stored on Amazon S3, using the Common Crawl index API server.
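
Under the hood, this amounts to querying the public Common Crawl index server and then downloading the matching records from S3. As a rough illustration of the kind of index query involved, here is a minimal standalone sketch using only the Go standard library (the exact parameters the package sends may differ):

package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	// Ask the CC-MAIN-2019-22 index which captures it has for example.com/
	endpoint := "https://index.commoncrawl.org/CC-MAIN-2019-22-index" +
		"?output=json&url=" + url.QueryEscape("example.com/")

	resp, err := http.Get(endpoint)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The server answers with newline-delimited JSON, one record per capture,
	// including the WARC file name and byte offset needed to fetch the page from S3
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}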

Installation

go get -u github.com/karust/goCommonCrawl

Usage

If you need to fetch data for a single URL without concurrency:

package main

import (
	"log"

	cc "github.com/karust/gocommoncrawl"
)

func main() {
	// Get information about the `example.com/` URL from the `CC-MAIN-2019-22` archive;
	// the last argument is the request timeout in seconds
	pages, err := cc.GetPagesInfo("CC-MAIN-2019-22", "example.com/", 45)
	if err != nil {
		log.Fatalln(err)
	}

	// Parse the retrieved page records and save the files in `./data`
	cc.SaveContent(pages, "./data", 45)
}
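
GetPagesInfo returns a slice of index records, so you can trim or filter it yourself before calling SaveContent. A small sketch that keeps only the first few captures (the cap of 10 is arbitrary):

	// Keep at most the first 10 index records before saving
	if len(pages) > 10 {
		pages = pages[:10]
	}
	cc.SaveContent(pages, "./data", 45)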

The concurrent way to do the same:

package main

import (
	"fmt"

	cc "github.com/karust/gocommoncrawl"
)

func main() {
	// Create a channel for results
	resChan := make(chan cc.Result)

	// Some URLs to fetch pages from
	sites := []string{"medium.com/", "example.com/", "tutorialspoint.com/"}

	// Make a save folder and start a goroutine for each URL
	for _, url := range sites {
		// Configure the request
		commonConfig := cc.Config{
			ResultChan: resChan,
			Timeout:    30,
			// Version of the archive
			CrawlDB: "CC-MAIN-2019-22",
			// Wait time between AWS S3 downloads, in milliseconds
			WaitMS: 53,
			// File extensions to save
			Extensions: []string{".html", ".pdf", ".doc", ".txt"},
			// Maximum number of files to save
			MaxAmount: 20,
		}

		saveFolder := "./data/" + cc.EscapeURL(url)
		go cc.FetchURLData(url, saveFolder, commonConfig)
	}

	// Listen for results from the goroutines
	for r := range resChan {
		if r.Error != nil {
			fmt.Printf("Error occurred: %v\n", r.Error)
		} else if r.Progress > 0 {
			fmt.Printf("Progress %v: %v/%v\n", r.URL, r.Progress, r.Total)
		}
	}
}

As a result, you should get one folder per URL containing the saved files (mostly HTML pages and robots.txt files).
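
The listening loop above runs until resChan is closed; if the library leaves the channel open after all downloads finish, the program will block indefinitely. One way to bail out, sketched here with an arbitrary 60-second inactivity timeout (add "time" to the imports):

	for {
		select {
		case r := <-resChan:
			if r.Error != nil {
				fmt.Printf("Error occurred: %v\n", r.Error)
			} else if r.Progress > 0 {
				fmt.Printf("Progress %v: %v/%v\n", r.URL, r.Progress, r.Total)
			}
		case <-time.After(60 * time.Second):
			fmt.Println("No results for 60 seconds, assuming all downloads have finished")
			return
		}
	}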
