In [1]:
import os
import re
import json
import time
import hashlib
import random
import bs4
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import shutil

In [2]:
# Maps each profile field to the regex that pulls it out of the visible text of
# a profile page.  Single-line fields capture the rest of the line after their
# label; 'description', 'messages' and 'justifications' capture everything
# (newlines included) up to the label that opens the following section.
# Raw strings are used so that \w and \W reach the regex engine untouched
# instead of being treated as (invalid) string-literal escapes.
extractors = {
    'username': re.compile(r'username:([^\n]+)'),
    'name': re.compile(r'\nname:([^\n]+)'),
    'age': re.compile(r'\nage:([^\n]+)'),
    'location': re.compile(r'\nlocation :([^\n]+)'),
    'ethnicity': re.compile(r'\nethnicity :([^\n]+)'),
    'occupation': re.compile(r'\noccupation:([^\n]+)'),
    'status': re.compile(r'\nmarital status:([^\n]+)'),
    'phone': re.compile(r'\ntel:([^\n]+)'),
    'inet': re.compile(r'\nIP address:([^\n]+)'),
    'email': re.compile(r'\nemail:([^\n]+)'),
    'description': re.compile(r'\ndescription: ([\n\w\W]+)\nmessage'),
    'messages': re.compile(r'\nmessage: ([\n\w\W]+)\nWHY IS'),
    'justifications': re.compile(r'\nWHY IS IT A SCAM / FAKE([\n\w\W]+)\W This post'),
}

In [3]:
def scrape(startyear, startmonth, endyear, endmonth):
    """Download every profile archived between two calendar months.

    Walks the archive one month at a time and hands each month to
    ``gather_all_profiles`` as a pair of strings: the year as-is and the
    month zero-padded to two digits.

    Args:
        startyear: Year of the first month to scrape (int).
        startmonth: First month to scrape, 1-12 (int, inclusive).
        endyear: Year of the month at which to stop (int).
        endmonth: Month at which to stop, 1-12 (int, EXCLUSIVE: the walk
            ends as soon as this month is reached, so it is not downloaded).
            To include December of year Y pass ``(Y + 1, 1)`` as the end.

    Returns:
        None.
    """
    year, month = startyear, startmonth
    # Ordered comparison rather than an equality test: a start that lies at or
    # after the end (or an out-of-range month) ends the walk immediately
    # instead of stepping past the end point and looping forever.
    while (year, month) < (endyear, endmonth):
        gather_all_profiles("{}".format(year), "{:02d}".format(month))
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1

def gather_all_profiles(year, month):
    """Index one month of the archive, then download every profile found.

    Index pages are requested in order (page 1, 2, ...) and the profile URLs
    on each are collected via ``enumerate_profiles``.  Indexing stops at the
    first page that fails to load or parse.  Each collected URL is then
    fetched and passed to ``scrape_profile``; a failure on one profile is
    printed and does not stop the others.

    Args:
        year: Year as a string, used verbatim in the URL and in log lines.
        month: Month as a string (callers pass it zero-padded), used the same way.

    Returns:
        None.  Progress is reported on stdout.
    """
    page = 1
    urls = []

    print("{}-{} : Begin indexing.".format(year, month))

    while page > 0:
        urlstring = "http://scamdigger.com/{}/{}/page/{}".format(year, month, page)
        try:
            # The context manager closes the connection once the page is parsed.
            with urlopen(urlstring) as inhandle:
                urls.extend(enumerate_profiles(inhandle))
            page += 1
        except Exception:
            # Any error ends indexing for this month -- presumably the site
            # answers with an HTTP error once `page` runs past the last index
            # page.  `Exception` (not a bare except) so that Ctrl-C /
            # KeyboardInterrupt still stops a long scrape.
            page = 0

    print("{}-{} : {} profiles".format(year,month,len(urls)))

    for url in urls:
        try:
            with urlopen(url) as urlhandle:
                scrape_profile(urlhandle, year, month)
        except Exception as e:
            # Best effort: report the failing profile and move on to the next.
            print("Exception when handling {}".format(url))
            print(e)

    print("{}-{} : complete.".format(year,month))
    

def enumerate_profiles(inhandle):
    """Return the profile links listed on one index page.

    Args:
        inhandle: Readable object whose ``read()`` yields the page HTML.

    Returns:
        list: the ``href`` of the first ``<a>`` inside every
        ``<div class="grid-thumb">``, in document order.  Thumbnails that
        contain no anchor, or whose anchor has no ``href``, are skipped.
    """
    soup = BeautifulSoup(inhandle.read(), 'html.parser')
    urllist = []
    for thumb in soup.find_all('div', {'class': 'grid-thumb'}):
        anchor = thumb.find('a')
        # Skip link-less thumbnails rather than raising: the caller treats any
        # exception while indexing as "no more pages", so a single malformed
        # thumbnail would otherwise cut the whole month short.
        if anchor is not None and anchor.get('href'):
            urllist.append(anchor['href'])
    return urllist

def scrape_profile(inhandle,year,month):
    """Parse one profile page and save it as ``scam<username>.json``.

    The visible text of the ``entry-content`` div is run through every regex
    in ``extractors``; the tag links of the ``entry-utility`` div become
    ``tags``; the profile's images are downloaded via ``save_img``.  The
    resulting dict is written as JSON into the current working directory.

    Args:
        inhandle: Readable object whose ``read()`` yields the page HTML.
        year: Year the profile was reported; stored as ``year_reported``.
        month: Month the profile was reported; stored as ``month_reported``.

    Returns:
        None.

    Raises:
        ValueError: if the page has no ``entry-content`` div, no
            ``entry-utility`` div, or no ``username:`` line.
    """
    soup = BeautifulSoup(inhandle.read(), 'html.parser')

    # Main page content.
    content = soup.find('div', {'class':'entry-content'})
    if content is None:
        raise ValueError("no 'entry-content' div found on profile page")

    # Known info from the URL comes first.
    profile = {'year_reported': year, 'month_reported': month}

    # Parse information from the visible text.
    text = content.get_text()
    for key, pattern in extractors.items():
        match = pattern.search(text)
        if not match:
            continue
        matchtext = match.group(1)
        # Multi-line sections are stored as a list of lines, everything else
        # as the raw captured string.  Both kinds must be stored: assigning
        # only in the single-line branch silently drops messages and
        # justifications from the output.
        if key in ('justifications', 'messages'):
            profile[key] = matchtext.split('\n')
        else:
            profile[key] = matchtext

    # Parse annotations.
    utility = soup.find('div', {'class':'entry-utility'})
    if utility is None:
        raise ValueError("no 'entry-utility' div found on profile page")
    profile['tags'] = [node.get_text() for node in utility.find_all('a', {'rel':'tag'})]
    profile['gender'] = 'female' if 'Female profiles' in profile['tags'] else 'male'

    if 'username' not in profile:
        raise ValueError("no 'username:' line found on profile page")
    # The username comes from scraped (untrusted) page text and is used to
    # build file names, so path separators are neutralised to keep every
    # output file inside the working directory.
    uid = re.sub(r'[\\/]', '_', profile['username'].strip())
    outfile = 'scam' + uid + '.json'

    # Save output.
    profile['images'] = save_img(content, uid)
    with open(outfile, 'w') as fh:
        json.dump(profile, fh)
            
def save_img(content, username):
    """Download every image inside ``content`` into the working directory.

    Files are named ``<username><n>.jpg`` where ``n`` starts at 1 and only
    advances after a successful download.  An image that has no ``src``,
    does not answer with HTTP 200, or fails to download/write is skipped;
    failures are printed and do not stop the remaining images.

    Args:
        content: Parsed HTML node searched for ``<img>`` tags.
        username: Prefix for the output file names.

    Returns:
        list: names of the files that were written, in download order.
    """
    saved = []
    index = 1
    for img in content.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        outfile = username + str(index) + '.jpg'
        try:
            # The request itself is inside the try so that one unreachable
            # image cannot abort the whole profile.
            r = requests.get(src, stream=True)
            try:
                if r.status_code != 200:  # only an OK response carries the image
                    continue
                # Have urllib3 undo any gzip/deflate transfer encoding while
                # the raw stream is copied to disk.
                r.raw.decode_content = True
                with open(outfile, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            finally:
                # Streamed responses hold their connection until closed.
                r.close()
        except Exception as e:
            print("Exception when handling {}".format(src))
            print(e)
            continue
        saved.append(outfile)
        index += 1

    return saved

    
# Run the scraper; edit the arguments (start year, start month, end year, end month) to pick the range.
# The walk stops as soon as it reaches the end month, so that month itself is not downloaded.
scrape(2014,1,2014,12)

2014-01 : Begin indexing.
2014-01 : 29 profiles
2014-01 : complete.
2014-02 : Begin indexing.
2014-02 : 14 profiles
2014-02 : complete.
2014-03 : Begin indexing.
2014-03 : 0 profiles
2014-03 : complete.
2014-04 : Begin indexing.
2014-04 : 0 profiles
2014-04 : complete.
2014-05 : Begin indexing.
2014-05 : 0 profiles
2014-05 : complete.
2014-06 : Begin indexing.
2014-06 : 0 profiles
2014-06 : complete.
2014-07 : Begin indexing.
2014-07 : 0 profiles
2014-07 : complete.
2014-08 : Begin indexing.
2014-08 : 0 profiles
2014-08 : complete.
2014-09 : Begin indexing.
2014-09 : 0 profiles
2014-09 : complete.
2014-10 : Begin indexing.
2014-10 : 0 profiles
2014-10 : complete.
2014-11 : Begin indexing.
2014-11 : 0 profiles
2014-11 : complete.
