# Lang-8 Web Scraper

LING 530B <br>
Khia Johnson <br>
April 15, 2019

---

In [None]:
import time
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [None]:
def open_as_soup(url, max_attempts=3, delay=1):
    """Download ``url`` and parse the response body with BeautifulSoup.

    The body is decoded as UTF-8 (undecodable bytes are ignored) and parsed
    with the 'html.parser' backend. A failed attempt is reported on stdout
    and retried after a pause.

    Args:
        url: Address of the page to download.
        max_attempts: Total number of attempts before giving up (default 3).
        delay: Seconds to sleep between a failed attempt and the next one
            (default 1).

    Returns:
        The parsed page as a BeautifulSoup object.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: The error raised by the final attempt, once every attempt
            has failed. Previously this case surfaced as an unrelated
            UnboundLocalError on the return statement.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    last_error = None

    for attempt in range(max_attempts):
        try:
            # The context manager closes the HTTP response once it is read.
            with urlopen(url) as response:
                html = response.read().decode('utf-8', errors='ignore')
            return BeautifulSoup(html, 'html.parser')
        except Exception as e:
            last_error = e
            print('fail to load #', attempt, repr(e))
            # No point in waiting when there is no further attempt.
            if attempt + 1 < max_attempts:
                time.sleep(delay)

    # Every attempt failed: surface the real cause to the caller.
    raise last_error

In [None]:
def extract_profile_info(page_soup):
    """Collect the profile fields shown on a parsed user page.

    Args:
        page_soup: Parsed page (BeautifulSoup object) holding the profile.

    Returns:
        A dict with:
          * one entry per 'user_profile_row' div, keyed by the first line of
            the row's text (spaces replaced by underscores) and holding the
            second line as the value;
          * 'L1': text of the 'speaking_lang_name' element;
          * 'L2': whitespace-split text of the 'studying_lang_name' element;
          * 'Entries_written', 'Corrections_made', 'Corrections_rcvd': link
            text of the 'l01' / 'l02' / 'l03' table cells, or "0" when the
            cell or its link is absent.

    Raises:
        AttributeError: If the page has no 'speaking_lang_name' or
            'studying_lang_name' element.
    """
    user_details = {}

    for row in page_soup.find_all('div', {'class': 'user_profile_row'}):
        profile_row = row.text.strip('\n').split('\n')
        # A row with a label but no value line carries no information;
        # skip it instead of failing the whole profile with an IndexError.
        if len(profile_row) < 2:
            continue
        user_details[profile_row[0].replace(' ', '_')] = profile_row[1]

    user_details['L1'] = page_soup.find('dd', {'class': 'speaking_lang_name'}).text
    user_details['L2'] = page_soup.find('dd', {'class': 'studying_lang_name'}).text.split()

    counters = (
        ('Entries_written', 'l01'),
        ('Corrections_made', 'l02'),
        ('Corrections_rcvd', 'l03'),
    )
    for key, css_class in counters:
        try:
            user_details[key] = page_soup.find('td', {'class': css_class}).a.text
        except AttributeError:
            # find() returned None, or the cell has no <a>: treat as zero.
            user_details[key] = "0"

    return user_details

In [None]:
def get_pagination(page_soup):
    """Return the highest page number listed in the page's pagination list.

    Args:
        page_soup: Parsed page (BeautifulSoup object).

    Returns:
        The largest integer found among the children of the
        ``<ul class="pagination">`` element, or 1 when the page has no such
        element or none of its children holds an integer.
    """
    pagination = page_soup.find('ul', {'class': 'pagination'})
    if pagination is None:
        return 1

    pages = []
    for item in pagination:
        try:
            pages.append(int(item.string))
        except (AttributeError, TypeError, ValueError):
            # Children whose string is missing or not a number are skipped.
            continue

    return max(pages, default=1)

In [None]:
def collect_page_friends_info(page_soup):
    """Gather details about each friend listed on one parsed friends page.

    Args:
        page_soup: Parsed friends page (BeautifulSoup object).

    Returns:
        A dict keyed by friend id (the href of the entry's first link with
        the slashes removed). Each value is a dict with:
          * 'L1': string of the 'speaking' list item;
          * 'L2_English': True when 'English' is one of the comma-separated
            items of the 'studying study_column' list item;
          * 'friend_num': string of the 'friend_num' list item;
          * 'loc': string of the 'location' div, or None when it is absent.
        An empty dict is returned when the page has no friend list at all.

    Raises:
        AttributeError: If a friend entry lacks its link or one of the
            language / friend-count list items.
    """
    # NOTE(review): 'list_fliend' is presumably the class name as spelled in
    # the site's markup -- confirm before "correcting" it.
    friend_list = page_soup.find('ul', {'class': 'list_fliend'})
    if friend_list is None:
        return {}

    keep = {}
    for f in friend_list.find_all('div', {'class': 'column cfx'}):
        friend_id = f.find('a').get('href').replace('/', '')
        studying = f.find('li', {'class': 'studying study_column'}).string
        location = f.find('div', {'class': 'location'})
        keep[friend_id] = {
            'L1': f.find('li', {'class': 'speaking'}).string,
            'L2_English': 'English' in studying.replace('\n', '').split(', '),
            'friend_num': f.find('li', {'class': 'friend_num'}).string,
            # The location block is optional on a friend entry.
            'loc': location.string if location is not None else None,
        }

    return keep

In [None]:
def collect_page_journal_urls(page_soup, user_id=None):
    """Return the URLs of the English-language entries on one journals page.

    Args:
        page_soup: Parsed journals page (BeautifulSoup object).
        user_id: Optional label printed when an entry cannot be read. When
            omitted, the module-level ``user`` variable is used if it exists
            (the scraping loop sets it), otherwise 'unknown user'. The
            previous version read that global unconditionally and raised a
            NameError inside its error handler when it was not defined.

    Returns:
        A list with one href per entry whose 'studying' list item reads
        'English'. The href is taken from the entry's third link when the
        entry contains a 'premium' link, otherwise from its second link.
        Entries that cannot be read are reported on stdout and skipped.
    """
    if user_id is None:
        user_id = globals().get('user', 'unknown user')

    urls = []
    entries = page_soup.find_all(
        'div', {'class': 'vertical-spaced journals_flex floated_on_right'})

    for entry in entries:
        try:
            if entry.find('li', {'class': 'studying'}).text.strip() != 'English':
                continue
            # A 'premium' link inside the entry shifts the entry link by one.
            has_premium = entry.find('a', {'class': 'premium'}) is not None
            link_index = 2 if has_premium else 1
            urls.append(entry.find_all('a')[link_index].get('href'))
        except (AttributeError, IndexError):
            # Missing language item (find() returned None) or too few links.
            print(user_id, 'entry language not identified')

    return urls

In [None]:
def get_one_entry_and_comments(page_soup, user_id):
    """Extract one journal entry plus the comments written by ``user_id``.

    Args:
        page_soup: Parsed journal entry page (BeautifulSoup object).
        user_id: Id compared against each comment header's first link
            (href with the slashes removed).

    Returns:
        A dict with:
          * 'entry_text': space-joined, stripped text of the
            'body_show_ori' div;
          * 'entry_date': stripped text of the first
            'journal_date floated_on_left' span;
          * 'comments': list of (date, text) tuples for the matching
            comment headers, where date is the first line of the header's
            stripped text and text is the stripped text of the element two
            siblings after the header.
    """
    body = page_soup.find('div', {'id': 'body_show_ori'})
    entry_text = body.get_text(" ").strip()

    date_spans = page_soup.find_all('span', {'class': 'journal_date floated_on_left'})
    entry_date = date_spans[0].text.strip()

    headers = page_soup.find_all('div', {'class': 'journal_comment_header'})
    own_comments = [
        (
            header.text.strip().split('\n')[0],
            header.next_sibling.next_sibling.text.strip(),
        )
        for header in headers
        if user_id == header.find('a').get('href').replace('/', '')
    ]

    return {
        'entry_text': entry_text,
        'entry_date': entry_date,
        'comments': own_comments,
    }

In [None]:
# Write the initial state file (seed to-do list, empty scraped/failed lists)
# Commented out -- do not re-run: it would overwrite the saved state
#todo_user_ids = ['1784120','1122476','1677477','225463','1015012','1805882','1636158']
#users_scraped = []
#users_failed = []

#with open('530_project_lists.txt', 'w') as file:
#        json.dump({'to-do': todo_user_ids, 'scraped': users_scraped, 'failed': users_failed}, file)

**Main scraping loop**

In [None]:
# Load current state and reset error counter
with open('530_project_lists.txt', 'r') as file:
    state = json.load(file)

todo_user_ids = state['to-do']
users_scraped = state['scraped']
users_failed = state['failed']
errors = 0


def _save_state():
    """Write the current to-do / scraped / failed lists to the state file."""
    with open('530_project_lists.txt', 'w') as file:
        json.dump({'to-do': todo_user_ids,
                   'scraped': users_scraped,
                   'failed': users_failed}, file)


def _queue_new_friends(friends):
    """Queue friends who study English, list a location and are unprocessed."""
    for f in friends:
        details = friends[f]
        if (details['L2_English'] and details['loc'] is not None
                and f not in users_scraped and f not in users_failed):
            todo_user_ids.append(f)


# Main scraping loop: runs until the to-do list is empty or a second error
# occurs. `user` stays a module-level name on purpose, because
# collect_page_journal_urls may read it when reporting an unreadable entry.
while todo_user_ids:
    start = time.time()
    user = todo_user_ids.pop()
    if user in todo_user_ids:
        # Another copy is still queued; handle the user when that one is popped.
        continue

    base_url = 'https://lang-8.com/' + user
    try:
        friend_soup = open_as_soup(base_url + '/friends')
        profile = extract_profile_info(friend_soup)

        friend_pages = get_pagination(friend_soup)
        friends = collect_page_friends_info(friend_soup)
        for p in range(2, friend_pages + 1):
            p_soup = open_as_soup(base_url + '/friends' + '?page=' + str(p))
            friends.update(collect_page_friends_info(p_soup))

        if 'Age' not in profile:
            # Users without an age are not kept, but their friends are queued.
            users_failed.append(user)
            _queue_new_friends(friends)
            _save_state()
            print(user,'missing age, seconds:',time.time()-start)
            continue

        journal_soup = open_as_soup(base_url + '/journals')
        journal_urls = collect_page_journal_urls(journal_soup)
        # Page count must come from the journals page itself; it was
        # previously read from the friends page by mistake.
        journal_pages = get_pagination(journal_soup)

        for p in range(2, journal_pages + 1):
            p_soup = open_as_soup(base_url + '/journals' + '?page=' + str(p))
            journal_urls.extend(collect_page_journal_urls(p_soup))

        journal = {}
        for entry_url in journal_urls:
            entry_soup = open_as_soup(entry_url)
            journal[entry_url] = get_one_entry_and_comments(entry_soup, user)

        if not journal:
            print(user, 'has no English entries, seconds:',time.time()-start)
            users_failed.append(user)
            _queue_new_friends(friends)
            _save_state()
            continue

        data = {user: {'profile': profile,
                       'friends': friends,
                       'journal': journal}}

        # One JSON object per line, appended so earlier results are kept.
        with open('530_project_data.txt', 'a') as file:
            json.dump(data, file)
            file.write('\n')

        users_scraped.append(user)
        _queue_new_friends(friends)
        _save_state()

        print(user + " seconds:", round(time.time()-start, 2), ', users scraped:',len(set(users_scraped)))

    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C still stops the run.
        print('error with: ' + user + ', users scraped:', len(set(users_scraped)), repr(err))
        users_failed.append(user)
        _save_state()

        # Stop at the second error of the run.
        errors += 1
        if errors > 1:
            break