In [1]:
### Web Scraping Stack Overflow

### Base code modified and expanded from
### https://medium.com/@nveenverma/web-scraping-tutorial-project-scraping-stack-overflow-e28bb139fc3b

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup as bs
import requests

%matplotlib inline

In [3]:
# Download the Stack Overflow question listing, sorted by votes, 50 per page.
# NOTE(review): no timeout is passed to requests.get — confirm that is acceptable.
response1 = requests.get('https://stackoverflow.com/questions?sort=votes&pagesize=50')

# Show the HTTP status code of the response:
# 200 means the request was successful,
# 404 means the requested resource was not found.
response1.status_code

200

In [4]:
# Parse the downloaded HTML with BeautifulSoup, using Python's built-in 'html.parser'
soup1 = bs(response1.content, 'html.parser')

# Print the indented markup to inspect the page structure (the output is very long)
print(soup1.prettify())

<!DOCTYPE html>
<html class="html__responsive">
 <head>
  <title>
   Highest Voted Questions - Stack Overflow
  </title>
  <link href="https://cdn.sstatic.net/Sites/stackoverflow/Img/favicon.ico?v=ec617d715196" rel="shortcut icon"/>
  <link href="https://cdn.sstatic.net/Sites/stackoverflow/Img/apple-touch-icon.png?v=c78bd457575a" rel="apple-touch-icon"/>
  <link href="https://cdn.sstatic.net/Sites/stackoverflow/Img/apple-touch-icon.png?v=c78bd457575a" rel="image_src"/>
  <link href="/opensearch.xml" rel="search" title="Stack Overflow" type="application/opensearchdescription+xml"/>
  <meta content="width=device-width, height=device-height, initial-scale=1.0, minimum-scale=1.0" name="viewport"/>
  <meta content="website" property="og:type">
   <meta content="https://stackoverflow.com/questions?sort=votes&amp;pagesize=50" property="og:url"/>
   <meta content="Stack Overflow" property="og:site_name"/>
   <meta content="https://cdn.sstatic.net/Sites/stackoverflow/Img/apple-touch-icon@2.png?

In [5]:
# Select the <body> element; the following cells run their CSS selectors against it
body1 = soup1.select_one('body')

# Show the object type of body1
type(body1)

bs4.element.Tag

In [6]:
# Find the date and time each question was submitted:
# collect every <span class="relativetime"> and print the first one to see its structure
time_spans = body1.select("span.relativetime")
print(time_spans[0])

<span class="relativetime" title="2012-06-27 13:51:36Z">Jun 27 '12 at 13:51</span>


In [7]:
# Number of timestamp spans found. In the recorded run this printed 40,
# i.e. fewer than the 50 questions requested with pagesize=50.
print(len(time_spans))

40


In [8]:
# The timestamp is stored in each span's "title" attribute
time_stamp = [i["title"] for i in time_spans]
time_stamp[0]

'2012-06-27 13:51:36Z'

In [9]:
# Find the user who submitted each question:
# collect every <div class="user-details"> and print the first one to see its structure
user_spans = body1.select("div.user-details")
print(user_spans[0])

<div class="user-details">
<a href="/users/87234/gmannickg">GManNickG</a>
<div class="-flair">
<span class="reputation-score" dir="ltr" title="reputation score 434,495">434k</span><span aria-hidden="true" title="46 gold badges"><span class="badge1"></span><span class="badgecount">46</span></span><span class="v-visible-sr">46 gold badges</span><span aria-hidden="true" title="447 silver badges"><span class="badge2"></span><span class="badgecount">447</span></span><span class="v-visible-sr">447 silver badges</span><span aria-hidden="true" title="527 bronze badges"><span class="badge3"></span><span class="badgecount">527</span></span><span class="v-visible-sr">527 bronze badges</span>
</div>
</div>


In [10]:
# Collect the <a> elements inside each user-details block (one list per block)
a_user_list = [i.select('a') for i in user_spans]
print(a_user_list[0])

[<a href="/users/87234/gmannickg">GManNickG</a>]


In [11]:
# Text of the first link in the first user-details block
a_user_list[0][0].text

'GManNickG'

In [12]:
### NOTE: Questions can also be attributed to "community-wiki" or "anon" (anonymous), in which case they have no date and time!!!

In [13]:
#user_span_single = user_spans[0]

def user_find(user_span_single):
    """Return the author label for a single ``div.user-details`` element.

    Parameters
    ----------
    user_span_single
        An element exposing ``select('a')`` and a ``text`` attribute.

    Returns
    -------
    str or None
        * the text of the first ``<a>`` inside the element, or
          ``'community-wiki'`` when that text contains ``'\\r\\n'``;
        * ``'anonymous'`` when there is no ``<a>`` and the element's text
          contains ``'anon'``;
        * ``None`` when neither case applies.
    """
    anchors = user_span_single.select('a')

    if not anchors:
        # No link at all: only the 'anon' marker identifies the author.
        return 'anonymous' if 'anon' in user_span_single.text else None

    label = anchors[0].text

    # A link text containing a CRLF is reported as a community wiki entry
    # instead of a user name.
    return 'community-wiki' if '\r\n' in label else label

In [14]:
# Apply user_find to every user-details block
user_found = [user_find(user_span_single) for user_span_single in user_spans]

In [15]:
# Count the blocks for which user_find returned a truthy value
len([x for x in user_found if x])

50

In [16]:
# Rebuild the timestamp list from the page body (same statements as in the earlier cells)
time_spans = body1.select("span.relativetime")
time_stamp = [i["title"] for i in time_spans]

def time_find(users, time_stamp):
    """Pair every author with a timestamp, padding undated authors.

    Timestamps are consumed from ``time_stamp`` in order. Authors equal to
    ``'community-wiki'`` or ``'anonymous'`` do not consume a timestamp and
    receive the string ``'None'`` instead.

    Parameters
    ----------
    users
        Iterable of author labels, in page order.
    time_stamp
        Iterable of timestamps for the remaining (dated) authors.

    Returns
    -------
    list
        One entry per element of ``users``.

    Raises
    ------
    StopIteration
        If ``time_stamp`` runs out before every dated author got a value.
    """
    undated = ('community-wiki', 'anonymous')
    remaining = iter(time_stamp)

    # next() is only evaluated for dated authors, so the iterator advances
    # exactly once per author that is not in `undated`.
    return ['None' if author in undated else next(remaining) for author in users]

In [19]:
# Re-derive the author labels and drop the falsy results of user_find
user_found = [user_find(user_span_single) for user_span_single in user_spans]
users = [x for x in user_found if x]

# time_find should yield exactly one entry per author
print(len(time_find(users, time_stamp)))

50


In [20]:
# Function to check whether an extracted list has the expected number of items
def error_checking(list_name, length, label=None):
    """Report when a parsed list does not have the expected length.

    Parameters
    ----------
    list_name
        The extracted sequence to check (despite the name, this is the
        sequence itself, not a string).
    length : int
        Expected number of items.
    label : str, optional
        Human-readable name used in the message. Defaults to the type name
        of ``list_name``.

    Returns
    -------
    int or None
        ``-1`` after printing a message when the lengths differ, otherwise
        ``None``.
    """
    actual = len(list_name)
    if actual == length:
        return None

    # Name the offending list and report its real size; interpolating the
    # list itself would dump every parsed element into the message.
    described = label if label is not None else type(list_name).__name__
    print("Error in {} parsing, length {} not equal to {}!!!".format(described, actual, length))
    return -1

In [21]:
def get_top_questions(url, question_count):
    """Scrape the highest-voted questions from a question listing page.

    Parameters
    ----------
    url : str
        Base URL of the listing; the query string
        ``?sort=votes&pagesize=<question_count>`` is appended to it.
    question_count : int
        Number of questions per page.
        WARNING: only enter one of these 3 values: 15, 30, 50, since
        Stack Overflow doesn't display question lists of any other size.

    Returns
    -------
    pandas.DataFrame
        One row per question with the columns ``question``, ``summary``,
        ``tags``, ``time_stamp``, ``user_id``, ``no_of_votes`` and
        ``no_of_answers``.

    Notes
    -----
    ``error_checking`` only prints a message when a selector returns an
    unexpected number of elements; its return value is not acted upon here.
    NOTE(review): the DataFrame constructor is expected to raise if the
    extracted lists end up with different lengths — confirm.
    """
    url = url + "?sort=votes&pagesize={}".format(question_count)

    # Using requests module for downloading webpage content
    response = requests.get(url)

    # Parsing html data using BeautifulSoup
    soup = bs(response.content, 'html.parser')
    body = soup.find('body')

    # Every selector below runs against `body`, the page downloaded above,
    # so the result depends on `url` and `question_count` and not on any
    # variable left over from other notebook cells.

    # Extracting Top Questions
    question_links = body.select("h3 a.question-hyperlink")
    error_checking(question_links, question_count)                     # Error Checking
    questions = [i.text for i in question_links]                       # questions list

    # Extracting Summary
    summary_divs = body.select("div.excerpt")
    error_checking(summary_divs, question_count)                       # Error Checking
    summaries = [i.text.strip() for i in summary_divs]                 # summaries list

    # Extracting Tags
    tags_divs = body.select("div.summary > div:nth-of-type(2)")
    error_checking(tags_divs, question_count)                          # Error Checking
    tags = [[a.text for a in tags_div.select('a')]                     # tags list
            for tags_div in tags_divs]

    # Extracting User Info
    user_spans = body.select("div.user-details")
    user_found = [user_find(user_span_single) for user_span_single in user_spans]
    users = [x for x in user_found if x]                               # users list

    # Extracting Question Time Stamps
    time_spans = body.select("span.relativetime")
    time_stamp = [i["title"] for i in time_spans]
    times = time_find(users, time_stamp)                               # time list

    # Extracting Number of votes
    vote_spans = body.select("span.vote-count-post strong")
    error_checking(vote_spans, question_count)                         # Error Checking
    no_of_votes = [int(i.text) for i in vote_spans]                    # votes list

    # Extracting Number of answers
    answer_divs = body.select("div.status strong")
    error_checking(answer_divs, question_count)                        # Error Checking
    no_of_answers = [int(i.text) for i in answer_divs]                 # answers list

    # Putting all of them together
    df = pd.DataFrame({'question': questions,
                       'summary': summaries,
                       'tags': tags,
                       'time_stamp': times,
                       'user_id': users,
                       'no_of_votes': no_of_votes,
                       'no_of_answers': no_of_answers})

    return df

In [22]:
# Base listing URL; get_top_questions appends the sort/pagesize query string itself
URL2 = 'https://stackoverflow.com/questions'

# Scrape 50 questions and show the last 15 rows of the resulting DataFrame
df1 = get_top_questions(URL2, 50)
df1.tail(15)

Unnamed: 0,question,summary,tags,time_stamp,user_id,no_of_votes,no_of_answers
35,The definitive guide to form-based website aut...,Form-based authentication for websites\n\nWe b...,"[security, http, authentication, language-agno...",,community-wiki,5368,12
36,PUT vs. POST in REST,According to the HTTP/1.1 Spec: \r\n The POST...,"[http, rest, post, put]",2009-03-10 14:25:20Z,alex,5357,34
37,How to make Git “forget” about a file that was...,"There is a file that was being tracked by git,...","[git, gitignore, git-rm]",2009-08-13 19:23:22Z,Ivan,5331,26
38,How do I find all files containing specific te...,I'm trying to find a way to scan my entire Lin...,"[linux, text, grep, directory, find]",2013-06-06 08:06:45Z,Nathan,5217,47
39,What is the most efficient way to deep clone a...,What is the most efficient way to clone a Java...,"[javascript, object, clone]",,community-wiki,5180,67
40,How to disable text selection highlighting,For anchors that act like buttons (for example...,"[css, cross-browser, highlight, textselection]",,anonymous,5175,47
41,How do I include a JavaScript file in another ...,Is there something in JavaScript similar to @i...,"[javascript, file, import, include]",2009-05-05 20:29:18Z,Alec Smart,5159,60
42,Move the most recent commit(s) to a new branch...,I'd like to move the last several commits I've...,"[git, git-branch, branching-and-merging]",2009-06-04 11:59:50Z,Mark A. Nicolosi,4955,14
43,What is a plain English explanation of “Big O”...,I'd prefer as little formal definition as poss...,"[algorithm, complexity-theory, computer-scienc...",2009-10-27 03:07:34Z,Arec Barrwin,4927,39
44,How to get the source directory of a Bash scri...,How do I get the path of the directory in whic...,"[bash, directory]",,community-wiki,4921,66
