# Scrape stackoverflow questions by Tags

In [16]:
import requests
from requests_html import HTML


In [17]:
base_url = "https://stackoverflow.com/questions/tagged/"
tag = "python"
url = f"{base_url}{tag}"
url

'https://stackoverflow.com/questions/tagged/python'

In [18]:
r = requests.get(url)
r.status_code

200

In [19]:
html_str = r.text
html = HTML(html=html_str)

In [20]:
question_summaries = html.find('.question-summary')
question_summaries[0]

<Element 'div' class=('question-summary',) id='question-summary-65060927'>

In [21]:
print(question_summaries[0].text)

0
votes
0answers
2 views
not able to debug this code related to central limit theorem
I had this problem where a maximum load of 9800 poundss could be carried in a lift. there was a cargo of boxes with average weight 205 pounds and standard deviation of 15 pounds. i had to find the ...
python debugging math statistics
asked 1 min ago
Pranav Pushkar
111 bronze badge


In [22]:
cols = ['votes', 'vote_title', 'num_answers', 'views', 'question', 'short_desc', 'tags', 'date', 'user', 'user_details']
this_row = list(question_summaries[0].text.split('\n'))
this_row

['0',
 'votes',
 '0answers',
 '2 views',
 'not able to debug this code related to central limit theorem',
 'I had this problem where a maximum load of 9800 poundss could be carried in a lift. there was a cargo of boxes with average weight 205 pounds and standard deviation of 15 pounds. i had to find the ...',
 'python debugging math statistics',
 'asked 1 min ago',
 'Pranav Pushkar',
 '111 bronze badge']

In [23]:
len(this_row) == len(cols)

True

In [24]:
row_data = dict(zip(cols, this_row))
row_data

{'votes': '0',
 'vote_title': 'votes',
 'num_answers': '0answers',
 'views': '2 views',
 'question': 'not able to debug this code related to central limit theorem',
 'short_desc': 'I had this problem where a maximum load of 9800 poundss could be carried in a lift. there was a cargo of boxes with average weight 205 pounds and standard deviation of 15 pounds. i had to find the ...',
 'tags': 'python debugging math statistics',
 'date': 'asked 1 min ago',
 'user': 'Pranav Pushkar',
 'user_details': '111 bronze badge'}

In [25]:
key_names = ['question', 'votes', 'tags']
classes_needed = ['.question-hyperlink', '.vote', '.tags']
this_question_element = question_summaries[0]
this_question_element.find(classes_needed[0], first=True).text

'not able to debug this code related to central limit theorem'

In [26]:
def clean_scraped_data(text, key_name=None):
    if key_name == 'votes':
        return text.replace('\nvotes', '')
    elif key_name == 'tags':
        return text.replace('\n', '')
    else:
        return text

In [27]:
datas = []
for q_el in question_summaries:
    question_data = {}
    for i, _class in enumerate(classes_needed):
        sub_el = q_el.find(_class, first=True)
        # print(sub_el.text)
        question_data[key_name[i]] = clean_scraped_data(sub_el.text, key_name=key_name[i])
    datas.append(question_data)

datas[0]

{'question': 'not able to debug this code related to central limit theorem',
 'votes': '0',
 'tags': 'python debugging math statistics'}

In [28]:
def parse_tagged_page(html):
    question_summaries = html.find('.question-summary')
    key_names = ['question', 'votes', 'tags']
    classes_needed = ['.question-hyperlink', '.vote', '.tags']
    datas = []
    for q_el in question_summaries:
        question_data = {}
        for i, _class in enumerate(classes_needed):
            sub_el = q_el.find(_class, first=True)
            # print(sub_el.text)
            question_data[key_names[i]] = clean_scraped_data(sub_el.text, key_name=key_names[i])
        datas.append(question_data)
    return datas
    

In [29]:
def extract_data_from_url(url):
    r = requests.get(url)
    if r.status_code not in range(200, 299):
        return []
    html_str = r.text
    html = HTML(html=html_str)
    datas = parse_tagged_page(html)
    return datas
    


In [31]:
base_url = "https://stackoverflow.com/questions/tagged/"
tag = "javascript"
url = f"{base_url}{tag}"
url

'https://stackoverflow.com/questions/tagged/javascript'

In [32]:
extract_data_from_url(url)

[{'question': 'Capture interactive scroll events and ignore programatic scroll?',
  'votes': '0',
  'tags': 'javascript scroll'},
 {'question': 'Node.js Event Loop and Call stack',
  'votes': '1\nvote',
  'tags': 'javascript node.js v8 event-loop'},
 {'question': 'Is there in any php or javascript library available for printing bill on tvs rp3200 star thermal printer?',
  'votes': '-3',
  'tags': 'javascript php thermal-printer inventory-management'},
 {'question': 'How to create React Native Carousel/Slider for shopping app',
  'votes': '0',
  'tags': 'javascript reactjs react-native'},
 {'question': 'javascript dropdown select time conditioned current',
  'votes': '0',
  'tags': 'javascript php date time drop-down-menu'},
 {'question': 'How to chain actions with Link in React Router?',
  'votes': '0',
  'tags': 'javascript reactjs react-router'},
 {'question': 'Save user IP adress to DB using express/mongoose',
  'votes': '0',
  'tags': 'javascript node.js reactjs express mongoose'},

In [35]:
def scrape_tag(tag="python", query_filter='Votes', pagesize=50, max_pages=50):
    base_url = "https://stackoverflow.com/questions/tagged/"
    datas = []
    for p in range(max_pages):
        url = f"{base_url}{tag}?tab={query_filter}&page={p+1}&pagesize={pagesize}"
        datas += extract_data_from_url(url)
    return datas


In [36]:
datas = scrape_tag()

In [39]:

import pandas as pd 
df = pd.DataFrame(datas)
df

Unnamed: 0,question,votes,tags
0,What does the “yield” keyword do?,10694,python iterator generator yield coroutine
1,What does if __name__ == “__main__”: do?,6454,python namespaces main python-module idioms
2,Does Python have a ternary conditional operator?,6348,python operators ternary-operator conditional-...
3,What are metaclasses in Python?,5940,python oop metaclass python-datamodel
4,How do I check whether a file exists without e...,5870,python file file-exists
...,...,...,...
2495,How to expand a list to function arguments in ...,155,python arguments
2496,How to JSON serialize sets?,155,python json serialization set
2497,Stop pip from failing on single package when i...,155,python pip
2498,Excluding directories in os.walk,155,python


In [40]:
df.shape

(2500, 3)

In [41]:
df.to_csv('python-questions.csv', index=False)