-
Notifications
You must be signed in to change notification settings - Fork 0
/
association-scrape.py
51 lines (43 loc) · 1.43 KB
/
association-scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Known commands, read from a 'command' column (iterated below when building
# the association matrix), to be matched against the scraped vocabulary.
commands_df = pd.read_csv('commands.csv')
def extract_text(link):
    """Fetch *link* and return the concatenated text of its <pre>, <code>,
    <span>, and <p> elements, grouped by tag in that order.

    Returns an empty string when the page cannot be retrieved.
    """
    doc = ""
    try:
        # Timeout keeps the scrape from hanging indefinitely on a dead host
        # (the original call had none).
        r = requests.get(link, timeout=10)
        r.raise_for_status()
    except requests.exceptions.RequestException:
        # Broadened from ConnectionError: also swallows timeouts and HTTP
        # error statuses, so one bad page yields an empty document instead
        # of crashing the whole scrape.
        return doc
    bs = BeautifulSoup(r.text, "html.parser")
    # NOTE(review): nested tags (<code> inside <pre>, <span> inside <p>)
    # are counted more than once; harmless for binary=True vectorization.
    for tag in ("pre", "code", "span", "p"):
        for el in bs.find_all(tag):
            doc += el.text
    return doc
# Collect question links from the first ten pages of the "linux" tag.
targets = []
for page in range(1, 11):
    # Bug fix: the page number was hard-coded to 1 while the loop variable
    # went unused, so the same ten questions were fetched ten times over.
    url = (
        "https://stackoverflow.com/questions/tagged/linux"
        f"?tab=newest&page={page}&pagesize=10"
    )
    r = requests.get(url)
    bs = BeautifulSoup(r.text, "html.parser")
    targets.extend(bs.find_all(attrs={"class": "s-link"}))
# Build the corpus: one text document per scraped question page.
corpus = []
for t in targets:
    # .get avoids KeyError on anchors without href; startswith avoids
    # IndexError on an empty href (the original indexed href[0] directly).
    href = t.attrs.get('href', '')
    if not href.startswith('/'):
        # Skip external or malformed links; only site-relative question
        # links are followed.
        continue
    corpus.append(extract_text(f"https://stackoverflow.com{href}"))
# Binary bag-of-words: each term is marked present (1) or absent (0) per
# document rather than counted, so the frame below is a presence matrix.
vectorizer = CountVectorizer(binary=True)
vecs = vectorizer.fit_transform(corpus)
df = pd.DataFrame(data=vecs.toarray(), columns=vectorizer.get_feature_names_out())
# One column per known command: its presence column from the corpus when the
# command appeared in the vocabulary, an all-zero column otherwise. reindex
# replaces the original column-by-column loop and does both cases in one step.
association_df = df.reindex(columns=commands_df['command'], fill_value=0)
association_df.to_csv('association.csv', index=False)