-
Notifications
You must be signed in to change notification settings - Fork 1
/
update_techcrunch.py
68 lines (50 loc) · 1.89 KB
/
update_techcrunch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import feedparser
from urllib2 import urlopen
from bs4 import BeautifulSoup
import psycopg2
from entity_api import entity_extract
from untitled1 import db, Article
# Ingest script: for every entry of every feed in ``rss``, scrape the linked
# article page and store it as an ``Article`` row (once per title), then run
# ``entity_extract`` on the stored text.

# Feeds to ingest.
rss = {'http://feeds.feedburner.com/TechCrunch/'}


def _fetch_soup(url):
    """Download *url* and return the parsed page.

    The HTTP response is closed as soon as it has been parsed, so the
    script does not leak one socket per article.
    """
    response = urlopen(url)
    try:
        return BeautifulSoup(response, "html.parser")
    finally:
        response.close()


def _scrape_article(post):
    """Return the ``Article`` keyword arguments scraped for feed entry *post*.

    Fetches ``post.link`` and reads the article body, timestamp and
    description from the page; title and image come from the feed entry.

    Raises AttributeError / TypeError when the article body, one of the
    meta tags, or a feed attribute is missing; the caller reports the error
    and skips the entry.
    """
    soup = _fetch_soup(post.link)
    body = soup.find("div", attrs={"class": "article-entry text"})
    related = soup.find("div", attrs={"class": "aside aside-related-articles"})

    # Plain text is taken from the untouched body, before any markup is
    # stripped from the HTML copy below.
    cleantext = body.get_text()
    date = soup.find(
        "meta", attrs={"class": "swiftype", "name": "timestamp"})["content"]

    # Stored HTML = article body minus every <script> element of the page.
    markup = str(body)
    for script in soup.findAll("script"):
        markup = markup.replace(str(script), '')
    # Only strip the related-articles aside when the page has one:
    # str(None) is the text "None", and replacing that would delete every
    # literal "None" from the article.
    if related is not None:
        markup = markup.replace(str(related), '')

    title = post.title
    image = post.media_content[0]["url"]
    description = soup.find(
        "meta", attrs={"name": "sailthru.description"})["content"]

    return {
        "title": title,
        "full_story": cleantext,
        "image": image,
        "category": "TechCrunch",
        "description": description,
        "pubdate": date,
        "html": markup.decode("utf-8"),
    }


for key in rss:
    print(key)
    d = feedparser.parse(key)
    for post in d.entries:
        try:
            # Check for an existing row first so that re-runs do not
            # download and parse articles that are already stored.
            already_stored = db.session.query(Article).filter(
                Article.title == post.title).count()
            if already_stored:
                continue

            fields = _scrape_article(post)
            article_a = Article(**fields)
            db.session.add(article_a)
            db.session.commit()
            print(article_a.id)
            entity_extract(article_a.id, fields["full_story"], 1)
        except psycopg2.IntegrityError:
            print("Caught")
            db.session.rollback()
        except Exception as e:
            # Best effort: report the problem and move on to the next entry.
            print(e)
            # NOTE(review): if db.session is a SQLAlchemy session, driver
            # errors arrive wrapped and do not match the handler above, and
            # the session stays unusable until it is rolled back - confirm
            # against untitled1. Rolling back here keeps one bad entry from
            # breaking every following one.
            db.session.rollback()