In [1]:
import requests
import re
from collections import Counter

In [2]:
# 1. retrieve the Berlin Wikipedia page
# 2. check the status code

berlin = "https://en.wikipedia.org/wiki/Berlin"
response = requests.get(berlin)

print(response.status_code)

200


In [3]:
# 3. store the html contents of the page

html = response.text


In [4]:
# 4. write the html contents to a file
with open("berlin.html", "wb") as f:
    f.write(html.encode('utf8'))
    f.close()



In [5]:
# 5. remove all html tags
text = re.sub(r'<.*?>', '', html)
text


'\n\n\n\nBerlin - Wikipedia\ndocument.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YCqfhfV4XDMdOp9RTrW7xQAAAM0","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Berlin","wgTitle":"Berlin","wgCurRevisionId":1006622491,"wgRevisionId":1006622491,"wgArticleId":3354,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 German-language sources (de)","All articles with dead external links","Articles with dead external links from July 2020","Articles with permanently dead external links","Articles with dead external links from November 2020","CS1 maint: multiple names: authors list","Articles with dead external links fr

In [6]:
# 6. extract words
words = re.findall('[A-Za-z]+', text)
words

['Berlin',
 'Wikipedia',
 'document',
 'documentElement',
 'className',
 'client',
 'js',
 'RLCONF',
 'wgBreakFrames',
 'wgSeparatorTransformTable',
 'wgDigitTransformTable',
 'wgDefaultDateFormat',
 'dmy',
 'wgMonthNames',
 'January',
 'February',
 'March',
 'April',
 'May',
 'June',
 'July',
 'August',
 'September',
 'October',
 'November',
 'December',
 'wgRequestId',
 'YCqfhfV',
 'XDMdOp',
 'RTrW',
 'xQAAAM',
 'wgCSPNonce',
 'wgCanonicalNamespace',
 'wgCanonicalSpecialPageName',
 'wgNamespaceNumber',
 'wgPageName',
 'Berlin',
 'wgTitle',
 'Berlin',
 'wgCurRevisionId',
 'wgRevisionId',
 'wgArticleId',
 'wgIsArticle',
 'wgIsRedirect',
 'wgAction',
 'view',
 'wgUserName',
 'null',
 'wgUserGroups',
 'wgCategories',
 'CS',
 'German',
 'language',
 'sources',
 'de',
 'All',
 'articles',
 'with',
 'dead',
 'external',
 'links',
 'Articles',
 'with',
 'dead',
 'external',
 'links',
 'from',
 'July',
 'Articles',
 'with',
 'permanently',
 'dead',
 'external',
 'links',
 'Articles',
 'with',

In [7]:
# 7. remove words with less than 6 characters
long_words = [word for word in words if len (word) >= 6]
long_words

['Berlin',
 'Wikipedia',
 'document',
 'documentElement',
 'className',
 'client',
 'RLCONF',
 'wgBreakFrames',
 'wgSeparatorTransformTable',
 'wgDigitTransformTable',
 'wgDefaultDateFormat',
 'wgMonthNames',
 'January',
 'February',
 'August',
 'September',
 'October',
 'November',
 'December',
 'wgRequestId',
 'YCqfhfV',
 'XDMdOp',
 'xQAAAM',
 'wgCSPNonce',
 'wgCanonicalNamespace',
 'wgCanonicalSpecialPageName',
 'wgNamespaceNumber',
 'wgPageName',
 'Berlin',
 'wgTitle',
 'Berlin',
 'wgCurRevisionId',
 'wgRevisionId',
 'wgArticleId',
 'wgIsArticle',
 'wgIsRedirect',
 'wgAction',
 'wgUserName',
 'wgUserGroups',
 'wgCategories',
 'German',
 'language',
 'sources',
 'articles',
 'external',
 'Articles',
 'external',
 'Articles',
 'permanently',
 'external',
 'Articles',
 'external',
 'November',
 'multiple',
 'authors',
 'Articles',
 'external',
 'Webarchive',
 'template',
 'wayback',
 'French',
 'language',
 'sources',
 'Articles',
 'description',
 'description',
 'different',
 'Wikida

In [8]:
# 8. create a Counter object from the long words
c = Counter(long_words)
c


Counter({'Berlin': 670,
         'Wikipedia': 27,
         'document': 3,
         'documentElement': 1,
         'className': 1,
         'client': 2,
         'RLCONF': 1,
         'wgBreakFrames': 1,
         'wgSeparatorTransformTable': 1,
         'wgDigitTransformTable': 1,
         'wgDefaultDateFormat': 1,
         'wgMonthNames': 1,
         'January': 31,
         'February': 46,
         'August': 65,
         'September': 33,
         'October': 38,
         'November': 45,
         'December': 35,
         'wgRequestId': 1,
         'YCqfhfV': 1,
         'XDMdOp': 1,
         'xQAAAM': 1,
         'wgCSPNonce': 1,
         'wgCanonicalNamespace': 1,
         'wgCanonicalSpecialPageName': 1,
         'wgNamespaceNumber': 1,
         'wgPageName': 1,
         'wgTitle': 1,
         'wgCurRevisionId': 1,
         'wgRevisionId': 1,
         'wgArticleId': 1,
         'wgIsArticle': 1,
         'wgIsRedirect': 1,
         'wgAction': 1,
         'wgUserName': 1,
         'wgU

In [9]:
# 9. print how often the word 'Berlin' occurs
c['Berlin']



670

In [10]:
# 10. print the 10 most common words
c.most_common(10)

[('Berlin', 670),
 ('Retrieved', 219),
 ('German', 132),
 ('parser', 100),
 ('Germany', 99),
 ('output', 98),
 ('Archived', 73),
 ('original', 69),
 ('August', 65),
 ('Brandenburg', 53)]