In [1]:
# import the libraries
import requests
from bs4 import BeautifulSoup

## Scraping all pages

In [2]:
# Base address of the Edmunds "cars & conversations" discussion thread.
# Page 1 lives at this exact address; later pages append a "/p<N>" suffix.
url = (
    'https://forums.edmunds.com'
    '/discussion/18576/general/x/edmunds-members-cars-conversations'
)

In [3]:
# Total number of pages in the forum thread; the scraping loop below visits
# pages 1..num_pages inclusive.
# NOTE(review): hard-coded value, presumably the page count when this notebook
# was written -- confirm against the live thread before re-running.
num_pages = 2572

In [None]:
# Accumulate the prettified <p> markup of every comment in one big string.
# (Kept as a string so a partially completed run still leaves usable data
# in `all_comments` if the cell is interrupted.)
all_comments = ""

# Visit every page of the thread, 1..num_pages inclusive.
for page_number in range(1, num_pages + 1):
    # The first page is the bare thread url; later pages end in /p2, /p3, ...
    if page_number == 1:
        new_url = url
    else:
        new_url = url + "/p" + str(page_number)

    # Fetch *this page's* url. Requesting the base `url` here would silently
    # re-scrape page 1 on every iteration.
    page = requests.get(new_url)

    # Parse the returned HTML text with the built-in html.parser backend.
    soup = BeautifulSoup(page.text, 'html.parser')

    # Page layout (found by inspecting the site):
    #   <div class="MessageList DataList Comments">   <- all comments on the page
    #     <div class="Message userContent">           <- one user's comment
    #       <p> ... </p>                              <- the comment text
    messagelist = soup.find(class_="MessageList DataList Comments")

    # find() returns None when the container is absent (e.g. an error page or
    # a changed layout); skip the page instead of crashing the whole run.
    if messagelist is None:
        print('Page ' + str(page_number) + ' skipped: no comment list found')
        continue

    # Each comment may contain several <p> tags.
    for usermessage in messagelist.find_all(class_="Message userContent"):
        for paragraph in usermessage.find_all('p'):
            # Drop <a>, <img> and <br> tags embedded in the paragraph so only
            # the user's own text remains.
            for embedded in paragraph.find_all(['a', 'img', 'br']):
                embedded.extract()

            # Add the cleaned paragraph to the mega string.
            all_comments += paragraph.prettify()

    # Progress check.
    print('Page ' + str(page_number) + ' complete')

In [None]:
# Clean the scraped markup down to plain lower-case text.
# The replacements run top to bottom; 'said:' has to be removed before the
# bare ':' so that the quote marker is stripped as a whole.
parsedData = (
    all_comments
    .replace('\n', '')        # newlines inserted by prettify()
    .replace(r"\'", r"'")     # backslash-escaped apostrophes
    .replace(r"<p>", "")      # opening paragraph tags
    .replace(r"</p>", "")     # closing paragraph tags
    .replace(r"said:", "")    # quote attribution marker
    .replace(r":", "")        # any remaining colons
    .lower()
)

## Word Frequency

In [5]:
# grab word frequency using nltk library
import nltk
# stop words
from nltk.corpus import stopwords

# Download only the stopwords corpus. A bare nltk.download() opens the
# interactive downloader, which blocks an unattended "Restart & Run All";
# nothing else from the NLTK data index is used in this notebook.
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Break the cleaned text into tokens: every maximal run of word characters
# (\w+) becomes one token, so punctuation and whitespace are discarded.
tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+')
tokens = tokenizer.tokenize(parsedData)

In [None]:
# Count how often each token occurs.
from collections import Counter

# Building the Counter straight from the token list tallies every token once
# per occurrence.
words = Counter(tokens)

In [None]:
# Show the ten most frequent tokens.
words.most_common(10)

These are the most common words, but they would be the most common words in almost any English text, so they are not very insightful. Let's remove single characters, numbers, and common (stop) words.

In [None]:
# Recount, this time dropping English stop words, pure numbers and
# single-character tokens.
# `stopwords` is NLTK's corpus reader, not a collection of words, so the
# actual word list has to be requested with .words('english'). It is turned
# into a set so each membership test is a constant-time lookup.
english_stopwords = set(stopwords.words('english'))
words = Counter(
    token
    for token in tokens
    if token not in english_stopwords
    and not token.isdigit()
    and len(token) > 1
)

In [None]:
# Show the ten most frequent tokens that survived the filtering.
words.most_common(10)