In [123]:
import json
import pprint
import re
import requests

from bs4 import BeautifulSoup

pp = pprint.PrettyPrinter(indent=4)

data = json.load(open('gg2018.json'))

# helper functions
def find_matching_tweets_from_data(regex, source):
    # source is a list tweet objects, regex is the pattern to match
    returnList = []
    r = re.compile(regex)
    for tweet in source:
        # print(tweet)
        text = tweet.get('text')
        # print(text)
        if(bool(r.match(text))):
            returnList.append(text)
    return returnList


def find_matching_tweet_parts(regex, source):
    # source is a list of strings, regex is the pattern to match
    returnList = []
    r = re.compile(regex)
    for tweet in source:
        # print(tweet)
        returnList += (re.findall(r, tweet))
    return returnList

def find_matching_tweets(regex, source):
    # source is a list of strings, regex is the pattern to match
    returnList = []
    r = re.compile(regex)
    for tweet in source:
        # print(tweet)
        if(bool(r.match(tweet))):
            returnList.append(tweet)
    return returnList

def mostCommonD(source):
    # returns dictionary with number of occurences for each string in source (useful for lists of hashtags/handles)
    d = {}
    for tweet in source:
        if tweet in d:
            d[tweet] += 1
        else:
            d[tweet] = 1
    return d

def listMostCommon(d):
    # reformats dictionary from mostCommonD into an ordered list
    sorted_list = []
    d_invert = dict(map(lambda item: (item[1],item[0]),d.items()))
    sorted_by_vals = sorted(d_invert)
    for val in reversed(sorted_by_vals):
        sorted_list.append([val, d_invert[val]])
    return sorted_list

In [15]:
# Two ways to get the host, 1st gets Seth Myers' Twitter handle

hostList = find_matching_tweets_from_data(".(?i)*host.*", data)
# print(len(hostList))
# hostList is every tweet object that has "host" in it

jobAsHost = find_matching_tweets(".*(?i)job.*", hostList)
# print(len(jobAsHost))
# jobAsHost is every tweet object that has "host" and "job" in it

startsWithHandle = find_matching_tweets("@.*", jobAsHost)
# print(len(startsWithHandle))

# startsWithHandle is every tweet object that has "host" and "job" in it that starts with a twitter handle

justHandle = [text.split(' ', 1)[0] for text in startsWithHandle]
# print(justHandle)
d = mostCommonD(justHandle) # dictionary with number of occurences for each handle
l = listMostCommon(d)
print(l)

[[6, '@sethmeyers'], [2, '@alohamomma60']]


In [16]:
# 2nd gets Seth Myers' name

hostingList = find_matching_tweets_from_data(".*hosting.*", data)
sethList = find_matching_tweets(".*job.*", hostingList)
print(len(sethList))

def mostCommonTwoStartingWords(source):
    maxMentions=0
    mentions=0
    maxMentionName=""
    for tweet in source:
        name=tweet.split(' ', 1)[0]+' '+tweet.split(' ')[1]
        for tweet in source:
            egExpres=".*"+name+".*"
            r=re.compile(egExpres)
            if(bool(r.match(tweet))):
                mentions+=1
        if(mentions>maxMentions):
            maxMentions=mentions
            maxMentionName=name
        mentions=0
    print(maxMentions)
    print(maxMentionName)

mostCommonTwoStartingWords(sethList)

# checking the most common two starting words (figure this will be a name)

20
8
Seth Meyers


In [103]:
# Web Scraping

# Scraping Categories

res = requests.get("https://en.wikipedia.org/wiki/Golden_Globe_Award")
# print(res.status_code)
# print(res.headers)
content = res.content
soup = BeautifulSoup(content)
element = soup.find(id="Categories")
# navigate
element = element.parent.next_sibling.next_sibling.next_sibling.next_sibling
# element is <ul> of motion picture awards
motion_picture_awards = [s for s in element.stripped_strings]
pp.pprint(motion_picture_awards)
element = element.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
# element is <ul> of tv awards
tv_awards = [s for s in element.stripped_strings]
pp.pprint(tv_awards)

[   'Best Motion Picture – Drama',
    'Best Motion Picture – Musical or Comedy',
    'Best Director',
    'Best Actor – Motion Picture Drama',
    'Best Actor – Motion Picture Musical or Comedy',
    'Best Actress – Motion Picture Drama',
    'Best Actress – Motion Picture Musical or Comedy',
    'Best Supporting Actor – Motion Picture',
    'Best Supporting Actress – Motion Picture',
    'Best Screenplay',
    'Best Original Score',
    'Best Original Song',
    'Best Foreign Language Film',
    'Best Animated Feature Film',
    'Cecil B. DeMille Award for Lifetime Achievement in Motion Pictures']
[   'Best Drama Series',
    'Best Comedy Series',
    'Best Actor in a Television Drama Series',
    'Best Actor in a Television Comedy Series',
    'Best Actress in a Television Drama Series',
    'Best Actress in a Television Comedy Series',
    'Best Limited Series or Motion Picture made for Television',
    'Best Actor in a Limited Series or Motion Picture made for Television',
    'Be



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [124]:
# Hashtag Searching

award_tweets = find_matching_tweets_from_data("(?i).*best director.*", data)
hashtags_in_award_tweets = find_matching_tweet_parts("(?i)#\w+", award_tweets)
d = mostCommonD(hashtags_in_award_tweets)
pp.pprint(listMostCommon(d))

# hashtags works okay for movies, not as well for actors/actresses
# handles don't work well for actors/actresses because the handle of the person with the most popular opinion is the one ranked the highest

[   [4982, '#GoldenGlobes'],
    [657, '#TheShapeofWater'],
    [576, '#OscarSoWhite'],
    [325, '#GoldenGlobe'],
    [278, '#NataliePortman'],
    [169, '#TheShapeOfWater'],
    [121, '#TIMESUP'],
    [103, '#goldenglobes'],
    [67, '#TIME'],
    [66, '#GoldenGl'],
    [63, '#TimesUp'],
    [50, '#TIMES'],
    [42, '#GoldenGlobes2018'],
    [38, '#Golde'],
    [30, '#G'],
    [23, '#GoldenGlo'],
    [22, '#Representati'],
    [20, '#Gol'],
    [18, '#timesup'],
    [16, '#go'],
    [14, '#Times'],
    [13, '#TI'],
    [11, '#TimesUP'],
    [9, '#Gold'],
    [7, '#GretaGerwig'],
    [5, '#Globes75'],
    [4, '#natalieportman'],
    [3, '#tech'],
    [2, '#USRC'],
    [1, '#love']]
