# Collocations & sentiment analysis - MSNBC


In [134]:
# Location of the transcript corpus: the folder, and the CSV file inside it.
corpusfolder = './corpora/MSNBC transcripts/'
corpusfile = '{}MSNBC_withmetadata.csv'.format(corpusfolder)


In [135]:
from datetime import datetime

In [136]:
# Load the corpus CSV into a DataFrame. Per the original note, the month of
# each transcript is in column 3 / "MonthNr"; the corpus is analysed one month
# at a time.
# NOTE(review): `file` is not referenced again in the visible part of the
# notebook -- presumably it was used to look up the per-month row cut-offs.
import pandas as pd 

file = pd.read_csv(corpusfile)


In [137]:
# Row cut-offs, one per month. analyze_texts() only processes CSV rows whose
# index is <= the chosen cut-off (it currently uses `march`).
# NOTE(review): the values look like cumulative last-row indices per month --
# confirm against the MonthNr column of the corpus.
march = 114
april = 249
may = 368
june = 479
july = 593
august = 664
september = 735

## Define categories of interest



In [138]:
# Search terms for category 1; category 2 is left empty (single-category run).
cat1_words = {
    'social distancing', 'social distance', 'six feet', 'face masks',
    'mask', 'masks', 'quarantine', 'lockdown',
}
cat2_words = set()

# Number of active categories, the term set of each category (in order),
# and the display name of each active category.
nrcategories = 1
term_list = [cat1_words, cat2_words]
cat_names = ['Social Distancing']

In [139]:
# When False, search terms and texts are lower-cased before matching.
capsmatter = False

# When True, a search term also matches any word that merely starts with it.
countstems = False

# When True, texts matching both categories are left out of the per-category
# frequency ratios (only relevant with two categories).
catsexclusive = True

# When True, texts are classified and counted per sentence; otherwise per article.
sentencefilter = True

In [140]:
# With a single category, discard whatever was listed for the second one.
if nrcategories == 1:
    term_list[1] = set()

# Case-insensitive search: normalise every search term to lower case.
if not capsmatter:
    term_list = [{term.lower() for term in terms} for terms in term_list]

# Union of the search terms of both categories.
allterms = set().union(*term_list)

# wordsonly is True when no search term contains a space, i.e. every term is a
# single word/stem rather than a multi-word phrase.
wordsonly = all(' ' not in term for term in allterms)
if wordsonly:
    cat_words = allterms
else:
    # Individual words that occur inside any (possibly multi-word) term.
    cat_words = {piece for term in allterms for piece in term.split()}
    # Turn each term into a regex: word boundary at the start, and at the end
    # too unless stems should match.
    term_list = [{'\\b' + searchterm + ('' if countstems else '\\b')
                  for searchterm in terms}
                 for terms in term_list]


In [141]:
def cat_match(text, cat_terms, countstems=False, wordsonly=True):
    """Report whether `text` contains at least one term from `cat_terms`.

    Args:
        text: the text to search.
        cat_terms: set of search terms; plain words when `wordsonly` is True,
            regular-expression patterns otherwise.
        countstems: with `wordsonly`, also accept words that start with a term.
        wordsonly: compare against whitespace-separated words (True) or run
            each term as a regex over the whole text (False).

    Returns:
        True if any term matches, False otherwise (always False for an empty
        term collection).
    """
    import re
    if not cat_terms:
        return False

    if not wordsonly:
        # Terms are regex patterns; try every one of them on the raw text.
        hits = [re.search(pattern, text) for pattern in cat_terms]
        return any(hit is not None for hit in hits)

    tokens = set(text.split())
    if countstems:
        # A term matches when some word begins with it.
        return any(token.startswith(term)
                   for token in tokens for term in cat_terms)
    # Exact word match: the two sets share at least one element.
    return not cat_terms.isdisjoint(tokens)


In [142]:
def classify_text(text, cat1_terms, cat2_terms, countstems=False, wordsonly=True):
    """Classify `text` by which category term lists it matches.

    The text is lower-cased first unless the module-level `capsmatter` flag is
    set; the terms are assumed to be lower-case already in that case.

    Returns:
        0 for no match, 1 for category 1 only, 2 for category 2 only,
        3 when both categories match.
    """
    haystack = text if capsmatter else text.lower()
    in_cat1 = cat_match(haystack, cat1_terms, countstems, wordsonly)
    in_cat2 = cat_match(haystack, cat2_terms, countstems, wordsonly)
    # Encode the two match flags as a 2-bit category number.
    return (1 if in_cat1 else 0) + (2 if in_cat2 else 0)
    

In [143]:
def increment_counts(words, cat, word_counter, word_freq):
    """Add `words` to the running totals of category `cat`.

    Args:
        words: list of words to count.
        cat: category index (0-3) the words belong to.
        word_counter: list of total word counts per category (updated in place).
        word_freq: list of four {word: count} dicts, one per category
            (updated in place). A word seen for the first time is registered
            with count 0 in all four dicts so every dict has the same keys.

    Returns:
        The same (word_counter, word_freq) objects, after updating.
    """
    word_counter[cat] += len(words)
    known = word_freq[0]
    bucket = word_freq[cat]
    for token in words:
        if token not in known:
            for idx in range(4):
                word_freq[idx][token] = 0
        bucket[token] += 1
    return word_counter, word_freq


In [144]:
def check_caps(word, capitalization):
    """Record one occurrence of `word` and whether it was capitalized.

    Args:
        word: a non-empty word as it appears in the text.
        capitalization: pair of dicts keyed by the lower-cased word:
            [total occurrences, occurrences whose first character equals its
            own upper-case form]. Updated in place.

    Returns:
        The same `capitalization` object.

    Note: digits and punctuation equal their own upper-case form, so words
    starting with them count as "capitalized" here.
    """
    totals, capped = capitalization[0], capitalization[1]
    key = word.lower()
    if key not in totals:
        totals[key] = 0
        capped[key] = 0
    totals[key] += 1
    initial = word[0]
    if initial.upper() == initial:
        capped[key] += 1
    return capitalization


In [145]:
def stemofword(word, stems):
    """Return True when `word` begins with at least one of `stems`."""
    for stem in stems:
        if word.startswith(stem):
            return True
    return False

In [146]:
# Echo the current sentencefilter setting (True = classify/count per sentence).
sentencefilter

True

#### 5a. Collocations

In [147]:
def analyze_texts(corpusfile, term_list, nrcategories, capsmatter=False,
                  sentencefilter=True, countstems=False, wordsonly=None,
                  max_row=None):
    """Classify the corpus texts and collect word statistics per category.

    Categories are numbered as returned by classify_text(): 0 = no match,
    1 = category 1 only, 2 = category 2 only, 3 = both.

    Args:
        corpusfile: path of the corpus CSV; the text is read from column 7.
        term_list: two sets of search terms (category 1, category 2).
        nrcategories: number of active categories. Kept for interface
            compatibility; not used inside this function.
        capsmatter: when False, counted words are lower-cased. (Matching
            itself is lower-cased by classify_text() according to the
            module-level `capsmatter` flag.)
        sentencefilter: when True, each text is split on '.' and sentences
            are classified and counted individually; otherwise whole articles
            are.
        countstems: passed on to classify_text().
        wordsonly: True when the terms are plain single words, False when
            they are regex patterns. When None (default) it is inferred from
            the terms: the term-preparation cell prefixes every term with a
            literal '\\b' exactly when regex matching is required.
        max_row: highest row index (as counted by enumerate over csv.reader,
            starting at 0) to process. Defaults to the module-level `march`
            cut-off.

    Returns:
        (article_counter, sentence_counter, word_counter, word_freq,
        capitalization_record, cat_texts, sent_counter), where the first
        three are 4-element per-category count lists, word_freq is a list of
        four {word: count} dicts, capitalization_record is the pair of dicts
        maintained by check_caps(), cat_texts holds the selected
        sentences/articles per category and sent_counter is the total number
        of sentences seen.
    """
    import csv
    csv.field_size_limit(10000000)

    if max_row is None:
        max_row = march  # notebook-level month cut-off

    cat1_terms = term_list[0]
    cat2_terms = term_list[1]

    if wordsonly is None:
        # Regex-mode terms all start with a literal '\b' word-boundary marker.
        wordsonly = not any(term.startswith('\\b')
                            for terms in (cat1_terms, cat2_terms)
                            for term in terms)

    # Define main variables to track words, etc.
    article_counter, sentence_counter, word_counter = \
        [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]
    cat_texts = [[], [], [], []]      # containers for selected text
    word_freq = [{}, {}, {}, {}]      # dictionary for words encountered
    capitalization_record = [{}, {}]  # record of words capitalized

    sent_counter = 0
    articles = 0

    with open(corpusfile, 'r') as texts:

        for text_counter, row in enumerate(csv.reader(texts)):

            if text_counter > max_row:
                # Rows past the cut-off are never used, so stop reading.
                break

            text = row[7]
            # Pass wordsonly explicitly: the default (True) can never match
            # regex-style terms, which left every article in category 0.
            textcat = classify_text(text, cat1_terms, cat2_terms,
                                    countstems, wordsonly)
            article_counter[textcat] += 1
            articles += 1

            for sentence in text.split('.'):
                sent_counter += 1
                sent_words = sentence.split()
                # figure out how often each word is capitalized
                # (except 1st word in sentence)
                for word in sent_words[1:]:
                    capitalization_record = check_caps(word, capitalization_record)

                if sentencefilter:
                    # make words lower-case as appropriate
                    if not capsmatter:
                        sent_words = [w.lower() for w in sent_words]
                    # The 5th positional argument of classify_text() is
                    # wordsonly (the original passed capsmatter here).
                    sentcat = classify_text(sentence, cat1_terms, cat2_terms,
                                            countstems, wordsonly)
                    sentence_counter[sentcat] += 1
                    word_counter, word_freq = \
                        increment_counts(sent_words, sentcat, word_counter, word_freq)
                    cat_texts[sentcat].append(sentence)

            if not sentencefilter:
                word_counter, word_freq = \
                    increment_counts([w if capsmatter else w.lower() for w in text.split()],
                                     textcat, word_counter, word_freq)
                cat_texts[textcat].append(text)

            # Progress update for user
            if text_counter % 5000 == 0:
                print("Processing article %d" % (text_counter,))

    print("Processed %d sentences in %d articles" % (sent_counter, articles))

    return article_counter, sentence_counter, word_counter, \
           word_freq, capitalization_record, cat_texts, sent_counter
                               

In [148]:
# Run the corpus analysis, printing wall-clock timestamps before and after.
print("Starting at", datetime.now())

# Unpack the per-category counters, word frequencies, capitalization record
# (word_caps), selected texts and total sentence count returned by analyze_texts().
article_counter, sentence_counter, word_counter, word_freq, \
    word_caps, cat_texts, nrsentences = \
        analyze_texts(corpusfile, term_list, nrcategories, capsmatter,
                      sentencefilter, countstems)

print("Done at", datetime.now())


Starting at 2020-11-10 17:36:06.567361
Processing article 0
Processed 55895 sentences in 115 articles
Done at 2020-11-10 17:36:10.741039


In [149]:
import csv

# Path of the sentiment lexicon: a CSV whose first two columns are read as
# (word, valence).
sentdictfile = './corpora/more_vader.csv' 

sentiment_dict = {}

# Fill the dictionary, skipping rows that have fewer than two columns.
with open(sentdictfile, 'r') as dictfile:
    for row in csv.reader(dictfile):
        if len(row) < 2:
            continue
        sentiment_dict[row[0]] = float(row[1])

nr_entries = len(sentiment_dict)
print('The Vader sentiment dictionary contains {} entries.'.format(nr_entries))

# Mean valence over all lexicon entries.
avg_valence = sum(sentiment_dict.values()) / nr_entries
print('The average valence is {:4.2f}.'.format(avg_valence))

The Vader sentiment dictionary contains 7534 entries.
The average valence is -0.18.


In [150]:
def get_sentiment(texts, sentiment_dict, aggregation='simple'):
    """Return a sentiment value for each text in texts.

    Each text is split on single spaces and the valences of the words found
    in `sentiment_dict` are summed. Words are looked up exactly as they
    appear (no lower-casing or punctuation stripping).

    Args:
        texts: iterable of text strings.
        sentiment_dict: mapping from word to numeric valence.
        aggregation: 'simple' for the plain sum of valences, or 'scaled' for
            that sum divided by the number of space-separated words in the
            text.

    Returns:
        A list with one sentiment value per text, in input order.

    Raises:
        ValueError: if `aggregation` is neither 'simple' nor 'scaled'
            (previously this silently returned None).
    """
    if aggregation not in ('simple', 'scaled'):
        raise ValueError(
            "aggregation must be 'simple' or 'scaled', got %r" % (aggregation,))

    sentiments = []
    for text in texts:
        words = text.split(" ")
        total = sum(sentiment_dict[word] for word in words
                    if word in sentiment_dict)
        if aggregation == 'scaled':
            # split(" ") always yields at least one element, so this never
            # divides by zero.
            total = total / len(words)
        sentiments.append(total)
    return sentiments


In [151]:
# One list of simple (summed) sentiment values per category, in the same
# order as cat_texts (index = category number).
sentiment_data = [get_sentiment(one_category, sentiment_dict, aggregation='simple') \
                  for one_category in cat_texts]

In [152]:
# Number of scored texts in category 0 (no match) and category 1.
for cat_sent in sentiment_data[:2]:
    print(float(len(cat_sent)))

54756.0
1139.0


In [153]:
# Mean sentiment of category 0 (no match) and category 1. A category without
# any texts gets NaN instead of raising ZeroDivisionError.
avg_sentiment = [sum(cat_sentiment) / float(len(cat_sentiment))
                 if len(cat_sentiment) > 0 else float('nan')
                 for cat_sentiment in sentiment_data[:2]]

print("Average sentiment for non-matching sentences: %5.2f" % avg_sentiment[0])
print("Average sentiment for %s sentences: %5.3f" % (cat_names[0], avg_sentiment[1]))
# Two-category variants (these need sentiment_data[:4] above):
#print("Average sentiment for %s sentences: %5.3f" % (cat_names[1], avg_sentiment[2]))
#print("Average sentiment for sentences containing both: %5.3f" % avg_sentiment[3])

Average sentiment for non-matching sentences:  0.08
Average sentiment for Social Distancing sentences: 0.037


In [154]:
# Word totals per category (0 = no match, 1 = category 1 only,
# 2 = category 2 only, 3 = both), as floats so later divisions are float.
cat0_nrwords, cat1_nrwords, cat2_nrwords, cat3_nrwords = \
    [float(word_counter[cat]) for cat in range(4)]

# Matching text totals: sentences when sentencefilter is on, articles otherwise.
cat0_texts, cat1_texts, cat2_texts, cat3_texts = \
    [float((sentence_counter if sentencefilter else article_counter)[cat])
     for cat in range(4)]


In [155]:
# Total number of counted words across all four categories. Every word is
# registered in word_freq[0], so its size is the number of distinct words.
corpuslen = sum([cat0_nrwords, cat1_nrwords, cat2_nrwords, cat3_nrwords], 0.0)

print("Total number of words: %d. Total number of distinct words: %d." %
      (corpuslen, len(word_freq[0])))

Total number of words: 1038064. Total number of distinct words: 18438.


In [156]:
# Switches for the word filters applied in the following cells.
filter_common = True      # Filter 1: keep only the topN most frequent words per category
filter_catwords = True    # Filter 2: drop the search terms themselves
filter_stems = False      # Filter 3: drop words that begin with a search term
filter_bylist = True      # keep only words present in the sentiment dictionary
filter_frequency = False  # frequency filter: category frequency vs. baseline / minimum threshold

In [157]:
# Filter 1: most common words in either category
if filter_common:
    topN = 25000  # the number of top words, per category, to include
                   # note: the top will likely overlap, so this will not produce 2*topN words
    # word_freq[1] holds every word seen (with count 0 when it never occurred
    # in category 1), so with a large topN this keeps the whole vocabulary.
    allcatwords = set([w[0] for w in sorted(word_freq[1].items(),
                                            key=lambda x: x[1],
                                            reverse=True)[:topN]])
    if nrcategories == 2:
        allcatwords |= set([w[0] for w in sorted(word_freq[2].items(),
                                                 key=lambda x: x[1],
                                                 reverse=True)[:topN]])
else:
    allcatwords = set(word_freq[0].keys())

print("Number of different words: %d" % len(allcatwords))
# Keep an independent copy of the unfiltered set: the later filter cells
# modify allcatwords in place (-=, &=), which would otherwise also shrink
# this "full" snapshot because both names pointed at the same set object.
allcatwordsfull = set(allcatwords)

Number of different words: 18438


In [158]:
# Filter 2: Remove any words in our categories (in place).
# NOTE(review): allterms holds the full search terms, so multi-word terms
# such as 'social distancing' can never equal a single word here; the
# per-word set cat_words computed during term preparation is not used in
# the visible notebook -- confirm which of the two was intended.
if filter_catwords:
    allcatwords -= allterms


In [159]:
# Filter 3: Remove words beginning with one of our search terms
# (stemofword() does a plain prefix comparison against every term).
if filter_stems:
    allcatwords = {w for w in allcatwords if not stemofword(w, allterms)}
    

In [160]:
# Filter 4: keep only words that have an entry in the sentiment dictionary
# (in-place intersection with the dictionary's keys).
if filter_bylist:
    allcatwords.intersection_update(sentiment_dict)

In [161]:
# Vocabulary size after the filters above.
print("Number of different words: %d" % len(allcatwords))

Number of different words: 2104


In [162]:
# Overall relative frequency of each remaining word: its count summed over
# all four categories, divided by the total number of words.
# (word_ratios stays undefined when the corpus is empty.)
if corpuslen != 0:
    word_ratios = {
        w: sum(word_freq[cat][w] for cat in range(4)) / corpuslen
        for w in allcatwords
    }
else:
    print("No words found: error in processing")


In [163]:
# Per-category relative word frequencies for the remaining words, computed
# per word (count / number of words in the category).
# To go with sentences/articles instead, replace cat0_nrwords by cat0_texts, etc.
# A category without any words gets ratio 0 for every word.

cat0_ratios = ({w: 0 for w in allcatwords} if cat0_nrwords == 0
               else {w: word_freq[0][w] / cat0_nrwords for w in allcatwords})

if not catsexclusive:
    # Fold the "both categories" texts (category 3) into each category.
    cat1_nrwordsX = cat1_nrwords + cat3_nrwords
    cat2_nrwordsX = cat2_nrwords + cat3_nrwords
    cat1_ratios = ({w: 0 for w in allcatwords} if cat1_nrwordsX == 0
                   else {w: (word_freq[1][w] + word_freq[3][w]) / cat1_nrwordsX
                         for w in allcatwords})
    cat2_ratios = ({w: 0 for w in allcatwords} if cat2_nrwordsX == 0
                   else {w: (word_freq[2][w] + word_freq[3][w]) / cat2_nrwordsX
                         for w in allcatwords})
else:
    # Exclusive categories: ignore category 3 (matters only with 2 categories).
    cat1_ratios = ({w: 0 for w in allcatwords} if cat1_nrwords == 0
                   else {w: word_freq[1][w] / cat1_nrwords for w in allcatwords})
    cat2_ratios = ({w: 0 for w in allcatwords} if cat2_nrwords == 0
                   else {w: word_freq[2][w] / cat2_nrwords for w in allcatwords})


In [164]:
# Filter 5: Frequency of occurrence. A word is kept when its frequency in the
# better of the two target categories exceeds both
# - baseline_threshold times its frequency in the baseline (category 0), and
# - the absolute minimum min_threshold.

if filter_frequency:
    baseline_threshold = 1.5  # required multiple of the baseline frequency
    min_threshold = 0.00001   # required minimum frequency in a target category
    allcatwords = {
        w for w in allcatwords
        if max(cat1_ratios[w], cat2_ratios[w]) >
           max(cat0_ratios[w] * baseline_threshold, min_threshold)
    }


In [None]:
# A word is treated as a proper noun when more than cap_threshold of its
# recorded (non-sentence-initial) occurrences were capitalized.
cap_threshold = 0.667
proper_nouns = {
    word for word in allcatwords
    if word.lower() in word_caps[0]
    and word_caps[1][word.lower()] / float(word_caps[0][word.lower()]) > cap_threshold
}

In [166]:
# Filter 6: Remove proper nouns; the result is stored as a separate set so
# both variants stay available.
allcatwords_noproper = allcatwords.difference(proper_nouns)

# Report how many words remain with and without proper nouns.
print("All words left: %d. Non-proper noun words left: %d" %
      (len(allcatwords), len(allcatwords_noproper)))

All words left: 2104. Non-proper noun words left: 2056


In [167]:
import math
import sys
eps = sys.float_info.epsilon

# PMI of each word for category 1, measured against all text (categories 0
# through 3 combined): natural log of category frequency / overall frequency.
# Words that never occur in the category get the sentinel value -9.99.
PMI_baseline1 = {
    w: math.log(cat1_ratios[w] / word_ratios[w]) if cat1_ratios[w] != 0 else -9.99
    for w in allcatwords
}

# Two additional measures if we have 2 categories:
# - ratio_relative: category 1's share of the two category frequencies
#   (only for words that occur in category 1),
# - PMI_baseline2: the PMI analogue for category 2.
if nrcategories > 1:
    ratio_relative = {
        w: cat1_ratios[w] / (cat1_ratios[w] + cat2_ratios[w])
        for w in allcatwords if cat1_ratios[w] > 0
    }
    PMI_baseline2 = {
        w: math.log(cat2_ratios[w] / word_ratios[w]) if cat2_ratios[w] != 0 else -9.99
        for w in allcatwords
    }


## 8. Display results

### 8a. Overall frequency patterns

In [168]:
# Total number of classified texts across the four categories.
# NOTE(review): the percentages below divide by total_texts and will raise
# ZeroDivisionError if nothing was processed.
total_texts = cat0_texts + cat1_texts + cat2_texts + cat3_texts 

# Unit used in the messages, matching how the texts were counted.
descriptor = 'sentence' if sentencefilter else 'article'

# Share of texts that matched any category (1, 2 or both).
print('%d total %ss; %.2f%% category-specific' % \
      (total_texts, descriptor, 100 * (cat1_texts + cat2_texts + cat3_texts) / total_texts))                                
print('\n%d %ss contain %s (%.2f%%).' % \
      (int(cat1_texts), descriptor, cat_names[0], 100 * cat1_texts / total_texts))

# Extra lines for a two-category analysis (requires a second entry in cat_names).
if nrcategories == 2:
    print('%d %ss contain %s (%.2f%%).' % \
          (int(cat2_texts), descriptor, cat_names[1], 100 * cat2_texts / total_texts))
    print('%d %ss contain both (%.2f%%).' % \
          (int(cat3_texts), descriptor, 100 * cat3_texts / total_texts))
    if cat2_texts != 0:
        # Ratio of category-1-only texts to category-2-only texts.
        print('\n%.1f %ss about %s only for each %s about %s only.' % \
              (cat1_texts / cat2_texts, descriptor, cat_names[0], descriptor, cat_names[1]))

55895 total sentences; 2.04% category-specific

1139 sentences contain Social Distancing (2.04%).


### 8b. Top results by metric (no proper nouns)

In [169]:
def getwords(metriclist, properfilter=False, reverse=True):
    """Return the words of `metriclist` ordered by their metric value.

    Args:
        metriclist: dict mapping word -> metric value.
        properfilter: when True, only words in the module-level
            `allcatwords_noproper` set are considered.
        reverse: True (default) sorts from highest to lowest value.

    Returns:
        List of words in sorted order.
    """
    scores = metriclist
    if properfilter:
        scores = {w: s for w, s in metriclist.items()
                  if w in allcatwords_noproper}
    return sorted(scores.keys(), key=lambda w: scores[w], reverse=reverse)

def getdata(cat, wordlist, ratiovals, metricvals, nrwords, mincount=1):
    """Build display rows for the first `nrwords` sufficiently frequent words.

    Args:
        cat: category index used to look up counts in the module-level
            `word_freq`.
        wordlist: words in display order.
        ratiovals: dict word -> frequency ratio.
        metricvals: dict word -> metric value (e.g. PMI).
        nrwords: number of rows to return.
        mincount: minimum count in `cat` for a word to be included.

    Returns:
        List of (word, count, ratio, metric) tuples; when fewer than
        `nrwords` words qualify, the list is padded with ('-', 0, 0, -9.99).
    """
    counts = word_freq[cat]
    rows = []
    for w in wordlist:
        if counts[w] >= mincount:
            rows.append((w, counts[w], ratiovals[w], metricvals[w]))
    if len(rows) < nrwords:
        missing = nrwords - len(rows)
        return rows + [('-', 0, 0, -9.99)] * missing
    return rows[:nrwords]

In [170]:
def display1list(catdata):
    """Print a header followed by one formatted line per
    (word, count, frequency, PMI) row of `catdata`."""
    print("\nCategory 1      (word, count, frequency, PMI)")
    for word, n, ratio, pmi in catdata:
        line = "%18s %6d %8.5f %5.2f" % (word, n, ratio, pmi)
        print(line)

def display2lists(catdata):
    """Print category 1 and category 2 rows side by side.

    Each element of `catdata` is an 8-tuple: (word, count, frequency, PMI)
    for category 1 followed by the same four fields for category 2.
    """
    print("\nCategory 1 (word, count, frequency, PMI)  Category 2 (word, count, frequency, PMI)")
    for (left_word, left_n, left_ratio, left_pmi,
         right_word, right_n, right_ratio, right_pmi) in catdata:
        fields = (left_word, left_n, left_ratio, left_pmi,
                  right_word, right_n, right_ratio, right_pmi)
        print("%18s %6d %8.5f %5.2f %19s %6d %8.5f %5.2f" % fields)

def displaycomplists(catdata):
    """Print the between-category comparison rows side by side.

    Each element of `catdata` is an 8-tuple: (word, count, frequency, metric)
    for category 1 followed by the same four fields for category 2.
    """
    print("\nCateg.1 (word, count, frequency, metric)  Categ.2 (word, count, frequency, metric)")
    for (left_word, left_n, left_ratio, left_metric,
         right_word, right_n, right_ratio, right_metric) in catdata:
        fields = (left_word, left_n, left_ratio, left_metric,
                  right_word, right_n, right_ratio, right_metric)
        print("%18s %6d %8.5f %5.2f %19s %6d %8.5f %5.2f" % fields)


In [171]:
def swapratiorel(ratiorelative):
    """Convert each value from ratio1/(ratio1 + ratio2) to
    ratio2/(ratio1 + ratio2), i.e. return a new dict with 1 - value."""
    flipped = {}
    for word, share in ratiorelative.items():
        flipped[word] = 1 - share
    return flipped


In [172]:
def displayresults(nopropers=True, nrwords=50, mincount=1):
    """Display result lists, with frequency & PMI information.

    Reads the module-level results (cat_names, nrcategories, the ratio and
    PMI dictionaries) and prints them.

    Args:
        nopropers: when True, words identified as proper nouns are left out.
        nrwords: number of rows per list.
        mincount: minimum count in the category for a word to be listed.
    """
    print("Selection criteria:")
    print("\nCategory 1: %s" % (cat_names[0]))
    # The category 2 name is currently not printed:
    #print("\nCategory 2: %s" % (cat_names[1]))

    # Top words of category 1, ranked by PMI against the baseline.
    ranked1 = getwords(PMI_baseline1, properfilter=nopropers)
    rows1 = getdata(1, ranked1, cat1_ratios, PMI_baseline1,
                    nrwords, mincount=mincount)

    print("\nTop category words, comparing against baseline.")
    print("Measure: PMI (value > 0 means more prevalent in category than in baseline)")

    if nrcategories == 1:
        display1list(rows1)
        return

    # Otherwise assume 2 categories: same ranking for category 2, shown
    # next to category 1.
    ranked2 = getwords(PMI_baseline2, properfilter=nopropers)
    rows2 = getdata(2, ranked2, cat2_ratios, PMI_baseline2,
                    nrwords, mincount=mincount)
    display2lists([left + right for left, right in zip(rows1, rows2)])

    # Salience between the categories: highest relative ratio first for
    # category 1, lowest first (i.e. highest swapped ratio) for category 2.
    top_rel1 = getwords(ratio_relative, properfilter=nopropers, reverse=True)
    top_rel2 = getwords(ratio_relative, properfilter=nopropers, reverse=False)
    rel_rows1 = getdata(1, top_rel1, cat1_ratios, ratio_relative,
                        nrwords, mincount=mincount)
    rel_rows2 = getdata(2, top_rel2, cat2_ratios, swapratiorel(ratio_relative),
                        nrwords, mincount=mincount)
    paired = [left + right for left, right in zip(rel_rows1, rel_rows2)]

    print("\nTop category words, comparing categories to one another.")
    print("Measure: category word ratio / sum of both category ratios")
    print("(value > 0.5 means more prevalent in this category)")

    displaycomplists(paired)


In [173]:
# Top 50 words excluding proper nouns; only words with at least 10
# occurrences in the category are listed.
displayresults(nopropers=True, nrwords=50, mincount=10)  

Selection criteria:

Category 1: Social Distancing

Top category words, comparing against baseline.
Measure: PMI (value > 0 means more prevalent in category than in baseline)

Category 1      (word, count, frequency, PMI)
           wearing     30  0.00087  2.74
              wear     19  0.00055  2.54
          shortage     15  0.00043  1.79
         shortages     11  0.00032  1.62
          guidance     11  0.00032  1.33
        aggressive     10  0.00029  1.23
           protect     23  0.00066  1.03
           exposed     11  0.00032  1.03
              hand     12  0.00035  0.94
          critical     13  0.00038  0.76
          infected     13  0.00038  0.47
              care     55  0.00159  0.45
              best     18  0.00052  0.36
              stop     10  0.00029  0.35
              need    110  0.00318  0.31
              sick     17  0.00049  0.28
            pretty     10  0.00029  0.19
            deaths     10  0.00029  0.12
              kind     28  0.00081  0.12

### 8c. Results with proper nouns

In [174]:
# Same listing, this time keeping proper nouns.
displayresults(nopropers=False, nrwords=50, mincount=10)

Selection criteria:

Category 1: Social Distancing

Top category words, comparing against baseline.
Measure: PMI (value > 0 means more prevalent in category than in baseline)

Category 1      (word, count, frequency, PMI)
           wearing     30  0.00087  2.74
              wear     19  0.00055  2.54
          shortage     15  0.00043  1.79
         shortages     11  0.00032  1.62
          guidance     11  0.00032  1.33
        aggressive     10  0.00029  1.23
           defense     16  0.00046  1.10
           protect     23  0.00066  1.03
           exposed     11  0.00032  1.03
              hand     12  0.00035  0.94
          critical     13  0.00038  0.76
          infected     13  0.00038  0.47
              care     55  0.00159  0.45
              best     18  0.00052  0.36
              stop     10  0.00029  0.35
              need    110  0.00318  0.31
              sick     17  0.00049  0.28
            pretty     10  0.00029  0.19
            deaths     10  0.00029  0.12