# List of required imports

In [1]:
# -*- coding: utf-8 -*-

import json
import re
import nltk
nltk.data.path.append('/modules/cs918/nltk_data/') #for running on lab machines 
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.util import ngrams
from collections import Counter

# Single WordNetLemmatizer instance created once at module level.
wnl = WordNetLemmatizer() 

# Regexes to find URLs

In [2]:
"""url regexs"""
# Broad pattern: anything that starts with "http"/"https" up to the next whitespace.
url_reg2 = r"https?[\S]+" 
# Stricter pattern: optional scheme and "www.", one or more dot-terminated host
# labels, a TLD from a fixed list, then an optional path.
# The host-label class is [a-zA-Z0-9-]; the previous "A-z" range also matched
# the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so punctuation next to
# a host name was swallowed into the match.
url_reg = r"((https?)\:\/\/)?(www\.)?(([a-zA-Z0-9-]+)(\.))+((com)|(be)|(ly)|(ca)|(edu)|(gl)|(co\.uk)|(net)|(org(\.uk)?)|(gov(\.uk)?))(\.(\/)?)?((\/[\S]+)+)?"

`url_reg` on its own does not find every URL, so `url_reg2` is used as well: in the preprocessing loop `url_reg2` is applied first and `url_reg` then removes the remaining ones.

# Counters and token list

In [3]:
"""Stores tokens in this list"""
# Flat list that accumulates every token of the corpus.
tokens = list()

# Per-story tallies: stories with more positive words / with more negative words.
positive_stories = negative_stories = 0

# Corpus-wide tallies: total positive words / total negative words.
positive_count = negative_count = 0

`tokens` stores all of the tokens in the corpus in a single list.
`positive_stories` and `negative_stories` keep track of the number of stories that contain more positive or more negative words, respectively,
while `positive_count` and `negative_count` keep track of the number of matching words across the entire corpus.

# Function to retrieve the POS tag

In [4]:
""" Finds position tag for POS tagging"""
def get_pos_tag(tag):
    """Map a part-of-speech tag string to the matching WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unknown tag: default to noun.
    return wordnet.NOUN

This function is only needed when POS tagging is enabled. The POS tagger returns tags in a format that the lemmatizer does not recognise, so this function converts each tag into an argument the lemmatizer accepts.

# Opens positive and negative words file

In [5]:
    
""" Open positive words file"""
def _load_word_set(path):
    """Return the set of whitespace-separated words read from the file at `path`.

    The file is read inside a ``with`` block so the handle is closed as soon as
    the contents have been read instead of being left open.
    """
    # NOTE(review): every whitespace-separated token in the file ends up in the
    # set; if the lexicon files contain header/comment lines, those words are
    # included too -- confirm against the actual files.
    with open(path) as word_file:
        return set(word_file.read().split())


# Set of positive words
pr = _load_word_set("positive-words.txt")


# Set of negative words
nr = _load_word_set("negative-words.txt")


# Functions to count the number of positive and negative words

In [6]:
""" Counts positive words """   
def pos_word_count(test_list):
    """Return how many items of `test_list` are members of the positive word set `pr`."""
    matches = [word for word in test_list if word in pr]
    return len(matches)

""" Counts negative words """
def neg_word_count(test):
    """Return how many items of `test` are members of the negative word set `nr`."""
    matches = [word for word in test if word in nr]
    return len(matches)
    

# Preprocessing text and calculating the number of positive & negative words in each content

In [7]:
""" Part A """

print("\n \n ===== Part A =====")

print("\n preprocessing text and calculating positive & negative counts...")

# Start from a clean slate so that re-running this cell does not append to the
# token list or add to the counters left over from a previous run.
tokens = []
positive_stories = 0
negative_stories = 0
positive_count = 0
negative_count = 0

# JSON text is UTF-8 encoded, so name the encoding explicitly instead of
# relying on the platform default.
with open("signal-news1.jsonl", encoding="utf-8") as f:
    for line in f: #string form of json object
        if not line.strip():
            continue #skips blank lines, which json.loads cannot parse
        tmp = json.loads(line) #each json object loaded as dict
        content = tmp["content"] #fetches only content field
        lower_content = content.lower() #lowercases content
        urlr2 = re.sub(url_reg2, "", lower_content) #removes URLs
        urlr = re.sub(url_reg, "", urlr2) #removes remaining URLs 
        stripped = re.sub(r"([^\s\w]|_)+", "", urlr) #removes all non-alphanumeric characters except spaces
        removed_numb = re.sub(r"\b[0-9]+\b", "", stripped) #removes numbers
        removed_char = re.sub(r"\b\w{1}\b", "", removed_numb) #removes words with 1 character
        split_text = removed_char.split() #converts the text into list of words (tokens)
        # Optional POS-aware lemmatization (slower); to use it, uncomment the
        # next two lines and comment out the plain lem_text line below.
#        tagged = nltk.pos_tag(split_text)
#        lem_text = [wnl.lemmatize(i[0], get_pos_tag(i[1])) for i in tagged] #lemmatizes the list
        lem_text = [wnl.lemmatize(i) for i in split_text] #lemmatizes the list
        tokens += lem_text # adds the tokens to the token list of corpus
        p_count = pos_word_count(lem_text)
        n_count =  neg_word_count(lem_text)
        # A story counts as positive/negative only on a strict majority; ties count for neither.
        if p_count > n_count:
            positive_stories += 1
        elif p_count < n_count:
            negative_stories += 1
        positive_count += p_count
        negative_count += n_count

print("\n preprocessing finished")


 
 ===== Part A =====

 preprocessing text and calculating positive & negative counts...

 preprocessing finished


Opens the corpus and loads only the "content" field of each article.
Text is preprocessed in the following order:
1) lower cased
2) URLs are removed
3) Non-alphanumeric characters are removed
4) Numbers are removed
5) Single character words are removed 

Then the text is split so it can be lemmatized. 
The POS tagger is optional but it is left out by default because it increases the runtime by 4min+.
If it is to be used then uncomment the 2 lines of code and comment the original lem_text.

Then all the tokens are added to the token list. 
Positive and negative words are counted.

# Calculating vocabulary size

In [8]:
""" Part B"""

print("\n \n ===== Part B =====")

# The vocabulary (types) is the collection of distinct tokens in the corpus.
types = set()
types.update(tokens)


 
 ===== Part B =====


The vocabulary (the set of types) is obtained by converting the list of tokens into a set; its length is the vocabulary size.

# Prints number of tokens and types

In [9]:
# Report the vocabulary size and the total token count.
for label, value in (
    ("\n Number of types (vocabulary size):", len(types)),
    ("\n Number of tokens:", len(tokens)),
):
    print(label, value)


 Number of types (vocabulary size): 124045

 Number of tokens: 5692756


# Calculates the top 25 most popular trigrams

In [10]:
# Lazily generate every run of three consecutive tokens from the corpus token list.
trigrams = ngrams(tokens, 3)
# Tally how often each trigram occurs.
cnt = Counter()
cnt.update(trigrams)
# Keep the 25 most frequent trigrams together with their counts.
top_trigrams = cnt.most_common(25)

First the trigrams are found. Then they are counted. The final command is used to find the top 25 most common trigrams. 

# Prints results

In [11]:
# Report the trigram ranking followed by the sentiment tallies.
for label, value in (
    ("\n The top 25 trigrams are: \n", top_trigrams),
    ("\n Number of positive words in corpus:", positive_count),
    ("\n Number of negative words in corpus:", negative_count),
    ("\n Number of stories with more positive words: ", positive_stories),
    ("\n Number of stories with more negative words: ", negative_stories),
):
    print(label, value)


 The top 25 trigrams are: 
 [(('one', 'of', 'the'), 2434), (('on', 'share', 'of'), 2095), (('on', 'the', 'stock'), 1567), (('a', 'well', 'a'), 1423), (('in', 'research', 'report'), 1415), (('in', 'research', 'note'), 1373), (('the', 'united', 'state'), 1223), (('for', 'the', 'quarter'), 1221), (('average', 'price', 'of'), 1193), (('research', 'report', 'on'), 1177), (('research', 'note', 'on'), 1138), (('share', 'of', 'the'), 1132), (('the', 'end', 'of'), 1130), (('in', 'report', 'on'), 1124), (('earnings', 'per', 'share'), 1121), (('cell', 'phone', 'plan'), 1073), (('phone', 'plan', 'detail'), 1070), (('according', 'to', 'the'), 1066), (('of', 'the', 'company'), 1057), (('buy', 'rating', 'to'), 1016), (('appeared', 'first', 'on'), 995), (('moving', 'average', 'price'), 995), (('day', 'moving', 'average'), 993), (('price', 'target', 'on'), 981), (('part', 'of', 'the'), 935)]

 Number of positive words in corpus: 170754

 Number of negative words in corpus: 129731

 Number of stories w