In [3]:
# %load tools.py
from collections import defaultdict
import re
import math
from unidecode import unidecode

spchars = re.compile('\`|\~|\!|\@|\#|\$|\%|\^|\&|\*|\(|\)|\_|\+|\=|\\|\||\{|\[|\]|\}|\:|\;|\'|\"|\<|\,|\>|\?|\/|\.|\-')

# Utility function that does the following to the text:
# - Convert to unicode
# - Convert to lowercase
# - Remove special chars
def make_text_parsable(text):
    # convert to unicode
    text = unidecode(text) #.decode('utf-8', 'ignore'))
    # convert text to lowercase
    text = text.lower()
    # remove special characters
    text = spchars.sub(" ", text)
    return(text)

#
# Tokenize by whitespace. Use the defaultdict(int) whichsets the default 
# factory to int which makes it  the default dict useful for counting. 
#
def count_words(text, wc=None):
    if wc == None:
        wc = defaultdict(int)
    tokens = text.split(" ")
    for t in tokens:
        wc[t] += 1  
    return(wc)

#
# Main function. Opens the file and calls helper functions to parse
# Returns the sorted word count
#
def extract_info(filename):
    import json
    wc = defaultdict(int)
    df = defaultdict(set)
    count = 0
    with open(filename) as fin:
        for line in fin:
            count += 1
            current = json.loads(line)
            text = make_text_parsable(current["abstract"] + " " + \
                current["description"] + " " + current["title"])
            wc = count_words(text, wc)
    

    sorted_wc = sorted(wc.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_wc

In [4]:
sorted_wordCount = extract_info("data_file.txt")
print("1. The most common word is: %d", sorted_wordCount[0])

1. The most common word is: %d ('', 1919)


In [5]:
def extract_info_exclued_stopwords(filename):
    stopwords = get_stopwords()
    sorted_wc = extract_info(filename)
    final = []
    
    for wc in sorted_wc:
        if wc[0] not in stopwords and wc[0] != '':
            final.append(wc)
            
    return final

def get_stopwords():
    lines = [line.rstrip('\n') for line in open('stopwords.txt')]
    return lines

In [6]:
sorted_wordCount = extract_info_exclued_stopwords("data_file.txt")
print("2. The most common three words after removing stopwords are: %d", sorted_wordCount[0:3])

2. The most common three words after removing stopwords are: %d [('data', 231), ('with', 133), ('python', 122)]


In [7]:

#3. Removing irrelevant common stopwords.
def more_stop_words():
    return ['with', 'more', 'use']

def extract_info_exclued_stopwords_and_more(filename):
    stopwords = get_stopwords() + more_stop_words()
    sorted_wc = extract_info(filename)
    final = []
    
    for wc in sorted_wc:
        if wc[0] not in stopwords and wc[0] != '':
            final.append(wc)
            
    return final

In [8]:
sorted_wordCount = extract_info_exclued_stopwords_and_more("data_file.txt")
print("4. The most common three words after removing additional stopwords are: %d", sorted_wordCount[0:3])

4. The most common three words after removing additional stopwords are: %d [('data', 231), ('python', 122), ('learning', 77)]


In [168]:
def extract_info_tfidf(filename):
    import json
    df = defaultdict(int)
    
    wc_all = defaultdict(int)
    
    all_tfs = []
    count = 0
    with open(filename) as fin:
        for line in fin:
            count += 1
            current = json.loads(line)
            doc = make_text_parsable(current["abstract"] + " " + \
                current["description"] + " " + current["title"])
                        
            wc = count_words(doc, defaultdict(int))
            df = update_doc_freq(df ,wc)
            all_tfs.append(calc_tf(wc))
            
    

    sorted_df = sorted(df.items(), key=lambda x: x[1], reverse=True)
    
    idfs = calc_idf(sorted_df, count)
    sorted_idfs = sorted(idfs.items(), key=lambda x: x[1], reverse=True)
    
    tf_idfs = calc_tf_idf(all_tfs, idfs)
    
    stopwords = get_stopwords()
    final = defaultdict(float)
    
    for wc in tf_idfs.items():
        if wc[0] not in stopwords and wc[0] != '':
            final[wc[0]] = wc[1]
            
    sorted_idfs = sorted(final.items(), key=lambda x: x[1], reverse=True)
    return sorted_idfs
    
def calc_tf(word_counts):
    tfs = defaultdict(float)
    total = 0
    
    for wc in word_counts.items():
        total += int(wc[1])
                
    for wc in word_counts.items():
        word = wc[0]
        count = wc[1]
        tfs[word] = count / total
    
    return tfs

def update_doc_freq(df ,word_counts):
    for wc in word_counts.items():
        df[wc[0]] += 1  

    return df

def calc_idf(dfs, count):
    idfs = defaultdict(float)
    
    for df in dfs:
        f = df[1]
        idf = math.log1p(count / f)
        idfs[df[0]] = idf
        
    return idfs

def calc_tf_idf(all_tfs, idfs):
    max_tf_idfs = defaultdict(float)
    
    for tfs in all_tfs:
        for t in tfs:
            tf_idf = tfs[t] * idfs[t]
            max_val = max(max_tf_idfs[t], tf_idf)
            max_tf_idfs[t] = max_val            
            
    return max_tf_idfs

In [170]:
sorted_wordCount = extract_info_tfidf("data_file.txt")
print("The scores in sorted order are: ",sorted_wordCount)

