# Applied Text and Natural Language Analytics, Fall 2020

### Assignment 7

Submitted by - 
Harsh Dhanuka, hd2457

In [1]:
# !pip install webhoseio
# !pip install simhash

In [2]:
import json
import logging
import operator
import os
import re
import warnings

import gensim
import nltk
import numpy as np
import pandas as pd
import webhoseio
from gensim.models import KeyedVectors
from scipy import spatial
from simhash import Simhash, SimhashIndex

warnings.filterwarnings("ignore")

# Let pandas render up to 100 rows before it truncates displayed output.
pd.set_option("display.max_rows", 100)

# 1. Write a Python program that filters out exactly and/or semantically duplicate articles from your Webhose dataset of news articles:

### Load the downloaded pre-trained Google Word2Vec model from your computer

In [3]:
# Directory that holds the pre-trained vector files; load_wordvec_model combines it with a file name.
# NOTE(review): absolute, machine-specific path — adjust it before running on another machine.
model_path = '/Users/harshdhanuka/Desktop/Columbia Class Matter/SEM 3/5430 Applied Text NLP/Assignment 6/'

def load_wordvec_model(modelName, modelFile, flagBin, modelDir=None):
    """Load pre-trained word vectors stored in word2vec format.

    Args:
        modelName: Human-readable label, used only in the progress messages.
        modelFile: Name of the vector file inside the model directory.
        flagBin: Forwarded as ``binary=`` to the loader (True for a binary
            word2vec file, False for a text one).
        modelDir: Directory containing ``modelFile``. Defaults to the
            notebook-level ``model_path`` when omitted.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns for the file.
    """
    if modelDir is None:
        modelDir = model_path
    # os.path.join gives the right path whether or not the directory ends with a separator.
    model_file_path = os.path.join(modelDir, modelFile)

    print('Loading ' + modelName + ' model...')
    model = KeyedVectors.load_word2vec_format(model_file_path, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

# Load the Google News Word2Vec vectors; the last argument (True) is passed on as the binary flag.
model_w2v       = load_wordvec_model('Word2Vec Google News', 'GoogleNews-vectors-negative300.bin.gz', True)
# Alternative FastText vectors (non-binary file), left disabled.
#model_fasttext = load_wordvec_model('FastText', 'fastText_wiki_en.vec', False)

Loading Word2Vec Google News model...
Finished loading Word2Vec Google News model...


In [4]:
# Sanity check on the loaded model: show the size of its word vectors.
model_w2v.vector_size

300

### Define functions:

In [5]:
# Define function to check if the input words are present in Word2vec model vocabulary
def vocab_check(vectors, words):
    """Return the words from ``words`` that exist in the model vocabulary.

    Args:
        vectors: Word-vector model. Its vocabulary is read from
            ``key_to_index`` when that attribute exists (gensim >= 4) and
            from ``vocab`` otherwise (gensim 3.x).
        words: Iterable of candidate word strings.

    Returns:
        A list of the in-vocabulary words, in input order, each passed
        through ``str.strip()``.
    """
    # gensim 4 removed ``KeyedVectors.vocab``; probe for the new attribute first
    # and fall back to the legacy one so both major versions keep working.
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    return [word.strip() for word in words if word in vocabulary]

In [6]:
# Define function to calculate similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    """Score how similar two strings are using a word-vector model.

    Each string is split on whitespace, words missing from the model
    vocabulary are dropped, and the two remaining sets of unique words are
    compared with ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity`` and accepted by
            ``vocab_check``.

    Returns:
        The value returned by ``vectors.n_similarity``, or ``0.0`` when either
        string has no in-vocabulary words (there is nothing to compare).
    """
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))

    # n_similarity cannot work with an empty word set; report "no similarity"
    # explicitly instead of relying on callers to swallow the resulting error.
    if not s1words or not s2words:
        return 0.0

    return vectors.n_similarity(s1words, s2words)

In [7]:
# Define function to normalise a piece of text: expand a few contractions and strip punctuation
def cleanup(input):
    """Expand a few contractions and remove non-alphanumeric characters.

    Note: despite the original header, no stop words are removed here.

    Args:
        input: Text to normalise.

    Returns:
        The text with "n't" turned into " not", "'ve" into " have" and "'s"
        into a space, and with every character other than ASCII letters,
        digits and spaces removed.
    """
    # Titles mix typographic (’) and straight (') apostrophes; normalise first
    # so that every contraction rule applies to both spellings.
    text = input.replace("’", "'")
    text = text.replace("n't", " not").replace("'ve", " have").replace("'s", " ")
    return re.sub(r'[^a-zA-Z0-9 ]', '', text)

### Load your previously obtained dataset of Webhose news articles

I will be using the `Netflix` dataset

In [8]:
# Parse the Webhose export: every non-empty line is decoded as its own JSON document.
netflix_data = []
with open('/Users/harshdhanuka/Desktop/Columbia Class Matter/SEM 3/5430 Applied Text NLP/Assignment 7/webhose_netflix.json', 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of materialising every line with readlines().
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline), which json.loads would reject.
        if line.strip():
            netflix_data.append(json.loads(line))

In [9]:
# Number of articles parsed from the export.
len(netflix_data)

25288

#### I will slice the data to consider only the first 10,000 rows, to save run-time. Processing all ~25,000 rows takes many hours.

In [10]:
# Keep only the first 10,000 articles. This rebinds netflix_data, so the full
# list is no longer reachable afterwards — re-run the loading cell to get it back.
netflix_data = netflix_data[:10000]

### Copy the original articles into a new variable called 'feeds', giving each one a sequential id

We will base deduplication on article titles only.

In [11]:
# Preview only: tag the first 50 articles with a running id and print id + title.

feeds = []
for position, article in enumerate(netflix_data[:50]):
    article['id'] = position
    print(article['id'], str(article['title']))
    feeds.append(article)

0 13 Reasons Why: The popular Netflix show's creator teases chance of a hopeful ending
1 Judge gives control of 'Tiger King' Joe Exotic's zoo to Carole Baskin
2 A TV reboot of Bong Joon-ho's acclaimed film Snowpiercer has landed on Netflix — what's the deal?
3 2-Pack: Ideaworks Mosquito Killer Lamps (battery powered) 2 for $15
4 Already-Obese Average Americans Have Drunk & Eaten Their Way To An Extra 5lbs During Lockdown
5 Netflix, Disney join other big brands in support of George Floyd protests on 'Blackout Tuesday'
6 Novel Entertainment's First Animated Feature-Length Horrid Henry Special to Air on Netflix – aNb Media
7 Anime Based On Best-Selling 1973 Disaster Novel, Japan Sinks: 2020 Lands On Netflix This July
8 Tiger King star Carole Baskin’s dead husband’s signature on his will was forged, says sheriff
9 All about Netflix’s sci-fi television shows we love
10 File:Federation starbase, 2230s.png
11 news: Reemerged documentary reveals reason behind Her Majesty’s iconic wave
12 Samsu

In [12]:
# Give every article its list position as 'id' and collect all of them in 'feeds'.

feeds = []
for position, article in enumerate(netflix_data):
    article['id'] = position
    feeds.append(article)

## 1.1. Use LSH (SimHash or MinHash), separately or along with Word2Vec, to deduplicate your Webhose feeds based on titles

### Calculate and print SimHash + Word2Vec similarity based duplicate titles

In [13]:
# Raise the 'simhash' logger's level to CRITICAL so its lower-severity messages are not shown

logging.getLogger('simhash').setLevel(logging.CRITICAL)

In [14]:
# Build a SimHash index over all titles.
# hamming_distance is handed to SimhashIndex as k, its near-duplicate tolerance; the value 25 was suggested in class.
# NOTE(review): the tolerance is presumably counted in differing fingerprint bits, not characters — confirm against the simhash docs.

hamming_distance = 25

# One (id as string, title fingerprint) pair per article; titles are coerced with str() before hashing.
objs = [(str(feed['id']), Simhash(str(feed['title']))) for feed in feeds]
index = SimhashIndex(objs, k = hamming_distance)

In [15]:
# Word2Vec similarity threshold: candidate pairs scoring above 0.7 are treated as duplicates (value suggested in class)

threshold_score = 0.7

### Run a for loop, to iterate through all the titles, and capture their respective duplicates

In [16]:
dup_index_list = []     # ids of the articles flagged as duplicates, in discovery order
dup_index_set = set()   # the same ids as a set, for constant-time membership tests
count_duplicates = 0    # running total of flagged duplicates

# Walk the feeds in order. The first article of every duplicate group is kept;
# each later SimHash candidate whose Word2Vec score exceeds threshold_score is flagged.
# Ids were assigned as list positions, so an id can be used directly as an index into 'feeds'.

for feed_index, selected_feed in enumerate(feeds):
    if feed_index in dup_index_set:
        # Already flagged as a duplicate of an earlier article.
        continue

    feed_hash = Simhash(str(selected_feed['title']))
    # Candidate ids (returned as strings) within the index tolerance;
    # the selected article itself is among them.
    duplicate_indices = index.get_near_dups(feed_hash)

    for dupe in duplicate_indices:
        dupe_index = int(dupe)
        candidate = feeds[dupe_index]

        # Never score an article against itself, and skip anything already flagged.
        if dupe_index in dup_index_set or candidate['id'] == selected_feed['id']:
            continue

        try:
            score = calc_similarity(selected_feed['title'], candidate['title'], model_w2v)
        except Exception:
            # Best effort: a pair that cannot be scored counts as "not similar".
            # Unlike a bare except, this still lets KeyboardInterrupt stop a long run.
            score = 0

        # Flag the later copy; the selected (first) article stays in the dataset.
        if score > threshold_score:
            count_duplicates += 1
            dup_index_list.append(candidate['id'])
            dup_index_set.add(candidate['id'])

### Check duplicate statistics

In [17]:
# Summarise the result: dataset size, number of flagged duplicates, and their share in percent.
total_rows = len(feeds)

print()
print(f'The original or raw dataset has a total of: {total_rows} values or entries or rows')
print()
print(f'The number of duplicates as per the hamming distance of 25, and a similarity score threshold of 0.7 is: {count_duplicates}')
print()
duplicate_share = (count_duplicates / total_rows) * 100
print(f'Finally, the dataset has: {duplicate_share}% duplicates')


The original or raw dataset has a total of: 10000 values or entries or rows

The number of duplicates as per the hamming distance of 25, and a similarity score threshold of 0.7 is: 5740

Finally, the dataset has: 57.4% duplicates


In [18]:
# Spot check: print the ten lowest flagged ids with their titles, to confirm
# that only later copies (and not the first occurrences) were flagged.

lowest_flagged_ids = sorted(dup_index_list)[:10]
for flagged_id in lowest_flagged_ids:
    flagged_feed = feeds[flagged_id]
    print(flagged_feed['id'], flagged_feed['title'])

24 Carole Baskin awarded Joe Exotic's former zoo
27 The will of 'Tiger King' star Carole Baskin's missing husband Don Lewis was forged, sheriff says
36 Judge Gives Carole Baskin the Tiger King’s Zoo – غزة اليوم
39 Steve Martin and Martin Short have rescheduled their 'The Funniest Show In Town...At The Moment' tour dates - 03-Jun-2020 - NZ Entertainment news
41 Steve Martin and Martin Short have rescheduled their 'The Funniest Show In Town...At The Moment' tour dates - 03-Jun-2020 - NZ Entertainment news
43 Steve Martin and Martin Short have rescheduled their 'The Funniest Show In Town...At The Moment' tour dates - 03-Jun-2020 - NZ Entertainment news
52 Pinpoint Asset Management Ltd Acquires Shares of 12,106 Netflix, Inc. (NASDAQ:NFLX)
53 Pinpoint Asset Management Ltd Acquires Shares of 12,106 Netflix, Inc. (NASDAQ:NFLX)
54 Fascinating study reveals which country has the best Netflix content
57 US To "Investigate" India, 9 Other Nations Over Tax On Online Firms


#### We see that the title at index 1 is similar to the articles at indices 24, 45, 98, 99, etc. The duplicates list excludes index 1 (the first occurrence) and contains only the later copies, which is exactly what we want.

### Build the final list of dictionaries, without the duplicates

In [19]:
# Keep every article whose position (which is also its id) was not flagged as a duplicate.
# Membership is tested against a set, so each lookup is constant-time instead of a list scan.
flagged_ids = set(dup_index_list)
netflix_unique = [feed for position, feed in enumerate(feeds) if position not in flagged_ids]

print()
print("The length of the new list with unique netflix titles is: " + str(len(netflix_unique)))
print()
print("A random entry (id and title) from the new dataset is: ")
print()
# Fixed sample at position 5 (not a random draw); assumes at least six articles remain.
print(netflix_unique[5]['id'])
print(netflix_unique[5]['title'])


The length of the new list with unique netflix titles is: 4260

A random entry (id and title) from the new dataset is: 

5
Netflix, Disney join other big brands in support of George Floyd protests on 'Blackout Tuesday'


In [20]:
# Print the caption framed by blank lines; the bare len() below becomes the cell's output value.
print("", "The total number of titles or rows in the new unique dataset is: ", "", sep="\n")
len(netflix_unique)


The total number of titles or rows in the new unique dataset is: 



4260

## 1.2. Make sure to store entire feeds in a JSON, text or CSV file

In [21]:
# Persist the deduplicated articles (the complete feed dicts, not just titles) as a single JSON array.
with open('unique_data.json', 'w') as out_file:
    out_file.write(json.dumps(netflix_unique))