# Applied Text and Natural Language Analytics, Fall 2020

### Assignment 6

Submitted by - 
Harsh Dhanuka, hd2457

In [1]:
import json
import nltk

import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 100)

# 1. Write a Python program based on the provided Class Exercise, which:

## 1.1. Loads the downloaded pre-trained Google Word2Vec model from your computer

In [2]:
model_path = '/Users/harshdhanuka/Desktop/Columbia Class Matter/SEM 3/5430 Applied Text NLP/Assignment 6/'

#### Import the word2vec pre-trained model

In [3]:
def load_wordvec_model(modelName, modelFile, flagBin):
    print('Loading ' + modelName + ' model...')
    model = KeyedVectors.load_word2vec_format(model_path + modelFile, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [4]:
model_word2vec.vector_size

300

## 1.2. Loads your previously obtained dataset of Webhose news articles

I will be using the `Netflix` dataset

In [5]:
file = '/Users/harshdhanuka/Desktop/Columbia Class Matter/SEM 3/5430 Applied Text NLP/Assignment 6/webhose_netflix.json'

json_data=open(file).readlines()
newsfeeds = []

for line in json_data:
    newsfeeds.append(json.loads(line))

In [6]:
df = pd.DataFrame.from_dict(newsfeeds)
df.head(1)

Unnamed: 0,thread,uuid,url,ord_in_thread,parent_url,author,published,title,text,highlightText,highlightTitle,highlightThreadTitle,language,external_links,external_images,entities,rating,crawled
0,{'uuid': 'f890670c1406310226c8e9420a798d81420c...,f890670c1406310226c8e9420a798d81420c5a96,http://omgili.com/ri/.wHSUbtEfZSCvFgWhG.N__Y_k...,0,,stuff.co.nz,2020-06-03T22:49:00.000+03:00,13 Reasons Why: The popular Netflix show's cre...,The controversial 13 Reasons Why is returning ...,,,,english,[],[],"{'persons': [{'name': 'hannah baker', 'sentime...",,2020-06-03T05:55:53.024+03:00


In [7]:
titles = df['title']

# Create a list of all titles
titles_list = [title for title in titles]
print(" ")
print("A sample title in the list is: ")
print(titles_list[1])
print(" ")
print("The total number of titles are:")
len(titles_list)

 
A sample title in the list is: 
Judge gives control of 'Tiger King' Joe Exotic's zoo to Carole Baskin
 
The total number of titles are:


25288

## 1.3. For any one selected article title from the dataset, finds 100 most similar titles based on Word2Vec similarity, prints those titles in a descending order of similarity scores.

The title I select to find the 100 most similar titles is title 9.

In [8]:
title_check = titles_list[9]
title_check

'All about Netflix’s sci-fi television shows we love'

In [9]:
while("" in titles_list): 
    titles_list.remove("") 

In [10]:
print("The total number of titles are:")
len(titles_list)

The total number of titles are:


25287

#### Check if the input words are present in Word2vec model vocabulary

In [11]:
def vocab_check(vectors, words):
    
    output = list()
    for word in words:
        if word in vectors.vocab:
            output.append(word.strip())
            
    return output

#### Calculate similarity between two strings using an input Word Vector Model

In [12]:
# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))
    
    output = vectors.n_similarity(s1words, s2words)
    return output

#### Calculate the scores, for all titles, with reference to title 9 we selected

I will run a try except catch, to run the scores for all the titles first, and then, sort and extract the top 100.

In [13]:
similarity_list = []

for i in titles_list:
    try:
        similarity_score = calc_similarity(title_check, i, model_word2vec)
        similarity_list.append(similarity_score)
    except:
        #similarity_list.append(0, 'ERROR ZERO DIV '+i)
        similarity_list.append(0)

#### Print the 100 top most similar articles

In [14]:
df = pd.DataFrame(list(zip(titles_list, similarity_list)),columns =['Title', 'Similarity'])
most_similar = df.sort_values(['Similarity'], ascending=0)
print(most_similar[1:101])

# the first entry will be the same article with a score of 1.0, 
# so I will print from 1 to 101, instead of 0 to 100

                                                   Title  Similarity
23039  What are you watching on TV during lockdown? W...    0.775924
15201  What is CBS All Access? What you need to know ...    0.765988
18726  I know this much is true: you need to watch th...    0.763972
20053  Will reality shows take over the TV industry? ...    0.763933
9792        7 great TV shows you can watch this weekend,    0.752362
4763          Everything you need to know about Acorn TV    0.744234
20914        7 great TV shows you can watch this weekend    0.744012
5398   HBO Max: Everything you need to know about AT&...    0.743869
5384   HBO Max: Everything you need to know about AT&...    0.742667
22864   Apocalyptic movies show what we worry about most    0.736770
22616   Apocalyptic movies show what we worry about most    0.736770
24878  Joanna Lumley: ‘I adore Gogglebox but I’d be h...    0.735039
24939  Joanna Lumley: ‘I adore Gogglebox but I’d be h...    0.735039
12081  BBC Together lets you watch

# 2. Write a Pyspark program based on the other provided Class Exercise, which:

## 2.1. Loads your previously obtained dataset of Webhose news articles into a Spark dataframe

In [15]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext() 
sqlContext = SQLContext(sc)
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from nltk.stem.wordnet import WordNetLemmatizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

print("Using Apache Spark Version", sc.version)

Using Apache Spark Version 3.0.1


#### Read in Crunchbase Open Data Map into Spark DataFrame

I  will load the `Netflix` dataset into the spark df 

In [16]:
crunchbase_df = sqlContext.read.options(inferSchema='True',delimiter=',',header='True') \
                               .json("webhose_netflix.json")
# Check length
crunchbase_df.count()

25288

In [17]:
# Show first 2 rows
crunchbase_df.show(2)

+-----------+--------------------+--------------------+---------------+--------------+-------------+--------------------+--------------+--------+-------------+----------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     author|             crawled|            entities|external_images|external_links|highlightText|highlightThreadTitle|highlightTitle|language|ord_in_thread|parent_url|           published|rating|                text|              thread|               title|                 url|                uuid|
+-----------+--------------------+--------------------+---------------+--------------+-------------+--------------------+--------------+--------+-------------+----------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|stuff.co.nz|2020-06-03T05:55:...|[[], [[netflix, n...|             []|            []|  

#### Filter out empty descriptions from the dataframe

As per the given columms, the article titles are contained in the `title` column.
So, I will be extracting the required `uuid` and the `title` columns for my analysis. 

In [18]:
crunchbase_df = crunchbase_df.select('uuid','title') \
                             .filter('title != ""')
crunchbase_df.count()

# See that it filters out 1 blank row

25287

In [19]:
# Show the first  5 rows
crunchbase_df.show(5)

+--------------------+--------------------+
|                uuid|               title|
+--------------------+--------------------+
|f890670c140631022...|13 Reasons Why: T...|
|f1da1d6c5ddf6b095...|Judge gives contr...|
|f431c194e4eddacdd...|A TV reboot of Bo...|
|5930a57af03089f5d...|2-Pack: Ideaworks...|
|050149948217f53d4...|Already-Obese Ave...|
+--------------------+--------------------+
only showing top 5 rows



## 2.2. Clean up and tokenize article bodies using the RegexTokenizer and Stopword remover functions provided in the Class Exercise

In the further questions, I need to produce matching titles. Hence, I will be using the  `title` column for all my analysis and tokenization.

In [20]:
regextok = RegexTokenizer(gaps = False, pattern = '\w+', 
                          inputCol = 'title', outputCol = 'tokens')
stopwrmv = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')

crunchbase_df = regextok.transform(crunchbase_df)
crunchbase_df = stopwrmv.transform(crunchbase_df)
crunchbase_df.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                uuid|               title|              tokens|   tokens_sw_removed|
+--------------------+--------------------+--------------------+--------------------+
|f890670c140631022...|13 Reasons Why: T...|[13, reasons, why...|[13, reasons, pop...|
|f1da1d6c5ddf6b095...|Judge gives contr...|[judge, gives, co...|[judge, gives, co...|
|f431c194e4eddacdd...|A TV reboot of Bo...|[a, tv, reboot, o...|[tv, reboot, bong...|
|5930a57af03089f5d...|2-Pack: Ideaworks...|[2, pack, ideawor...|[2, pack, ideawor...|
|050149948217f53d4...|Already-Obese Ave...|[already, obese, ...|[already, obese, ...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



## 2.3. Trains a Word2Vec model based on the output column produced in step 2

In [21]:
word2vec = Word2Vec(vectorSize = 300, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
model = word2vec.fit(crunchbase_df)
wordvectors = model.transform(crunchbase_df)

#wordvectors.select('wordvectors').show(1, truncate = True)
crunchbase_desc = wordvectors.select('uuid','title','wordvectors').rdd.toDF()
crunchbase_desc.show(5)

+--------------------+--------------------+--------------------+
|                uuid|               title|         wordvectors|
+--------------------+--------------------+--------------------+
|f890670c140631022...|13 Reasons Why: T...|[-0.0109111847588...|
|f1da1d6c5ddf6b095...|Judge gives contr...|[-0.0933298821561...|
|f431c194e4eddacdd...|A TV reboot of Bo...|[-0.0527028515511...|
|5930a57af03089f5d...|2-Pack: Ideaworks...|[0.00304350345395...|
|050149948217f53d4...|Already-Obese Ave...|[-0.0066364290192...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



## 2.4. Implements any sample search query, as shown in Class Exercise, and produces matching article titles

For this, I am using the sample query from the link: https://www.forbes.com/sites/forbescommunicationscouncil/2020/10/20/your-content-marketing-should-look-more-like-netflix-not-websters/#235aa6fe70ca 

The sample article title is `Your Content Marketing Should Look More Like Netflix, Not Webster's`

In [22]:
chunk = crunchbase_desc.take(30000)

In [23]:
SEARCH_QUERY = "Your Content Marketing Should Look More Like Netflix, Not Webster's"

In [24]:
query_df  = sc.parallelize([(1,SEARCH_QUERY)]).toDF(['index','title'])
query_tok = regextok.transform(query_df)
query_swr = stopwrmv.transform(query_tok)
query_swr.show()
query_vec = model.transform(query_swr)
query_vec = query_vec.select('wordvectors').collect()[0][0]
#query_vec

+-----+--------------------+--------------------+--------------------+
|index|               title|              tokens|   tokens_sw_removed|
+-----+--------------------+--------------------+--------------------+
|    1|Your Content Mark...|[your, content, m...|[content, marketi...|
+-----+--------------------+--------------------+--------------------+



In [25]:
def cossim(v1, v2): 
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / (np.sqrt(np.dot(v2, v2))+.1)

In [26]:
print(" ")
print("The top 50 matching articles and their similarity scores are as follows:")
print(" ")

sim_rdd = sc.parallelize((i[0], i[1], float(cossim(query_vec, i[2]))) for i in chunk)
sim_df  = sqlContext.createDataFrame(sim_rdd).\
                   withColumnRenamed('_1', 'uuid').\
                   withColumnRenamed('_2', 'title').\
                   withColumnRenamed('_3', 'similarity').\
                   orderBy("similarity", ascending = False)
sim_df.show(50, truncate = False)

 
The top 50 matching articles and their similarity scores are as follows:
 
+----------------------------------------+----------------------------------------------------------------------------------------------------------+------------------+
|uuid                                    |title                                                                                                     |similarity        |
+----------------------------------------+----------------------------------------------------------------------------------------------------------+------------------+
|f3e088f5ad6f18f29520f7a9f515c651e2506f92|A Look Into Netflix's Price Over Earnings                                                                 |0.7900621327159819|
|0cd289d4d57d56148f4f24c46d18a9a827cfee81|Movies That You Should Look Out For On Netflix                                                            |0.7736310926125574|
|e7878727120623bb26a5984fe237ee9dca34dc80|PTC- Take closer look, parents, at N