In [1]:
## Jason Galvan

In [2]:
pip install --upgrade gensim

Requirement already up-to-date: gensim in /Users/batman/anaconda3/lib/python3.7/site-packages (3.8.3)
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd 
import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors

In [4]:
model_path = '/Users/batman/Downloads/'

In [5]:
#1.1 Loads the downloaded pre-trained Google Word2Vec model from your computer

In [6]:
def load_wordvec_model(modelName, modelFile, flagBin):
    print('Loading ' + modelName + ' model...')
    model = KeyedVectors.load_word2vec_format(model_path + modelFile, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)
#model_word2vec = load_wordvec_model('Word2Vec', 'webhose_skipgram_300.bin', True)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [7]:
def vec_similarity(input1, input2, vectors):
    term_vectors = [np.zeros(300), np.zeros(300)]
    terms = [input1, input2]
        
    for index, term in enumerate(terms):
        for i, t in enumerate(term.split(' ')):
            try:
                term_vectors[index] += vectors[t]
            except:
                term_vectors[index] += 0
        
    result = (1 - spatial.distance.cosine(term_vectors[0], term_vectors[1]))
    if result is 'nan':
        result = 0
        
    return result

In [8]:
# Function checks whether the input words are present in the vocabulary for the model
def vocab_check(vectors, words):
    
    output = list()
    for word in words:
        if word in vectors.vocab:
            output.append(word.strip())
            
    return output

In [9]:
# Function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))
    #if len(s1words) < 1 | len(s2words) < 1:
    #return 0
    
    try:
        
        output = vectors.n_similarity(s1words, s2words)
    
    except:
        
        output = 0
        
    return output

In [10]:
# 1.2 Loads your previously obtained dataset of Webhose news articles

In [11]:
import json

In [12]:
json_data=open('/Users/batman/Downloads/webhose_IBM.json').readlines()
newsfeeds_read = []
for line in json_data:
    newsfeeds_read.append(json.loads(line))
print(len(newsfeeds_read))

3779


In [13]:
# Obtaining all Titles --isolating from remaining data
total_list = []

for feed in newsfeeds_read:
    total_list.append(feed['title'])

In [14]:
# Obtaining only Unique Titles amongst total listing 
unique_title = []

for line in total_list:
    if line not in unique_title:
        unique_title.append(line)

In [15]:
# Confirming total listing amount equals total length of dataset as imported--newsfeeds_read
len(total_list)

3779

In [16]:
# Confirming length of only unique titles amongst total listing 
len(unique_title)

2903

In [17]:
# Created a loop to print only 10 titles instead of 2903---allows me to inspect the data and efficiently 
#select a title   
my_list = unique_title[:10]
for i in my_list:
    print(i)

Microsoft joins Amazon, IBM in pausing face scans for police
Microsoft joins Amazon, IBM in pausing face scans police
Microsoft joins rivals, bars police use of face recognition tech
Microsoft’s Brad Smith says company will not sell facial recognition tech to police
Microsoft won’t sell police its facial-recognition tech, following similar moves by Amazon, IBM
Helping to Accelerate the Search for Solutions to the COVID-19 Crisis
Microsoft says it won’t sell facial recognition technology to police
Microsoft Joins Rivals, Bars Police Use Of Face Recognition Tech
Tech giants to stop selling facial recognition technology to the police following the death of George Floyd
Microsoft says it won't sell its facial recognition tech to police


In [18]:
input1 = "Microsoft joins Amazon, IBM in pausing face scans for police"

In [19]:
# Using a for loop to compare titles to selected title 
total_output = []
scores = {}
for item in unique_title:
    input2 = item
    output = calc_similarity(input1, input2, model_word2vec)
    
    try:
        output = calc_similarity(input1, input2, model_word2vec)
        total_output.append((input2, output))
    except ZeroDivisionError:
        pass
    #print(str2, output)

In [21]:
# Converting total_output to pandas df in order to sort top 100 results in descending order
from pandas import DataFrame    

df = DataFrame (total_output,columns=['title', 'similarity scores'])

In [22]:
len(df)

2903

In [23]:
# 1.3 For any one selected article title from the dataset, finds 100 most similar titles based 
#on Word2Vec similarity, prints those titles in a descending order of similarity scores.  

import pandas as pd 

res = df.sort_values('similarity scores', ascending=False)[:100] 
res.reset_index()

Unnamed: 0,index,title,similarity scores
0,0,"Microsoft joins Amazon, IBM in pausing face sc...",1.000000
1,1,"Microsoft joins Amazon, IBM in pausing face sc...",0.991228
2,2,"Microsoft joins rivals, bars police use of fac...",0.726344
3,255,"IBM quits facial recognition, joins call for p...",0.694210
4,59,"IBM quits facial recognition, joins call for p...",0.694210
5,122,"Armonk-based IBM quits facial recognition, joi...",0.694210
6,188,"IBM quits facial recognition, joins call for p...",0.693293
7,250,"IBM quits facial recognition, joins call for p...",0.691081
8,110,IBM abandons development of facial recognition...,0.689923
9,234,"IBM quits facial recognition, citing surveilla...",0.686724


In [24]:
!pip install pyspark



In [25]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext() 
sqlContext = SQLContext(sc)
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from nltk.stem.wordnet import WordNetLemmatizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

print("Using Apache Spark Version", sc.version)

Using Apache Spark Version 3.0.0


In [26]:
# 2.1 Loading previously obtained dataset of Webhose news articles into Spark dataframe

In [27]:
spark_df = sqlContext.read.option("header", "true").option("delimiter", ",") \
                    .option("inferSchema", "true") \
                    .json("/Users/batman/Downloads/webhose_IBM.json")

In [28]:
#2.2. Clean and tokenize article data using RegexTokenizer and Stopword functions

data=spark_df['text']

In [29]:
pip install -U spacy

Requirement already up-to-date: spacy in /Users/batman/anaconda3/lib/python3.7/site-packages (2.3.1)
Note: you may need to restart the kernel to use updated packages.


In [30]:
#Importing necessary libraries onto new system --no access previously downloads
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/batman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
#Importing necessary libraries onto new system --no access previously downloads

nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/batman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/batman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load( disable=['parser', 'tagger','ner'] )

def cleanup_pretokenize(data):
    #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'http\S+', '', data)
    text = text.replace("'s", " ")
    text = text.replace("n't", " not ")
    text = text.replace("'ve", " have ")
    text = text.replace("'re", " are ")
    text = text.replace("I'm"," I am ")
    text = text.replace("you're"," you are ")
    text = text.replace("You're"," You are ")
    text = text.replace("-"," ")
    text = text.replace("/"," ")
    text = text.replace("("," ")
    text = text.replace(")"," ")
    text = text.replace("%"," percent ")
    return text

lmtzr = WordNetLemmatizer()
def text_cleanup(row):
    desc = row[2].strip().lower()
    tokens = [w.lemma_ for w in nlp(cleanup_pretokenize(desc))]
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if len(token) > 3]
    #tokens = [lmtzr.lemmatize(token,'v') for token in tokens]
    row[2] = ' '.join(tokens)
    return row

regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'tokens')
swr = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')

In [33]:
ibmdata = spark_df['uuid','title','text']

In [36]:
#Confirming correct size of dataset
ibmdata.count()

3779

In [37]:
#Remove diplicates
ibmdata2=ibmdata.drop_duplicates(subset=['title'])

In [38]:
#Confirming correct length of the data after removing duplicates
ibmdata2.count()

2903

In [39]:
crunchbase_columns = [0,1,2]
crunchbase_rdd = ibmdata2.select('*') \
                       .rdd.map(lambda row: [row[i] for i in crunchbase_columns]) \
                       .filter(lambda row: row[1] is not None)
crunchbase_df = sqlContext.createDataFrame(crunchbase_rdd, 
                                           ['uuid','title','text'])
crunchbase_df.show(5)

+--------------------+--------------------+--------------------+
|                uuid|               title|                text|
+--------------------+--------------------+--------------------+
|abdd9354dd68f44c3...|           2020-4288|CVE-2020-4288 Det...|
|8e3a552fadee4b091...|Amazon won’t say ...|Amazon’s facial r...|
|562a381c7bfc37ea2...|Global Blockchain...|https://www.marke...|
|3365120cf716a2d7a...|Hertz files for b...|The Chapter 11 fi...|
|a8cc03ee6574b280e...|IBM adds AI-fuell...|Credit: Dreamstim...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [40]:
def cossim(v1, v2): 
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / (np.sqrt(np.dot(v2, v2))+.1)

In [41]:
df_tokens = regexTokenizer.transform(crunchbase_df)
desc_swr = swr.transform(df_tokens)
desc_swr.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                uuid|               title|                text|              tokens|   tokens_sw_removed|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|abdd9354dd68f44c3...|           2020-4288|CVE-2020-4288 Det...|[cve, 2020, 4288,...|[cve, 2020, 4288,...|
|8e3a552fadee4b091...|Amazon won’t say ...|Amazon’s facial r...|[amazon, s, facia...|[amazon, facial, ...|
|562a381c7bfc37ea2...|Global Blockchain...|https://www.marke...|[https, www, mark...|[https, www, mark...|
|3365120cf716a2d7a...|Hertz files for b...|The Chapter 11 fi...|[the, chapter, 11...|[chapter, 11, fil...|
|a8cc03ee6574b280e...|IBM adds AI-fuell...|Credit: Dreamstim...|[credit, dreamsti...|[credit, dreamsti...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [42]:
# 2.3 Train a Word2Vec model based on the output column produced in step 2

In [43]:
word2vec = Word2Vec(vectorSize = 300, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
model = word2vec.fit(desc_swr)
wordvectors = model.transform(desc_swr)
#wordvectors.select('wordvectors').show(1, truncate = True)
crunchbase_desc = wordvectors.select('uuid','title','wordvectors').rdd.toDF()
crunchbase_desc.show(5)

+--------------------+--------------------+--------------------+
|                uuid|               title|         wordvectors|
+--------------------+--------------------+--------------------+
|abdd9354dd68f44c3...|           2020-4288|[-0.0206399736205...|
|8e3a552fadee4b091...|Amazon won’t say ...|[-0.1234268072463...|
|562a381c7bfc37ea2...|Global Blockchain...|[-0.0096478730347...|
|3365120cf716a2d7a...|Hertz files for b...|[-0.0211732663956...|
|a8cc03ee6574b280e...|IBM adds AI-fuell...|[-0.0237706488485...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [44]:
# 2.4 Implement any sample search query, as shown in Class Exercise, and produce matching article titles

In [45]:
synonyms = model.findSynonyms("privacy", 20)   
synonyms.show()

+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|   concerns|0.7404017448425293|
|incorrectly| 0.698283851146698|
|  criticism|0.6786624193191528|
|       laws|0.6733431816101074|
| opposition|0.6728506088256836|
|  advocates|0.6685310006141663|
|  biometric|0.6667436957359314|
| questioned|0.6633265018463135|
|   suspects|0.6630204916000366|
| criticized|0.6620674729347229|
|    misused|0.6587579250335693|
|   innocent|0.6524674892425537|
|unregulated|0.6521512269973755|
|     ceased|0.6508261561393738|
|     biased|0.6492396593093872|
|      flaws| 0.645483672618866|
| researcher|0.6445693373680115|
| unreliable| 0.636561930179596|
|     pauses|0.6361752152442932|
|     abuses|0.6321342587471008|
+-----------+------------------+



In [46]:
#chunk = webhose_desc.filter(lambda r: r[1]>=0 and r[1]<1000).collect()
chunk = crunchbase_desc.take(20000)
#chunk = webhose_desc.collect()

In [47]:
SEARCH_QUERY = "Facial recognition technology violates privacy"

In [48]:
query_df  = sc.parallelize([(1,SEARCH_QUERY)]).toDF(['index','text'])
query_tok = regexTokenizer.transform(query_df)
query_swr = swr.transform(query_tok)
query_swr.show()
query_vec = model.transform(query_swr)
query_vec = query_vec.select('wordvectors').collect()[0][0]
#query_vec

+-----+--------------------+--------------------+--------------------+
|index|                text|              tokens|   tokens_sw_removed|
+-----+--------------------+--------------------+--------------------+
|    1|Facial recognitio...|[facial, recognit...|[facial, recognit...|
+-----+--------------------+--------------------+--------------------+



In [49]:
import numpy as np
sim_rdd = sc.parallelize((i[0], i[1], float(cossim(query_vec, i[2]))) for i in chunk)
sim_df  = sqlContext.createDataFrame(sim_rdd).\
                   withColumnRenamed('_1', 'uuid').\
                   withColumnRenamed('_2', 'title').\
                   withColumnRenamed('_3', 'similarity').\
                   orderBy("similarity", ascending = False)
sim_df.show(20, truncate = False)

+----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|uuid                                    |title                                                                                                                                                    |similarity        |
+----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|5269c3d61d7f7dc05bb4b1e37b383482179897b1|IBM to withdraw from the facial recognition market - CNET                                                                                                |0.8239450872461224|
|1356ba050408d1d5e8696880d7a9ab1213835d34|IBM to withdraw from the facial recognition market out of profiling fears                     

In [None]:
#End of Assignment 6 