In [1]:
# Jason Galvan

In [2]:
pwd

'/Users/batman/Downloads'

In [3]:
import pandas as pd 
import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors
from operator import itemgetter, attrgetter

In [4]:
# Directory that holds the pre-trained word-vector files. The trailing slash is
# required: load_wordvec_model() concatenates this string directly with the file name.
model_path = '/Users/batman/Downloads/'

In [5]:
#LOADING WORD VECTOR MODEL

In [6]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load a word-vector file in word2vec format and return the loaded vectors.

    Args:
        modelName: Human-readable name, used only in the progress messages.
        modelFile: File name, appended to the module-level ``model_path``.
        flagBin: Forwarded as ``binary=`` to ``KeyedVectors.load_word2vec_format``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    banner_tail = modelName + ' model...'
    print('Loading ' + banner_tail)

    # model_path is expected to end with a path separator (plain concatenation).
    vectors_file = model_path + modelFile
    model = KeyedVectors.load_word2vec_format(vectors_file, binary=flagBin)

    print('Finished loading ' + banner_tail)
    return model

# Load the GoogleNews vectors file as a binary word2vec model (flagBin=True).
model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)
# Alternative vectors file, left disabled:
#model_word2vec = load_wordvec_model('Word2Vec', 'webhose_skipgram_300.bin', True)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [34]:
# BUILDING MY OWN TAXONOMY BASED ON LDA and MANUAL DATA EXPLORATION

In [35]:
# Two-level taxonomy: topic -> sub-topic label -> space-separated keyword string.
# classify_topics() prepends the topic name to each keyword string before scoring,
# and reports the sub-topic label (the inner key) as the classification result.
topic_taxonomy = {
    "Business Finance":
    {
        "Revenue":  "millions earnings income receipts ",
        "Profits":  "gains dividends interest",
        # Fixed typo: "markeplace" -> "marketplace". A misspelt keyword is not in
        # the model vocabulary and was silently dropped from the comparison.
        "Management": "global marketplace product employee "
    },
    "Data Protection":
    {
        "Network Security": "unauthorized access intrusion misuse denial",
        "Cloud Security":  "data applications services infrastructure",
        "Backup and Recovery":  "customer duplicate files folders cloud"
    },
    "Stock Portfolio":
    {
        "Shares":    "company quarter ratings news",
        "Dividends": "average research target report",
        "Price": "purchase ratio valued earnings"
    },
    "Facial Recognition Technology":
    {
        "Tech Companies":  "Amazon Microsoft Apple Google AWS",
        "Law Enforcement": "digital identification criminal software solution bias",
        "Tech Bias": "race gender ethnicity reliability map features prejudice population demographics"
    },
    "Cloud Services":
    {
        # NOTE(review): this label is misspelt ("Aritifical"). It is kept unchanged
        # because the label is emitted verbatim in the results table.
        "Aritifical Intelligence": "visual perception human intelligence application new",
        "Storage": "technology database data backup provider enterprise solution electronic capture",
        "Computing":  "center market customer data hardware software license"
    },
    "Artificial Intelligence Bias":
    {
        "Design": "development global software personal mitigating",
        "Machine Learning": "algorithm identity bad data type response introduction",
        "Biometrics": "characteristics human identification computer science identify measurements",
        "Covid-19": "virus global outbreak pandemic containment NYC detection",
    }
}

In [36]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity between the summed word vectors of two strings.

    Each input is split on whitespace; the vectors of all tokens found in
    ``vectors`` are summed, and the cosine similarity of the two sums is returned.

    Args:
        input1: First whitespace-separated string of terms.
        input2: Second whitespace-separated string of terms.
        vectors: Mapping-like object where ``vectors[token]`` yields a numeric
            vector and raises ``KeyError`` for unknown tokens. If it exposes a
            ``vector_size`` attribute, that dimension is used; otherwise 300.

    Returns:
        The cosine similarity as a float, or ``0.0`` when either input
        contributes no known tokens (the similarity is undefined in that case).
    """
    dim = getattr(vectors, 'vector_size', 300)

    term_vectors = []
    for term in (input1, input2):
        total = np.zeros(dim)
        for token in term.split():
            try:
                total += vectors[token]
            except KeyError:
                # Out-of-vocabulary token: contributes nothing to the sum.
                continue
        term_vectors.append(total)

    # An all-zero vector makes the cosine undefined (0/0); report "no similarity"
    # up front instead of letting NaN propagate to the caller.
    if not all(np.any(vec) for vec in term_vectors):
        return 0.0

    result = 1 - spatial.distance.cosine(term_vectors[0], term_vectors[1])
    # The original `result is 'nan'` compared identity against a string and could
    # never be true; test for NaN numerically instead.
    if np.isnan(result):
        return 0.0

    return float(result)

In [37]:
# Function checks whether the input words are present in the vocabulary for the model
def vocab_check(vectors, words):
    """Return the words that are present in the model vocabulary, in input order.

    Args:
        vectors: Word-vector model. Its vocabulary is read from ``key_to_index``
            when that attribute exists (gensim 4.x), otherwise from ``vocab``
            (gensim 3.x).
        words: Iterable of candidate word strings.

    Returns:
        A list of the whitespace-stripped words found in the vocabulary.
    """
    known = getattr(vectors, 'key_to_index', None)
    if known is None:
        # Older gensim releases expose the vocabulary as `.vocab`.
        known = vectors.vocab

    output = []
    for word in words:
        # Strip before the lookup so the token tested is the token returned.
        token = word.strip()
        if token in known:
            output.append(token)

    return output

In [38]:
# Function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    """Similarity between two strings according to a word-vector model.

    Both strings are split on whitespace, filtered to in-vocabulary words via
    ``vocab_check`` and de-duplicated; the two word sets are then compared with
    ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity``.

    Returns:
        The value returned by ``vectors.n_similarity``, or ``0`` when either
        string has no in-vocabulary words.
    """
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))

    # Similarity is undefined when one side has no known words; report 0
    # explicitly rather than relying on an exception from n_similarity.
    if not s1words or not s2words:
        return 0

    try:
        return vectors.n_similarity(s1words, s2words)
    except (KeyError, ZeroDivisionError):
        # Only a word missing from the model or an empty word set is treated as
        # "no similarity"; any other error is a real bug and should surface.
        return 0

In [39]:
#IMPORTING WEBHOSE DATASET FOR ANALYSIS

In [40]:
import json

In [41]:
# The file is read as one JSON document per line. Use a context manager so the
# handle is closed, stream the lines instead of materialising them first, and
# skip blank lines, which json.loads would reject.
with open('/Users/batman/Downloads/webhose_IBM.json', encoding='utf-8') as feed_file:
    newsfeeds_read = [json.loads(line) for line in feed_file if line.strip()]
print(len(newsfeeds_read))

3779


In [42]:
# Obtaining all Titles --isolating from remaining data
# Collect the 'title' field of every feed record, in file order.
total_list = [record['title'] for record in newsfeeds_read]

In [43]:
# Obtaining only Unique Titles amongst total listing 
# De-duplicate while keeping first-occurrence order. dict keys preserve insertion
# order, so this is linear, unlike a `not in list` test for every title.
input1 = list(dict.fromkeys(total_list))

In [44]:
# Number of unique titles kept after de-duplication.
len(input1)

2903

In [45]:
 #function takes an input string, runs similarity for each item in topic_taxonomy, sorts and returns top result
def classify_topics(input, vectors):
    """Pick the best-matching taxonomy sub-topic label for a piece of text.

    For every topic in ``topic_taxonomy``, each sub-topic is scored by comparing
    ``input`` with "<topic name> <keywords>" via ``calc_similarity``. The
    highest-scoring sub-topic of each topic is kept, and the best of those
    winners is returned.

    Args:
        input: Text to classify.
        vectors: Word-vector model forwarded to ``calc_similarity``.

    Returns:
        A list holding at most one ``(sub_topic_label, score)`` tuple.
    """
    by_score = operator.itemgetter(1)
    best_per_topic = {}

    for topic_name, subtopics in topic_taxonomy.items():
        scored = {}
        for label, keywords in subtopics.items():
            query = (topic_name + ' ' + keywords).strip()
            scored[label] = 0 + float(calc_similarity(input, query, vectors))

        # Highest-scoring sub-topic within this topic.
        ranked_subtopics = sorted(scored.items(), key=by_score, reverse=True)
        top_label, top_score = ranked_subtopics[0]
        best_per_topic[top_label] = top_score

    ranked_overall = sorted(best_per_topic.items(), key=by_score, reverse=True)
    return ranked_overall[:1]

In [46]:
#USING FOR LOOP TO CALL CLASSIFY_TOPICS FUNCTION AND RETURN RESULTS IN 3 LISTS--ALLOWING EASE OF MANIPULATION

In [47]:
# Classify every unique title and collect the results in three parallel lists
# (same index = same article).
score2 = []
topic2 = []
title2 = []
for title in input1:
    # classify_topics returns a one-element list of (label, score).
    best_topic, best_score = classify_topics(title, model_word2vec)[0]
    topic2.append(best_topic)
    score2.append(best_score)
    title2.append(title)

In [48]:
# Sanity check: number of topic labels collected by the classification loop.
len(topic2)

2903

In [49]:
# Sanity check: number of scores collected by the classification loop.
len(score2)

2903

In [50]:
# Sanity check: number of titles collected by the classification loop.
len(title2)

2903

In [51]:
#DF BRINGING 3 INDEPENDENT LISTS TOGETHER 

In [52]:
# One row per classified title; columns are built from the three parallel lists.
dfS = pd.DataFrame(dict(title=title2, score=score2, topic=topic2))

In [53]:
# Inspect the type of the DataFrame's column index.
type(dfS.columns)

pandas.core.indexes.base.Index

In [54]:
# Order by topic, then score (both descending), and keep the first 10 rows of
# each topic group, i.e. the 10 highest-scoring titles per topic.
dfFINAL = (
    dfS
    .sort_values(by=['topic', 'score'], ascending=[False, False])
    .groupby('topic')
    .head(10)
)

In [55]:
#DISPLAYING TOP 10 ARTICLES FOR EACH TOPIC IN TAXONOMY

In [61]:
#Displaying top 30 results for ease of inspection
dfFINAL.head(30)

Unnamed: 0,title,score,topic
258,Cloud NLP Market Growth Analysis to 2024: Appl...,0.828172,Tech Companies
38,AI in Fintech Market Production & Demand by 20...,0.764505,Tech Companies
1233,Global Analytics as a Service Market 2024 by G...,0.733912,Tech Companies
2098,Artificial Intelligence (AI) as a Service Mark...,0.716813,Tech Companies
1931,Global Cloud Computing Market Insights Report ...,0.701312,Tech Companies
807,Know How Cloud Computing Education Market is d...,0.696775,Tech Companies
2409,Announcing Google Cloud VMware Engine,0.69665,Tech Companies
2810,"Top cloud providers in 2020: AWS, Microsoft Az...",0.69405,Tech Companies
2538,2020 Forbes8 Digital Startup Accelerator Progr...,0.684576,Tech Companies
690,Slack and Amazon partner to take on Microsoft ...,0.677094,Tech Companies


In [62]:
#INSPECTING TOTAL DF OUTPUT 

In [63]:
dfFINAL

Unnamed: 0,title,score,topic
258,Cloud NLP Market Growth Analysis to 2024: Appl...,0.828172,Tech Companies
38,AI in Fintech Market Production & Demand by 20...,0.764505,Tech Companies
1233,Global Analytics as a Service Market 2024 by G...,0.733912,Tech Companies
2098,Artificial Intelligence (AI) as a Service Mark...,0.716813,Tech Companies
1931,Global Cloud Computing Market Insights Report ...,0.701312,Tech Companies
...,...,...,...
2210,Intellect Design launches iTurmeric FinCloud d...,0.557053,Aritifical Intelligence
395,IBM will no longer develop facial recognition ...,0.552058,Aritifical Intelligence
2123,Why is business intelligence important for an ...,0.551558,Aritifical Intelligence
326,IBM says it won’t offer facial recognition any...,0.541244,Aritifical Intelligence


In [None]:
#End of Assignment 9 !!!!!!!!!!!!