# **NB 6.2: Targeted Entity Sentiment Identification**

## Importing Libraries

In [17]:
import os
import requests
import re
import sys
import regex as re
import ast

In [4]:
import nltk as nltk
import nltk.corpus  
from nltk.text import Text

In [5]:
import pandas as pd
import numpy as np
import sklearn

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Multiprocessing: set up pandarallel so that `Series.parallel_apply` is available
# to the cells below.

#pip install pandarallel
import multiprocessing

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# The second import rebinds the name `pandarallel` from the package to the class
# that exposes `initialize`.
import pandarallel
from pandarallel import pandarallel

# Leave one core free for the main process, but never request fewer than one
# worker (cpu_count() == 1 would otherwise ask for 0 workers).
# use_memory_fs=False selects pipe-based data transfer between the main process
# and the workers; progress_bar=True shows per-worker progress.
pandarallel.initialize(nb_workers=max(1, num_processors - 1), use_memory_fs=False, progress_bar=True )

Available CPUs: 16
INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [7]:
# Imports the Google Cloud client library

from google.cloud import storage
# Instantiates a client (credentials and project are resolved by the library; not configured here)
storage_client = storage.Client()

# The name of the project's bucket
bucket_name = "nlp_final_project_kshitijm"

# Builds a handle for that bucket. Nothing is created here.
# NOTE(review): `Client.bucket()` is documented as not making an HTTP request, so the
# "connected" message below does not prove the bucket exists or is reachable — confirm if that matters.
bucket = storage_client.bucket(bucket_name)
print(f"Bucket {bucket.name} connected.")

Bucket nlp_final_project_kshitijm connected.


In [13]:
#Visuals

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('white')
# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'

import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm

In [14]:
# Task specific
from collections import Counter
from itertools import chain

## Importing Datasets

In [9]:
%%time
## Positive and Negative Sentiment Datasets
# Both CSVs are read directly from the project bucket via gs:// URLs.
# Later cells read the ENT_ORG / ENT_PROD / ENT_PER / ENT_NORP columns of both
# frames and the Bert_topics column of the negative one.
# lineterminator='\n' makes pandas treat only '\n' as the row separator
# (presumably because the article text can contain other line-break characters — TODO confirm).

df_filt_pos=pd.read_csv('gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data7_POS_BERTopics_Sentiments_NER.csv',lineterminator='\n')
df_filt_neg=pd.read_csv('gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data7_NEG_BERTopics_Sentiments_NER.csv',lineterminator='\n')

CPU times: user 40.4 s, sys: 6.09 s, total: 46.4 s
Wall time: 2min 9s


In [10]:
# Most prominent organisations across the entire dataset, one column per year.
# The file is read from the notebook's working directory; each cell holds the
# string form of an (entity, count) tuple, as the output below shows.
ORG_trend=pd.read_csv('NER_Trend_ORG.csv')
ORG_trend

Unnamed: 0,2020,2021,2022,2023
0,"('AI', 37756)","('AI', 50705)","('AI', 71025)","('ChatGPT', 155292)"
1,"('V19', 28787)","('V19', 21160)","('Gray Media Group', 54282)","('AI', 84010)"
2,"('Google', 20518)","('Gray Media Group', 19507)","('Station20022022 Gray Television', 23401)","('Microsoft', 53121)"
3,"('Microsoft', 14155)","('Google', 19109)","('Google', 19290)","('Google', 50085)"
4,"('IBM', 13379)","('Microsoft', 14261)","('ML', 12760)","('OpenAI', 29117)"
5,"('Artificial Intelligence', 12495)","('Artificial Intelligence', 13384)","('V19', 9213)","('Gray Media Group', 28657)"
6,"('GR', 11260)","('IBM', 13200)","('Microsoft', 7737)","('Bard', 18375)"
7,"('Facebook', 9600)","('Facebook', 10079)","('ChatGPT', 7456)","('Station20022023 Gray Television', 12870)"
8,"('Artificial Intelligence AI', 7383)","('GR', 9837)","('Facebook', 7436)","('Microsofts', 10510)"
9,"('Amazon', 6893)","('Station20022021 Gray Television', 7721)","('IBM', 7204)","('Googles', 10029)"


In [11]:
# Most prominent person entities across the entire dataset, one column per year
# (same layout as ORG_trend).
# NOTE(review): the file name 'PER_Trend_ORG.csv' looks inconsistent with
# 'NER_Trend_ORG.csv' — confirm this is the intended file.
PER_trend=pd.read_csv('PER_Trend_ORG.csv')
PER_trend

Unnamed: 0,2020,2021,2022,2023
0,"('Size', 3979)","('Size', 2994)","('Elon Musk', 2219)","('Bing', 10618)"
1,"('Trump', 3283)","('Biden', 1993)","('Musk', 2192)","('Musk', 8478)"
2,"('Instagram', 1360)","('Closefor', 1908)","('Lemoine', 2044)","('Elon Musk', 7255)"
3,"('Middle EastAfrica', 1277)","('Greta Van', 1880)","('Ai Weiwei', 1766)","('Biden', 4333)"
4,"('Biden', 1046)","('Instagram', 1469)","('Vectorspace AI', 1651)","('Trump', 4095)"
5,"('Richests Richests', 1016)","('Elon Musk', 1220)","('Biden', 1464)","('Sam Altman', 3417)"
6,"('Gebru', 886)","('Jim Thorpe', 1088)","('EnterpriseFor', 1184)","('Bard', 2638)"
7,"('Ai Weiwei', 857)","('Middle EastAfrica', 1087)","('Mark Zuckerberg', 1066)","('Sundar Pichai', 2584)"
8,"('Musk', 808)","('Su Tierra Tiempo', 1085)","('StartedIs Bitcoin', 1006)","('Pichai', 2359)"
9,"('Elon Musk', 779)","('Musk', 1080)","('CoFounder', 961)","('Ernie Bot', 2302)"


### We will try to extract entities from docs which have a positive sentiment

In [15]:
def _parse_entity_list(value):
    """Turn one ENT_* cell into a list of entities (str literal, list, or missing)."""
    if isinstance(value, str):
        # Cells read back from CSV hold the textual form of a Python list.
        return ast.literal_eval(value)
    if isinstance(value, (list, tuple, set)):
        # Already parsed (e.g. the function was run on this frame before).
        return value
    # Missing cell (None / NaN): no entities for this document.
    return []


def get_top_ents(df, top_n=20):
    """Return the most frequent ORG, PRODUCT, PERSON and NORP entities in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ENT_ORG``, ``ENT_PROD``, ``ENT_PER`` and
        ``ENT_NORP``. Each cell is either the string form of a Python list
        (as stored in the CSV) or an already-parsed list.
    top_n : int, default 20
        Number of entities to keep per entity type.

    Returns
    -------
    tuple of four lists
        ``(top_org, top_prod, top_per, top_norp)``; each list holds
        ``(entity, count)`` tuples ordered by decreasing count, as produced by
        ``collections.Counter.most_common``.

    Notes
    -----
    Side effect: the four ``ENT_*`` columns of ``df`` are replaced in place by
    their parsed values. Because already-parsed cells are passed through
    unchanged, the function can safely be re-run on the same frame.
    """
    top_entities = []
    for column in ('ENT_ORG', 'ENT_PROD', 'ENT_PER', 'ENT_NORP'):
        # Use pandarallel's parallel_apply when it has been initialised;
        # otherwise fall back to the plain (single-process) Series.apply.
        apply_fn = getattr(df[column], 'parallel_apply', df[column].apply)
        df[column] = apply_fn(_parse_entity_list)

        # Flatten the per-document lists and count every entity occurrence.
        entity_counts = Counter(chain.from_iterable(df[column]))
        top_entities.append(entity_counts.most_common(top_n))

    top_org, top_prod, top_per, top_norp = top_entities
    return top_org, top_prod, top_per, top_norp

In [None]:
%%time
# Most frequent ORG / PRODUCT / PERSON / NORP entities in the positive-sentiment documents.
# Side effect: get_top_ents also replaces the ENT_* columns of df_filt_pos with their parsed values.
top_20_org_pos, top_20_prod_pos, top_20_per_pos, top_20_norm_pos = get_top_ents(df_filt_pos)

In [19]:
# Side-by-side table of the most frequent entities per type in the positive-sentiment documents.
# Each list is wrapped in a Series so that entity types with fewer entries than the
# others are padded with NaN instead of making the DataFrame constructor raise.
# NOTE(review): the 'NORM' column holds the NORP entities; the label is kept as-is
# because later code may reference it.
POS_trend=pd.DataFrame({
    'ORGANIZATION': pd.Series(top_20_org_pos, dtype=object),
    'PRODUCT': pd.Series(top_20_prod_pos, dtype=object),
    'PERSON': pd.Series(top_20_per_pos, dtype=object),
    'NORM': pd.Series(top_20_norm_pos, dtype=object),
})
POS_trend

Unnamed: 0,ORGANIZATION,PRODUCT,PERSON,NORM
0,"(AI, 78573)","(AI, 185906)","(Size, 2140)","(Chinese, 5935)"
1,"(Gray Media Group, 60661)","(AIdriven, 5053)","(Bing, 1887)","(European, 3026)"
2,"(ChatGPT, 29275)","(UsMeet, 2600)","(Greta Van, 1600)","(Canadian, 2659)"
3,"(Google, 23770)","(HPC, 1718)","(Elon Musk, 1561)","(American, 2109)"
4,"(Microsoft, 22937)","(Cresta, 1472)","(Biden, 1411)","(British, 1650)"
5,"(V19, 18860)","(Bing, 1435)","(Trump, 1258)","(Americans, 1401)"
6,"(Station20022022 Gray Television, 14059)","(V19, 1430)","(Standigm, 1250)","(French, 1268)"
7,"(ML, 13751)","(SE, 1271)","(Closefor, 1228)","(German, 1229)"
8,"(GR, 12167)","(CRM, 1263)","(CoFounder, 1181)","(Lunit, 1080)"
9,"(IBM, 12077)","(Google Cloud, 1115)","(Laivly, 1117)","(Korean, 992)"


* POS: ChatGPT, Microsoft
* POS: Google + Alphabet - Bard
* POS: Coinbase, Ethereum

In [None]:
%%time
# Most frequent ORG / PRODUCT / PERSON / NORP entities in the negative-sentiment documents.
# Side effect: get_top_ents also replaces the ENT_* columns of df_filt_neg with their parsed values.
top_20_org_neg, top_20_prod_neg, top_20_per_neg, top_20_norm_neg = get_top_ents(df_filt_neg)

In [22]:
# Side-by-side table of the most frequent entities per type in the negative-sentiment documents.
# Each list is wrapped in a Series so that entity types with fewer entries than the
# others are padded with NaN instead of making the DataFrame constructor raise.
# NOTE(review): the 'NORM' column holds the NORP entities; the label is kept as-is
# because later code may reference it.
NEG_trend=pd.DataFrame({
    'ORGANIZATION': pd.Series(top_20_org_neg, dtype=object),
    'PRODUCT': pd.Series(top_20_prod_neg, dtype=object),
    'PERSON': pd.Series(top_20_per_neg, dtype=object),
    'NORM': pd.Series(top_20_norm_neg, dtype=object),
})
NEG_trend

Unnamed: 0,ORGANIZATION,PRODUCT,PERSON,NORM
0,"(ChatGPT, 5924)","(AI, 11450)","(Vectorspace AI, 992)","(Chinese, 1393)"
1,"(AI, 5642)","(Bing, 477)","(Bing, 473)","(European, 458)"
2,"(Google, 5628)","(SE, 353)","(Ernie Bot, 460)","(American, 385)"
3,"(Microsoft, 3025)","(SoundHound AI, 318)","(Elon Musk, 440)","(Canadians, 302)"
4,"(Bard, 1994)","(Coinbase, 231)","(Musk, 429)","(Americans, 268)"
5,"(Googles, 1882)","(XT, 129)","(Trump, 350)","(Canadian, 259)"
6,"(Matrix AI Network, 1701)","(Matrix AI Networks, 127)","(Biden, 331)","(British, 250)"
7,"(V19, 1307)","(Nebula AI, 123)","(Sundar Pichai, 301)","(Italian, 242)"
8,"(Lisk Machine Learning, 1248)","(AIdriven, 116)","(Vectorspace, 275)","(French, 198)"
9,"(Ethereum, 1197)","(JavaScript, 107)","(Putin, 207)","(German, 166)"


* NEG: Vectorspace AI - ML Based Blockchain
* NEG: Google + Alphabet - Bard
* NEG: Coinbase, Ethereum

In [24]:
# Number of distinct values in the Bert_topics column of the negative-sentiment
# frame; a missing value, if present, counts as one distinct value.
df_filt_neg['Bert_topics'].nunique(dropna=False)

121