## Script 6, Saloni Khandelwal
#### (Entity Identification)

### General Entities

In [None]:
import pandas as pd
import spacy
from collections import Counter

In [None]:
from pandarallel import pandarallel
import multiprocessing

# Detect how many logical CPUs this machine exposes so the pool can be sized to it.
num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# Leave three cores free for the rest of the system, but never ask for fewer than
# one worker: on machines with three or fewer CPUs `num_processors-3` would be
# zero or negative, which is not a usable worker count.
pandarallel.initialize(nb_workers=max(1, num_processors-3), use_memory_fs=False, progress_bar=True)

Available CPUs: 10
INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# Load the sentiment-filtered news articles from a parquet file; the path is
# relative to the notebook's working directory.
df = pd.read_parquet('sentiment_filtered_news.parquet')
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,important_words,tokens,topic,predicted_sentiment_yelp_new,sentiment_numeric
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,Children With Autism Saw Their Learning and So...,children autism saw learning social skills boo...,"[children, autism, saw, learning, social, skil...",1,n,-1
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...",The world entered a new era of accelerated tra...,world entered new era accelerated transformati...,"[world, entered, new, era, accelerated, transf...",0,p,1
3,http://www.homeoffice.consumerelectronicsnet.c...,2021-03-10,en,Strategy Analytics: 71% of Smartphones Sold Gl...,\n\nStrategy Analytics: 71% of Smartphones Sol...,AI is used in various functions inside smartph...,ai used various functions inside smartphones i...,"[ai, used, various, functions, inside, smartph...",0,p,1
4,http://www.itbusinessnet.com/2020/10/olympus-t...,2020-10-20,en,Olympus to Support Endoscopic AI Diagnosis Edu...,\n\nOlympus to Support Endoscopic AI Diagnosis...,"In collaboration with CYBERNET SYSTEMS CO,. LT...",collaboration cybernet systems co ltd olympus ...,"[collaboration, cybernet, systems, co, ltd, ol...",0,p,1


In [None]:
#pip install ipywidgets

In [None]:
# Loaded spaCy pipelines keyed by package name, so each model is read only once
# per process instead of once per call.
_NLP_CACHE = {}


def extract_entities(text, nlp_package_name='en_core_web_sm'):
    """Return the named entities spaCy finds in a single text.

    Note: a later cell in this notebook defines another ``extract_entities``
    that takes a batch of texts; whichever cell ran last is the one in effect.

    Args:
        text: The document text to analyse.
        nlp_package_name: Name of the spaCy pipeline package to load.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order.
    """
    # spacy.load is expensive; calling it for every row dominates the runtime
    # when this function is applied across a DataFrame, so reuse the pipeline.
    nlp = _NLP_CACHE.get(nlp_package_name)
    if nlp is None:
        nlp = spacy.load(nlp_package_name)
        _NLP_CACHE[nlp_package_name] = nlp
    doc = nlp(text)
    return [(entity.text, entity.label_) for entity in doc.ents]

In [None]:
# Re-initialise pandarallel with the same settings as the earlier setup cell.
# Clamp to at least one worker so a machine with three or fewer CPUs does not
# request a zero or negative pool size.
pandarallel.initialize(nb_workers=max(1, num_processors-3), use_memory_fs=False, progress_bar=True)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# Load the small English spaCy pipeline once as a notebook-level global; the
# batch extract_entities below and targeted_sentiment further down both read it.
nlp = spacy.load('en_core_web_sm')

def extract_entities(texts):
    """Extract named entities from a batch of texts with the global ``nlp`` pipeline.

    This redefines the single-text ``extract_entities`` from the earlier cell,
    so after this cell runs the name refers to the batch version.

    Args:
        texts: Iterable of document strings; it is passed straight to ``nlp.pipe``.

    Returns:
        One list per input text, in input order, each holding
        ``(entity_text, entity_label)`` tuples.
    """
    per_document = []
    # The parser and tagger components are switched off for this pass.
    for parsed in nlp.pipe(texts, disable=['parser', 'tagger']):
        found = []
        for span in parsed.ents:
            found.append((span.text, span.label_))
        per_document.append(found)
    return per_document

In [None]:
from tqdm import tqdm

In [None]:
batch_size = 32  # Adjust this value to your needs.
entities = []
# Walk the frame in consecutive slices of `batch_size` rows and append each
# slice's per-article entity lists, so `entities` ends up in row order.
for i in tqdm(range(0, len(df), batch_size)):
    entities.extend(
        extract_entities(df['cleaned_text'].iloc[i:i + batch_size])
    )

100%|█████████████████████████████████████| 4279/4279 [2:50:39<00:00,  2.39s/it]


In [None]:
# Attach the per-article entity lists as a new column. This relies on `entities`
# holding exactly one list per row of `df`, in the same order as the rows.
df['entities'] = entities

In [None]:
# For every article keep only the text of the entities labelled ORG.
df['organization'] = df['entities'].apply(
    lambda pairs: [name for name, label in pairs if label == 'ORG']
)

In [None]:
# One row per organisation mention, then count mentions per organisation name.
org_mentions = df.explode('organization')['organization']
top_org_entities = org_mentions.value_counts().nlargest(10)

print("Top 10 Most Frequent Organization Entities:")
for entity, count in top_org_entities.items():
    print(f"Entity: {entity}\tCount: {count}")

Top 10 Most Frequent Organization Entities:
Entity: AI	Count: 436912
Entity: Google	Count: 57425
Entity: Microsoft	Count: 48394
Entity: COVID-19	Count: 18911
Entity: Artificial Intelligence	Count: 17917
Entity: ML	Count: 13632
Entity: Amazon	Count: 12921
Entity: Bard	Count: 12238
Entity: IBM	Count: 11213
Entity: Bing	Count: 9600


In [None]:
# For every article keep only the text of the entities labelled PRODUCT.
df['products'] = df['entities'].apply(
    lambda pairs: [name for name, label in pairs if label == 'PRODUCT']
)

In [None]:
# One row per product mention, then keep the 20 most frequently mentioned names.
top_product_entities = df.explode('products')['products'].value_counts().nlargest(20)

# The heading used to say "Top 10" while 20 rows were printed; derive the number
# from the result so the heading always matches what is shown.
print(f"Top {len(top_product_entities)} Most Frequent Product Entities:")
for entity, count in top_product_entities.items():
    print(f"Product: {entity}\tCount: {count}")

Top 10 Most Frequent Product Entities:
Product: Twitter	Count: 7464
Product: JavaScript	Count: 2072
Product: DeepMind	Count: 1705
Product: Excel	Count: 1299
Product: CRM	Count: 1208
Product: Coinbase	Count: 1056
Product: Cancel	Count: 1053
Product: A100	Count: 784
Product: Discovery	Count: 737
Product: TensorFlow	Count: 634
Product: Fortune 500	Count: 604
Product: C3.ai	Count: 530
Product: H100	Count: 472
Product: K-12	Count: 460
Product: LTC	Count: 401
Product: Learn	Count: 380
Product: COVID-19	Count: 356
Product: StarFilled	Count: 350
Product: Checked	Count: 345
Product: Waymo	Count: 335


In [None]:
# For every article keep only the text of the entities labelled GPE.
df['GPE'] = df['entities'].apply(
    lambda pairs: [name for name, label in pairs if label == 'GPE']
)

In [None]:
# One row per GPE mention, then keep the 20 most frequently mentioned names.
top_GPE_entities = df.explode('GPE')['GPE'].value_counts().nlargest(20)

# The heading used to say "Top 10" while 20 rows were printed, and each row was
# labelled "Product:" although these are GPE entities; both are corrected here.
print(f"Top {len(top_GPE_entities)} Most Frequent GPE Entities:")
for entity, count in top_GPE_entities.items():
    print(f"GPE: {entity}\tCount: {count}")

Top 10 Most Frequent GPE Entities:
Product: OpenAI	Count: 39985
Product: China	Count: 24563
Product: US	Count: 23500
Product: U.S.	Count: 20388
Product: India	Count: 19850
Product: UK	Count: 11105
Product: Canada	Count: 8461
Product: Japan	Count: 8143
Product: Us	Count: 7181
Product: Russia	Count: 6732
Product: the United States	Count: 6674
Product: AI	Count: 6384
Product: Germany	Count: 6324
Product: New York	Count: 5785
Product: France	Count: 5573
Product: Italy	Count: 5283
Product: Australia	Count: 5121
Product: LinkedIn	Count: 4952
Product: California	Count: 4372
Product: San Francisco	Count: 4182


In [None]:
from collections import Counter

In [None]:
# Flatten the per-article entity lists into one long list of (text, label) pairs.
all_entities = []
for article_entities in df['entities']:
    all_entities.extend(article_entities)

In [None]:
# Count how often each distinct (text, label) pair occurs across all articles.
entity_counts = Counter(all_entities)
# Keep the 20 most frequent pairs as (pair, count) tuples, most frequent first.
top_entities = entity_counts.most_common(20)

In [None]:
# Print each of the most common entities; `entity` is a (text, label) tuple here,
# so it is rendered in its tuple form.
for entity, count in top_entities:
    print(f'Entity: {entity}\tCount: {count}')

Entity: ('AI', 'ORG')	Count: 436912
Entity: ('Google', 'ORG')	Count: 57425
Entity: ('first', 'ORDINAL')	Count: 52182
Entity: ('Microsoft', 'ORG')	Count: 48394
Entity: ('one', 'CARDINAL')	Count: 47006
Entity: ('OpenAI', 'GPE')	Count: 39985
Entity: ('two', 'CARDINAL')	Count: 26404
Entity: ('today', 'DATE')	Count: 26197
Entity: ('China', 'GPE')	Count: 24563
Entity: ('US', 'GPE')	Count: 23500
Entity: ('U.S.', 'GPE')	Count: 20388
Entity: ('India', 'GPE')	Count: 19850
Entity: ('2020', 'DATE')	Count: 19474
Entity: ('COVID-19', 'ORG')	Count: 18911
Entity: ('2021', 'DATE')	Count: 18800
Entity: ('One', 'CARDINAL')	Count: 18048
Entity: ('Artificial Intelligence', 'ORG')	Count: 17917
Entity: ('Europe', 'LOC')	Count: 16320
Entity: ('three', 'CARDINAL')	Count: 14699
Entity: ('ML', 'ORG')	Count: 13632


In [None]:
# Group the articles by predicted sentiment label.
# NOTE(review): `sentiment_groups` is not used anywhere in the visible part of
# this notebook — confirm whether a later section needs it.
sentiment_groups = df.groupby('predicted_sentiment_yelp_new')

In [None]:
# Persist the frame, including the entity columns added above, so the entity
# extraction pass does not have to be repeated.
df.to_parquet('entity_nlp.parquet')

### Targeted (entity) Sentiment Identification (Start)

In [None]:
def targeted_sentiment(text, target_entity):
    """Return the polarity attached to the first entity matching ``target_entity``.

    The text is run through the global ``nlp`` pipeline and its entities are
    compared to ``target_entity`` case-insensitively.

    NOTE(review): this reads the custom ``._.polarity`` extension on entity
    spans. Nothing in the visible part of the notebook registers that
    extension, so it presumably comes from an extra pipeline component —
    TODO confirm it is added before this function is called.

    Args:
        text: Document text to analyse.
        target_entity: Entity text to look for.

    Returns:
        The ``polarity`` value of the first matching entity, or ``None`` when
        no entity in the text matches.
    """
    matching_polarities = (
        span._.polarity
        for span in nlp(text).ents
        if span.text.lower() == target_entity.lower()
    )
    # Only the first match is consumed, mirroring a loop that stops at the first hit.
    return next(matching_polarities, None)

In [None]:
# Preview the frame now that the entities / organization / products / GPE
# columns have been added.
df.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,important_words,tokens,topic,predicted_sentiment_yelp_new,sentiment_numeric,entities,organization,products,GPE
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"[(ETC, ORG), (July 1, 2019, DATE), (ETC, ORG),...","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C..."
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,Children With Autism Saw Their Learning and So...,children autism saw learning social skills boo...,"[children, autism, saw, learning, social, skil...",1,n,-1,"[(Thursday, February 27, 2020, DATE), (seven, ...","[Kiwi, Kiwi, Kiwi, Science Robotics, Kiwi, Kiw...",[Cancel],"[U.S., Matarić]"
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...",The world entered a new era of accelerated tra...,world entered new era accelerated transformati...,"[world, entered, new, era, accelerated, transf...",0,p,1,"[(the last eighteen months, DATE), (years, DAT...","[the Brazilian Embassy, PowerPoint, ML, AI, Ir...",[],"[London, Lego, Mexico, US]"
3,http://www.homeoffice.consumerelectronicsnet.c...,2021-03-10,en,Strategy Analytics: 71% of Smartphones Sold Gl...,\n\nStrategy Analytics: 71% of Smartphones Sol...,AI is used in various functions inside smartph...,ai used various functions inside smartphones i...,"[ai, used, various, functions, inside, smartph...",0,p,1,"[(AI, ORG), (AI, ORG), (AI, ORG), (Edge AI, OR...","[AI, AI, AI, Edge AI, AI, AI, AI, AI, AI, AI, ...",[],[]
4,http://www.itbusinessnet.com/2020/10/olympus-t...,2020-10-20,en,Olympus to Support Endoscopic AI Diagnosis Edu...,\n\nOlympus to Support Endoscopic AI Diagnosis...,"In collaboration with CYBERNET SYSTEMS CO,. LT...",collaboration cybernet systems co ltd olympus ...,"[collaboration, cybernet, systems, co, ltd, ol...",0,p,1,"[(CYBERNET SYSTEMS CO, ORG), (Olympus, ORG), (...","[CYBERNET SYSTEMS CO, Olympus, AI, the Asian I...",[],"[Hyderabad, India, India, Japan, India, India,..."


In [None]:
# Produce one row per (article, entity) pair. `entities` already holds a list of
# (text, label) tuples, so it can be exploded directly. The former
# `.assign(Entity=df['entities'].str.split(';'))` step applied a string method to
# list values, which only added an all-NaN `Entity` column, so it is dropped.
# The original article index is kept, so index values repeat once per entity.
df_exploded = df.explode('entities')

In [None]:
# Display the exploded frame (one row per article/entity pair).
df_exploded

Unnamed: 0,url,date,language,title,text,cleaned_text,important_words,tokens,topic,predicted_sentiment_yelp_new,sentiment_numeric,entities,organization,products,GPE,Entity
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(ETC, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(July 1, 2019, DATE)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(ETC, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(90 percent, PERCENT)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(AI, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200331,https://yourstory.com/2021/05/startups-covid-1...,2021-05-06,en,Startups fight COVID-19: Slang Labs is using v...,Startups fight COVID-19: Slang Labs is using v...,0 CLAPS 201419 false true 0 0 The massive rise...,0 claps 201419 false true 0 0 massive rise cov...,"[0, claps, 201419, false, true, 0, 0, massive,...",0,p,1,"(150 0/150, CARDINAL)","[Mavericks, Co-Founder, YourStory, ICU, Voice,...",[YourStory Search],"[Bengaluru, India]",
200331,https://yourstory.com/2021/05/startups-covid-1...,2021-05-06,en,Startups fight COVID-19: Slang Labs is using v...,Startups fight COVID-19: Slang Labs is using v...,0 CLAPS 201419 false true 0 0 The massive rise...,0 claps 201419 false true 0 0 massive rise cov...,"[0, claps, 201419, false, true, 0, 0, massive,...",0,p,1,"(1000, CARDINAL)","[Mavericks, Co-Founder, YourStory, ICU, Voice,...",[YourStory Search],"[Bengaluru, India]",
200331,https://yourstory.com/2021/05/startups-covid-1...,2021-05-06,en,Startups fight COVID-19: Slang Labs is using v...,Startups fight COVID-19: Slang Labs is using v...,0 CLAPS 201419 false true 0 0 The massive rise...,0 claps 201419 false true 0 0 massive rise cov...,"[0, claps, 201419, false, true, 0, 0, massive,...",0,p,1,"(Latest Updates, LOC)","[Mavericks, Co-Founder, YourStory, ICU, Voice,...",[YourStory Search],"[Bengaluru, India]",
200331,https://yourstory.com/2021/05/startups-covid-1...,2021-05-06,en,Startups fight COVID-19: Slang Labs is using v...,Startups fight COVID-19: Slang Labs is using v...,0 CLAPS 201419 false true 0 0 The massive rise...,0 claps 201419 false true 0 0 massive rise cov...,"[0, claps, 201419, false, true, 0, 0, massive,...",0,p,1,"(YourStory Search, PRODUCT)","[Mavericks, Co-Founder, YourStory, ICU, Voice,...",[YourStory Search],"[Bengaluru, India]",


In [None]:
# At this point each `entities` value is a (text, label) tuple (or NaN for an
# article with no entities), not a string. The previous regex `.str.extract`
# therefore matched nothing and filled both columns with NaN; the pattern would
# also have split entity names that contain commas, such as "July 1, 2019".
# Index into the tuple instead: position 0 is the entity text, 1 is the label.
# This must run while `entities` still holds tuples, i.e. before it is cast to str.
df_exploded['entity_name'] = df_exploded['entities'].str[0]
df_exploded['entity_type'] = df_exploded['entities'].str[1]
df_exploded.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,important_words,tokens,topic,predicted_sentiment_yelp_new,sentiment_numeric,entities,organization,products,GPE,Entity,entity_name,entity_type
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(ETC, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(July 1, 2019, DATE)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(ETC, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(90 percent, PERCENT)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(AI, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,


In [None]:
# Remove any leading/trailing single quotes from the two entity columns.
for quoted_column in ('entity_type', 'entity_name'):
    df_exploded[quoted_column] = df_exploded[quoted_column].str.strip("'")
df_exploded.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,important_words,tokens,topic,predicted_sentiment_yelp_new,sentiment_numeric,entities,organization,products,GPE,Entity,entity_name,entity_type
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(ETC, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(July 1, 2019, DATE)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(ETC, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(90 percent, PERCENT)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,Some urban areas of the city started to use ET...,urban areas city started use etc system roadsi...,"[urban, areas, city, started, use, etc, system...",0,p,1,"(AI, ORG)","[ETC, ETC, AI, ETC, Wang, ETC, AIpark, AI, AI,...",[],"[Chaoyang, Beijing, Beijing, Beijing, China, C...",,,


In [None]:
# Number of articles (rows) in the original frame.
len(df)

136927

In [None]:
# Number of rows after exploding to one row per article/entity pair.
len(df_exploded)

5813329

In [None]:
# Share of rows per sentiment label, as percentages. This is computed on the
# exploded frame, so each article contributes one row per extracted entity.
sentiment_shares = df_exploded['predicted_sentiment_yelp_new'].value_counts(normalize=True)
sentiment_distribution = sentiment_shares.mul(100)
print("Sentiment Distribution:", sentiment_distribution, sep="\n")

Sentiment Distribution:
predicted_sentiment_yelp_new
p      73.314929
n      15.110155
neu    11.574917
Name: proportion, dtype: float64


In [None]:
# Replace each `entities` value with its string form, overwriting the column in
# place. After this cell the column no longer holds (text, label) tuples, so any
# cell that indexes into the tuples must run before it.
# NOTE(review): missing values presumably become the literal string 'nan' — confirm.
df_exploded['entities'] = df_exploded['entities'].astype(str)

In [None]:
# Daily calendar spanning the earliest to the latest article date in `df`.
article_dates = df['date']
date_range = pd.date_range(
    start=article_dates.min(),
    end=article_dates.max(),
)
date_range

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10',
               ...
               '2023-04-19', '2023-04-20', '2023-04-21', '2023-04-22',
               '2023-04-23', '2023-04-24', '2023-04-25', '2023-04-26',
               '2023-04-27', '2023-04-28'],
              dtype='datetime64[ns]', length=1214, freq='D')