In [15]:
import sklearn
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics

from joblib import dump, load
from joblib import Parallel, delayed

In [16]:
#pip install pandarallel
import multiprocessing

# Report how many logical CPUs this machine exposes.
num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# Note: the from-import rebinds the name `pandarallel` from the module to the
# object of the same name that exposes `initialize`.
import pandarallel
from pandarallel import pandarallel

# Leave one CPU free for the notebook itself, but never request fewer than one
# worker: on a single-CPU host `num_processors - 1` would evaluate to 0.
pandarallel.initialize(nb_workers=max(1, num_processors - 1), use_memory_fs=False)

Available CPUs: 16
INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [17]:
# Google Cloud Storage client library.
from google.cloud import storage

# Name of the project bucket that holds the notebook's data files.
bucket_name = "nlp_final_project_kshitijm"

# Build a client and obtain a handle for the bucket by name.
# NOTE(review): `Client.bucket()` only constructs a local Bucket object; it
# does not appear to create the bucket or verify that it exists - confirm
# against the google-cloud-storage documentation.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
print(f"Bucket {bucket.name} connected.")

Bucket nlp_final_project_kshitijm connected.


### Importing a transformer from the Hugging Face Hub

In [4]:
import torch

In [18]:
# pip install transformers
from transformers import pipeline

2023-05-21 17:17:46.405635: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-21 17:18:00.244809: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-05-21 17:18:00.246291: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the classifier are loaded from the same checkpoint, so the
# identifier is written once and reused for both calls.
checkpoint = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [20]:
# Two hand-written sentences used as a quick sanity check of the model.
prompt = [
    "Chatgpt has made my life really good",
    "Openai stock stock is going down",
]

# Pad/truncate both sentences to 256 tokens and return PyTorch tensors.
token = tokenizer(
    prompt,
    padding='max_length',
    max_length=256,
    truncation=True,
    return_tensors='pt',
)

In [21]:
# Inference only: run the forward pass without recording an autograd graph,
# which is not needed here and only costs memory.
with torch.no_grad():
    output = model(**token)

In [22]:
# Display the raw (pre-softmax) scores: one row per prompt, one column per class.
output.logits

tensor([[-2.4181, -2.8373,  5.7378],
        [ 4.3566, -1.3070, -3.4122]], grad_fn=<AddmmBackward0>)

In [23]:
import torch.nn.functional as F

# Turn the logits into per-class probabilities. `dim=-1` normalises across the
# class axis of each row; calling softmax without `dim` relies on a deprecated
# implicit choice and emits a warning.
probs = F.softmax(output.logits, dim=-1)
# Detach from autograd (if attached) and convert to NumPy for display.
probs = probs.detach().numpy()
print(probs)

[[2.8689814e-04 1.8867313e-04 9.9952447e-01]
 [9.9612242e-01 3.4564650e-03 4.2109768e-04]]


  


-----
### Predicting sentiments for our dataframe

In [41]:
# Load the 2020 articles (with topic assignments) from the project bucket.
df_filt_2020 = pd.read_parquet(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2020_Topics.parquet'
)
df_filt_2020.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,article_source,clean_title,title_tokens,cleaned_text_tokens,year_pub,month,month-year,flag_relevant,num_tokens,rake_phrases_articles,rake_phrases_joined,final_topic
0,https://fusionscienceacademy.com/artificial-in...,2020-01-30,en,Artificial Intelligence (AI) in Social Media ...,\n\nArtificial Intelligence (AI) in Social Med...,Artificial Intelligence AI in Social MediaMar...,2026 – Fusion Science Academy,Artificial Intelligence (AI) in Social Media ...,"['artificial', 'intelligence', 'ai', 'social',...","['artificial', 'intelligence', 'ai', 'social',...",2020,1,Jan 2020,1,22283,[burkert fluid control systems emerson electri...,burkert fluid control systems emerson electric...,6
1,https://health.economictimes.indiatimes.com/ne...,2020-01-10,en,artificial intelligence: Researchers develop A...,\n\nartificial intelligence: Researchers devel...,artificial intelligence: Researchers develop ...,,artificial intelligence: Researchers develop A...,"['artificial', 'intelligence', 'researchers', ...","['artificial', 'intelligence', 'researchers', ...",2020,1,Jan 2020,1,8087,[economic times ethealthworldhome news hospita...,economic times ethealthworldhome news hospital...,1
2,https://heraldpublicist.com/bet-gil-on-ai-fina...,2020-01-15,en,Bet Gil on AI Final Fantasy Tactics Matches in...,\n\nBet Gil on AI Final Fantasy Tactics Matche...,Bet Gil on AI Final Fantasy Tactics Matches i...,Herald Publicist,Bet Gil on AI Final Fantasy Tactics Matches in...,"['bet', 'gil', 'ai', 'final', 'fantasy', 'tact...","['bet', 'gil', 'ai', 'final', 'fantasy', 'tact...",2020,1,Jan 2020,1,4458,[hilarious twitch streamnewstechnologycricketp...,hilarious twitch streamnewstechnologycricketpo...,1
3,https://honestversion.com/2020/01/24/growth-of...,2020-01-24,en,Growth of Cloud Telecommunication AI market in...,\n\nGrowth of Cloud Telecommunication AI marke...,Growth of Cloud AI market in global industry...,Honest Version,Growth of Cloud Telecommunication AI market in...,"['growth', 'cloud', 'ai', 'market', 'global', ...","['growth', 'cloud', 'ai', 'market', 'global', ...",2020,1,Jan 2020,1,6048,[cloud telecommunication ai market strategic a...,cloud telecommunication ai market strategic as...,6
4,https://marketresearchsheets.com/2020/01/31/gl...,2020-01-31,en,Global Artificial Intelligence as a Service Ma...,\n\nGlobal Artificial Intelligence as a Servic...,Global Artificial Intelligence as a Service M...,Market Research Sheets,Global Artificial Intelligence as a Service Ma...,"['global', 'artificial', 'intelligence', 'serv...","['global', 'artificial', 'intelligence', 'serv...",2020,1,Jan 2020,1,10098,[service market trendsyou may also like news g...,service market trendsyou may also like news gl...,6


In [24]:
from tqdm import tqdm

def classify_sentiment(text, max_length=512):
    """Score one text with the sentiment model and return class probabilities.

    Relies on the module-level ``tokenizer``, ``model`` and ``F``
    (``torch.nn.functional``) defined in earlier cells.

    Parameters
    ----------
    text : str
        Text to classify. It is padded/truncated to ``max_length`` tokens, so
        anything beyond that length is ignored by the model.
    max_length : int, default 512
        Token length passed to the tokenizer for padding and truncation.

    Returns
    -------
    numpy.ndarray
        Softmax probabilities including the leading batch axis; callers in
        this notebook read the per-class values via ``result[0]``.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )
    # Inference only: no autograd graph is needed, which saves memory on every
    # call of this per-row function.
    with torch.no_grad():
        logits = model(**encoded).logits
        # Explicit class axis; omitting `dim` is deprecated and warns per call.
        probs = F.softmax(logits, dim=-1)
    return probs.numpy()


In [84]:
# Apply the classification to a 100-row sample with a progress tracker.
tqdm.pandas()  # registers `progress_apply` on pandas objects

# Draw the sample from the full 2020 frame. The previous form re-sampled
# `df_filt_2020_samp` from itself, which fails on a fresh kernel because that
# name is not defined anywhere earlier. A fixed seed keeps the sample
# reproducible across runs.
df_filt_2020_samp = df_filt_2020.sample(100, random_state=42)
df_filt_2020_samp['sentiment_text'] = df_filt_2020_samp['cleaned_text'].progress_apply(classify_sentiment)


  
100%|██████████| 100/100 [00:15<00:00,  6.56it/s]


In [89]:
# Each stored result carries a leading batch axis, so take row 0 and pick the
# index of its largest probability as the predicted label.
df_filt_2020_samp['sentiment_label'] = df_filt_2020_samp['sentiment_text'].apply(
    lambda batch_probs: np.argmax(batch_probs[0])
)
df_filt_2020_samp['sentiment_label'].value_counts()

1    71
2    24
0     5
Name: sentiment_label, dtype: int64

### Running on 2020 dataset

In [94]:
# Score every 2020 article; `progress_apply` shows a tqdm bar while it runs.
df_filt_2020['sent_probs'] = df_filt_2020['cleaned_text'].progress_apply(
    classify_sentiment
)
# Predicted label = index of the largest probability in the single batch row.
df_filt_2020['sent_label'] = df_filt_2020['sent_probs'].progress_apply(
    lambda batch_probs: np.argmax(batch_probs[0])
)

  
100%|██████████| 32667/32667 [1:22:46<00:00,  6.58it/s]
100%|██████████| 32667/32667 [00:00<00:00, 294287.02it/s]


In [96]:
# Distribution of predicted labels for 2020.
# NOTE(review): judging by the two demo prompts earlier in the notebook, index 0
# looks like "negative" and index 2 like "positive" - confirm via model.config.id2label.
df_filt_2020['sent_label'].value_counts()

1    22548
2     9429
0      690
Name: sent_label, dtype: int64

In [98]:
%%time
# Persist the scored 2020 frame to the bucket. The index is written too (pandas
# default), which shows up as an extra unnamed column when the file is read
# back, and the probability arrays are stored in their text form.
df_filt_2020.to_csv(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2020_Topics_Sentiments.csv'
)

CPU times: user 57.5 s, sys: 392 ms, total: 57.9 s
Wall time: 1min 22s


In [11]:
### Reading the dataset
# Reload the saved 2020 results from the bucket.
# NOTE(review): the explicit '\n' line terminator is presumably there so that
# stray '\r' characters inside article text are not treated as row breaks - confirm.
df_filt_2020_sent = pd.read_csv(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2020_Topics_Sentiments.csv',
    lineterminator='\n',
)
df_filt_2020_sent.head(2)

Unnamed: 0.1,Unnamed: 0,url,date,language,title,text,cleaned_text,article_source,clean_title,title_tokens,...,year_pub,month,month-year,flag_relevant,num_tokens,rake_phrases_articles,rake_phrases_joined,final_topic,sent_probs,sent_label
0,0,https://fusionscienceacademy.com/artificial-in...,2020-01-30,en,Artificial Intelligence (AI) in Social Media ...,\n\nArtificial Intelligence (AI) in Social Med...,Artificial Intelligence AI in Social MediaMar...,2026 – Fusion Science Academy,Artificial Intelligence (AI) in Social Media ...,"['artificial', 'intelligence', 'ai', 'social',...",...,2020,1,Jan 2020,1,22283,['burkert fluid control systems emerson electr...,burkert fluid control systems emerson electric...,6,[[2.1574432e-04 9.9863547e-01 1.1488239e-03]],1
1,1,https://health.economictimes.indiatimes.com/ne...,2020-01-10,en,artificial intelligence: Researchers develop A...,\n\nartificial intelligence: Researchers devel...,artificial intelligence: Researchers develop ...,,artificial intelligence: Researchers develop A...,"['artificial', 'intelligence', 'researchers', ...",...,2020,1,Jan 2020,1,8087,['economic times ethealthworldhome news hospit...,economic times ethealthworldhome news hospital...,1,[[1.9508268e-04 8.0524624e-04 9.9899966e-01]],2


### Re-Running on all samples

----
2021

In [13]:
# Register `progress_apply` so the long-running scoring shows a progress bar.
tqdm.pandas()

# Load the 2021 articles from the project bucket.
df_filt_2021 = pd.read_parquet(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2021_Topics.parquet'
)

# Score every article, then take the index of the largest probability in the
# single batch row as the predicted label.
df_filt_2021['sent_probs'] = df_filt_2021['cleaned_text'].progress_apply(
    classify_sentiment
)
df_filt_2021['sent_label'] = df_filt_2021['sent_probs'].progress_apply(
    lambda batch_probs: np.argmax(batch_probs[0])
)


  
100%|██████████| 42868/42868 [1:50:02<00:00,  6.49it/s]  
100%|██████████| 42868/42868 [00:00<00:00, 300952.10it/s]


In [14]:
%%time
# Persist the scored 2021 frame to the bucket. The index is written as the
# first column (pandas default) and array-valued cells are stored as text.
df_filt_2021.to_csv(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2021_Topics_Sentiments.csv'
)

CPU times: user 1min 15s, sys: 528 ms, total: 1min 15s
Wall time: 1min 51s


In [33]:
# Distribution of predicted labels for 2021 (label = argmax index of the class probabilities).
df_filt_2021['sent_label'].value_counts()

1    27175
2    14681
0     1012
Name: sent_label, dtype: int64

----
2022

In [27]:
%%time

# Register `progress_apply` so the long-running scoring shows a progress bar.
tqdm.pandas()

# Load the 2022 articles from the project bucket.
df_filt_2022 = pd.read_parquet(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2022_Topics.parquet'
)

# Score every article, then take the index of the largest probability in the
# single batch row as the predicted label.
df_filt_2022['sent_probs'] = df_filt_2022['cleaned_text'].progress_apply(
    classify_sentiment
)
df_filt_2022['sent_label'] = df_filt_2022['sent_probs'].progress_apply(
    lambda batch_probs: np.argmax(batch_probs[0])
)

  
100%|██████████| 54941/54941 [2:14:11<00:00,  6.82it/s]  
100%|██████████| 54941/54941 [00:00<00:00, 314783.65it/s]

CPU times: user 16h 47min 41s, sys: 10min 23s, total: 16h 58min 4s
Wall time: 2h 14min 31s





In [28]:
%%time
# Persist the scored 2022 frame to the bucket. The index is written as the
# first column (pandas default) and array-valued cells are stored as text.
df_filt_2022.to_csv(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2022_Topics_Sentiments.csv'
)

CPU times: user 1min 30s, sys: 632 ms, total: 1min 31s
Wall time: 2min 4s


In [32]:
# Distribution of predicted labels for 2022 (label = argmax index of the class probabilities).
df_filt_2022['sent_label'].value_counts()

1    31806
2    21917
0     1218
Name: sent_label, dtype: int64

------
2023

In [29]:
%%time

# Register `progress_apply` so the long-running scoring shows a progress bar.
tqdm.pandas()

# Load the 2023 articles from the project bucket.
df_filt_2023 = pd.read_parquet(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2023_Topics.parquet'
)

# Score every article, then take the index of the largest probability in the
# single batch row as the predicted label.
df_filt_2023['sent_probs'] = df_filt_2023['cleaned_text'].progress_apply(
    classify_sentiment
)
df_filt_2023['sent_label'] = df_filt_2023['sent_probs'].progress_apply(
    lambda batch_probs: np.argmax(batch_probs[0])
)

  
100%|██████████| 56339/56339 [2:34:36<00:00,  6.07it/s]  
100%|██████████| 56339/56339 [00:00<00:00, 310839.79it/s]

CPU times: user 18h 45min 33s, sys: 38min 39s, total: 19h 24min 12s
Wall time: 2h 34min 57s





In [30]:
%%time
# Persist the scored 2023 frame to the bucket. The index is written as the
# first column (pandas default) and array-valued cells are stored as text.
df_filt_2023.to_csv(
    'gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2023_Topics_Sentiments.csv'
)

CPU times: user 1min 46s, sys: 801 ms, total: 1min 47s
Wall time: 2min 21s


In [31]:
# Distribution of predicted labels for 2023 (label = argmax index of the class probabilities).
df_filt_2023['sent_label'].value_counts()

1    39723
2    14854
0     1762
Name: sent_label, dtype: int64

In [45]:
# Show full cell contents so long titles are not cut off. This changes the
# display option for the rest of the session.
pd.set_option('display.max_colwidth', None)

# Spot-check five random 2023 articles that were assigned label 0, showing the
# title next to the class probabilities.
df_filt_2023.loc[
    df_filt_2023['sent_label'] == 0,
    ['clean_title', 'sent_probs'],
].sample(5)

Unnamed: 0,clean_title,sent_probs
19164,Sexting chatbot ban points to looming battle over AI rules,"[[0.81558263, 0.18129273, 0.003124689]]"
10530,Can AI revive the handset market?,"[[0.9969887, 0.0023647698, 0.00064651243]]"
42106,Why Developers Are Turning to AI to Deliver Complex Projects,"[[0.9829107, 0.016453687, 0.0006356551]]"
21248,\r Naver to introduce search GPT in first half of year,"[[0.9225757, 0.07364595, 0.003778383]]"
15365,"Google's new AI bot made a gaffe because it was 'rushed' out to compete with ChatGPT, experts say","[[0.99728346, 0.0020851572, 0.00063134957]]"
