In [1]:
import csv

# Keywords used to decide whether an article is technology-related.
# Matching is a case-insensitive substring test, so short entries such as
# "Tech" or "Data" also match longer words that contain them.
# NOTE: the former "End" sentinel entry was removed. It was compared against the
# text like every other keyword, so any row containing "end" (e.g. "Weekend")
# was wrongly written to the filtered file.
queries = [
    "A.I.",
    "Artificial Intelligence",
    "Augmented reality",
    "Automation",
    "Chatbot",
    "Data Science",
    "Deepfake",
    "GPT",
    "M.L.",
    "Machine Learning",
    "Natural Language Processing",
    "NLP",
    "Virtual Reality",
    "Digital",
    "Virtual",
    "Technology",
    "Data",
    "Tech",
    "Computers",
    "Computer",
    "Social Media",
    "Robot",
    "Metaverse",
    "Facial Recognition",
]

# Columns of data.csv that are searched for keyword matches.
search_fields = ('Headline', 'Snippet', 'URL')

# Lower-case the keywords once instead of once per row and per column.
keywords = [query.lower() for query in queries]

# Split data.csv into rows that mention at least one keyword (datafilter.csv)
# and rows that mention none (datanfilter.csv). newline='' on every handle is
# what the csv module expects, so quoted fields with embedded newlines survive.
with open('data.csv', 'r', newline='', encoding='utf-8') as infile, \
        open('datafilter.csv', 'w', newline='', encoding='utf-8') as dataa_file, \
        open('datanfilter.csv', 'w', newline='', encoding='utf-8') as datan_file:
    reader = csv.DictReader(infile)

    # Both outputs keep exactly the columns of the input file.
    fieldnames = reader.fieldnames
    dataa_writer = csv.DictWriter(dataa_file, fieldnames=fieldnames)
    datan_writer = csv.DictWriter(datan_file, fieldnames=fieldnames)

    # Write headers to the output files
    dataa_writer.writeheader()
    datan_writer.writeheader()

    for row in reader:
        # DictReader fills short rows with None; treat that as empty text.
        # All three columns (URL included) are compared case-insensitively.
        texts = [(row[field] or '').lower() for field in search_fields]
        matched = any(keyword in text for keyword in keywords for text in texts)

        # Every row lands in exactly one of the two output files.
        if matched:
            dataa_writer.writerow(row)
        else:
            datan_writer.writerow(row)

We tried several methods of filtering the articles so that the dataset stays as focused on the topic as possible. One approach is to check whether any technology-related word appears in the title (or in the snippet and URL) and to use that as a measure of relevance. However, we found that there is no reliable way to do this, since every variant produces both false positives and false negatives.

"Gang Rape Defendants at Risk in Tihar Jail, Lawyers and Family Say" includes M.L. in context of the Indian political positions and not 'Machine Learning'.

"Ray Kurzweil Says We’re Going to Live Forever" doesn't explicitly mention any A.I. or similar words in the title but still discusses rather important themes of how A.I. enables us to document ourselves eternally.

We also notice that there are more false negatives than false positives: multiple relevant titles contain none of the keywords, for example:
- "Confronting the Fact of Fiction and the Fiction of Fact"
- "At Google Conference, Cameras Even in the Bathroom"
- "Yes, Economics Is a Science"

Therefore, we choose to not put any filter on the basis of content.

Further, there is no reliable way to identify A.I.-related articles without performing advanced contextual analysis.

### Adding Sentiment for each Entry - **Roberta's Pretrained Model**

In [2]:
import pandas as pd

# Load the complete, unfiltered article table; sentiment is scored on every row.
file_path = 'data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Query,Headline,Publication Date,Snippet,URL
0,Artificial Intelligence,Today’s Scuttlebot: Tech Design Comics and Art...,2013-01-04 09:15:58+00:00,The technology reporters and editors of The Ne...,https://bits.blogs.nytimes.com/2013/01/04/toda...
1,Artificial Intelligence,A Motherboard Walks Into a Bar ...,2013-01-04 21:02:48+00:00,Researchers are teaching computers to be funny...,https://www.nytimes.com/2013/01/06/opinion/sun...
2,Virtual Reality,"A Bull in Stocks, but a Bear for Free Speech",2013-01-08 16:43:12+00:00,Expectations among investors and others are hi...,https://dealbook.nytimes.com/2013/01/08/a-bull...
3,Virtual Reality,"Taking in Paris Any Day, Any Century",2013-01-09 22:25:45+00:00,"“Paris 3D” lets viewers see an interactive, 3-...",https://www.nytimes.com/2013/01/10/arts/design...
4,Virtual Reality,Some future gadgets I’d maybe buy (aka a reali...,2013-01-10 00:00:00+00:00,Eight Wirecutter writers and I convened at the...,https://www.nytimes.com/wirecutter/blog/some-f...


In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

# Hugging Face hub id of the pretrained sentiment checkpoint used below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the sequence-classification model and its matching tokenizer from the
# same checkpoint so the token ids line up with the model's vocabulary.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)



In [4]:
def classify_sentiment(text, max_length=512):
    """Return a single sentiment score for ``text`` from the pretrained model.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model``, and the three output logits are turned into
    probabilities with ``softmax``. The score is the positive probability
    minus the negative probability, so it lies between -1 and 1.

    Parameters
    ----------
    text : str
        The text to score.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer inputs are
        truncated instead of failing inside the model. The default of 512 is
        the usual limit for RoBERTa-base checkpoints -- confirm if ``MODEL``
        is changed.

    Returns
    -------
    numpy.floating
        ``P(positive) - P(negative)``.
    """
    # Local import: the notebook never imports torch directly, but
    # return_tensors='pt' already requires it to be installed.
    import torch

    encoded_text = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only: skip building the autograd graph for every row.
    with torch.no_grad():
        results = model(**encoded_text)

    # results[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(results[0][0].detach().numpy())

    # Index 0 is the negative class, 1 neutral, 2 positive.
    return scores[2] - scores[0]
    

def _sentiment_or_none(value):
    """Score ``value`` when it is a non-blank string; otherwise return None."""
    if isinstance(value, str) and value.strip():
        return classify_sentiment(value)
    return None


# Score the headline and the snippet of every article with the same rule.
for text_column, score_column in (('Headline', 'Sentiment_H'), ('Snippet', 'Sentiment_S')):
    df[score_column] = df[text_column].apply(_sentiment_or_none)

df.head()


Unnamed: 0,Query,Headline,Publication Date,Snippet,URL,Sentiment_H,Sentiment_S
0,Artificial Intelligence,Today’s Scuttlebot: Tech Design Comics and Art...,2013-01-04 09:15:58+00:00,The technology reporters and editors of The Ne...,https://bits.blogs.nytimes.com/2013/01/04/toda...,0.012738,0.31097
1,Artificial Intelligence,A Motherboard Walks Into a Bar ...,2013-01-04 21:02:48+00:00,Researchers are teaching computers to be funny...,https://www.nytimes.com/2013/01/06/opinion/sun...,0.010714,0.716826
2,Virtual Reality,"A Bull in Stocks, but a Bear for Free Speech",2013-01-08 16:43:12+00:00,Expectations among investors and others are hi...,https://dealbook.nytimes.com/2013/01/08/a-bull...,-0.223068,0.197209
3,Virtual Reality,"Taking in Paris Any Day, Any Century",2013-01-09 22:25:45+00:00,"“Paris 3D” lets viewers see an interactive, 3-...",https://www.nytimes.com/2013/01/10/arts/design...,0.13413,0.481339
4,Virtual Reality,Some future gadgets I’d maybe buy (aka a reali...,2013-01-10 00:00:00+00:00,Eight Wirecutter writers and I convened at the...,https://www.nytimes.com/wirecutter/blog/some-f...,0.605519,0.228054


When we use the RoBERTa pre-trained model, we are given three measures of sentiment: positive, neutral and negative. To obtain a single sentiment value for both the Headline and the Snippet, we combine these scores into one number: the positive probability minus the negative probability. This value lies between -1 and 1, and the neutral probability enters implicitly because the three probabilities sum to 1.

In [5]:
# Relative weight of the headline score and the snippet score in the combined value.
HEADLINE_WEIGHT = 0.2
SNIPPET_WEIGHT = 0.8

# One value per article that captures the general tone from all the information we have.
headline_part = HEADLINE_WEIGHT * df['Sentiment_H']
snippet_part = SNIPPET_WEIGHT * df['Sentiment_S']
df['Sentiment'] = headline_part + snippet_part

# Persist the scored table without the pandas index column.
df.to_csv('finaldata.csv', index=False)