### Install Dependencies

In [None]:
!pip install newspaper3k
!pip install langchain
!pip install tiktoken

Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [None]:
import os
import pandas as pd
import numpy as np
import langchain
from langchain.memory import ConversationBufferMemory
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import regex as re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field, validator
from newspaper import Article
import datetime

### Get News Content

In [None]:
def remove_characters(text):
    """Clean scraped article text into a single line of plain prose.

    The steps run in this order:

    1. Newlines are replaced by spaces.
    2. URLs (``http://``, ``https://`` and ``www.`` forms) are removed.
    3. Parenthesised ``(...)`` and square-bracketed ``[...]`` spans are removed
       together with their contents.
    4. Every character that is not a word character, whitespace or one of
       ``" , . -`` is removed.
    5. Zero-width joiners and the fixed "AI generated summarization"
       disclaimer sentence are removed.

    Args:
        text: Raw article text (a ``str``).

    Returns:
        The cleaned text. Whitespace left behind by removed spans is not
        collapsed, so double spaces can remain.
    """
    text = text.replace("\n", " ")
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   #subject to change according to article
    # These two must run BEFORE the character filter below: the filter strips
    # the bracket characters themselves, after which these patterns can no
    # longer match and the enclosed text would be kept.
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'[^\w\s",.-]', '', text)
    text = text.replace('\u200d', "")
    # Literal replacement: the sentence contains '.' characters, which would
    # act as wildcards if it were used as a regular expression.
    text = text.replace("This is AI generated summarization, which may have errors. For context, always refer to the full article.", "")
    return text

In [None]:
def get_page_content(url):
    """Download the article at ``url`` and return its cleaned body text.

    The page is fetched and parsed with newspaper's ``Article``; the extracted
    text is then passed through ``remove_characters``.

    Args:
        url: Address of the article to fetch.

    Returns:
        The cleaned article text as returned by ``remove_characters``.
    """
    page = Article(url)
    page.download()
    page.parse()
    return remove_characters(page.text)

### Scraping

In [None]:
from langchain.docstore.document import Document

In [None]:
# Collects the Document objects built by the scraping cell, which appends to
# this list. Re-run this cell before re-running the scraping cell, otherwise
# the same claims are appended a second time.
doc_list = []

In [None]:
import requests

# Read the API key from the environment so a real credential never has to be
# saved in the notebook; the original placeholder is kept as the fallback.
api_key = os.environ.get('GOOGLE_FACTCHECK_API_KEY', 'your_cloud_api')
api_url = 'https://factchecktools.googleapis.com/v1alpha1/claims:search'

# Publisher sites to query. The value is sent as `reviewPublisherSiteFilter`,
# which takes the publisher's site as a bare domain (like "fullfact.org").
# A full URL with scheme and path matched nothing in the recorded run.
publisher_list = [
    "fullfact.org",
    "indiatoday.in",
    # Add more publisher domains here
]

num_claims_to_retrieve = 5500  # upper bound on claims requested per publisher
claims_upto_days = 10  # only retrieve claims that are at most this many days old
request_timeout_seconds = 30  # without a timeout, requests.get can block indefinitely

for publisher_site in publisher_list:
    params = {
        'key': api_key,
        'reviewPublisherSiteFilter': publisher_site,
        # Maximum number of claims in the response.
        # NOTE(review): leaving this out presumably falls back to the API's
        # default page size rather than returning every claim - confirm in the
        # API reference. Only the first page is read here; further pages
        # (nextPageToken) are not followed.
        'pageSize': num_claims_to_retrieve,
        'maxAgeDays': claims_upto_days,
        # 'language' : 'en'  # optional language filter
    }

    ccount = 0   # Documents actually added for this publisher
    skipped = 0  # claims that could not be turned into a Document

    try:
        response = requests.get(api_url, params=params, timeout=request_timeout_seconds)
    except requests.RequestException as e:
        # A network failure for one publisher should not abort the others.
        print(f"Error: request for {publisher_site} failed: {e}")
        continue

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        continue

    data = response.json()
    claims = data.get('claims', [])

    if not claims:
        print(f"No relevant claims found for {publisher_site}.")
        continue

    for claim in claims:
        # Only the first review of each claim is used. Guard against claims
        # without reviews or without publisher details instead of crashing.
        reviews = claim.get('claimReview') or []
        if not reviews:
            skipped += 1
            continue
        review = reviews[0]
        publisher_info = review.get('publisher') or {}

        claim_review_title = review.get('title')
        claim_url = review.get('url')
        publisher = publisher_info.get('name')
        dateofclaim = review.get('reviewDate')
        claim_status = review.get('textualRating')

        # Title and URL are both needed to build the page content.
        if claim_review_title is None or claim_url is None:
            skipped += 1
            continue

        try:
            # Add claim data to your document list (adjust based on your setup)
            doc_list.append(
                Document(
                    page_content=claim_review_title + get_page_content(claim_url) + "\n",
                    metadata={
                        "title": claim_review_title,
                        "source": claim_url,
                        "publisher": publisher,
                        "validity": claim_status,
                        # Note: this is the review date, stored under "publishDate".
                        "publishDate": dateofclaim
                    }
                )
            )
        except Exception as e:
            # Best effort: one page that fails to download or parse is skipped,
            # but reported instead of being silently dropped.
            skipped += 1
            print(f"Skipped {claim_url}: {e}")
            continue

        # Count only successful additions so the total matches len(doc_list).
        ccount += 1

    print(f"{ccount} claims added to source list for {publisher_site}")
    if skipped:
        print(f"{skipped} claims skipped for {publisher_site}")

print("Finished processing all publisher websites.")


17 claims added to source list for fullfact.org
No relevant claims found for https://www.indiatoday.in/fact-check.
Finished processing all publisher websites.


In [None]:
# Total number of Documents collected. Compare with the sum of the
# per-publisher counts printed by the scraping cell; a lower number here
# means some pages failed to download or parse.
len(doc_list)  # sanity check against the printed ccount values

17

In [None]:
# Splitter built via the tiktoken-based constructor: chunk size 2048, no
# overlap between consecutive chunks.
chunksplitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2048, chunk_overlap=0)

In [None]:
# Split every scraped Document into chunks using the splitter configured above.
chunk_splitted_docs = chunksplitter.split_documents(doc_list)

In [None]:
# Number of chunks produced by the splitter.
len(chunk_splitted_docs)

17

In [None]:
# Empty frame that lays out the columns of the claims table.
dat = pd.DataFrame(
    columns=[
        "title",
        "content",
        "source",
        "publisher",
        "validity",
        "publish_date",
    ]
)

In [None]:
# Display the frame to confirm its column layout.
dat

Unnamed: 0,title,content,source,publisher,validity,publish_date


In [None]:
# Spot check: print the title stored on the second chunk (index 1), if any.
for position, doc in enumerate(chunk_splitted_docs):
    if position == 1:
        print(doc.metadata["title"])

How has disposable income changed since 2010?


In [None]:
# Show the column labels of the frame.
dat.columns

Index(['title', 'content', 'source', 'publisher', 'validity', 'publish_date'], dtype='object')

In [None]:
# Column order of the exported table. Passing it explicitly to the DataFrame
# keeps the schema (and the CSV header) intact even when no documents were
# scraped; an empty list of records would otherwise give a frame with no columns.
dat_columns = ["title", "content", "source", "publisher", "validity", "publish_date"]

data_list = []

for doc in chunk_splitted_docs:
    # Replace missing metadata values with a readable placeholder. This is done
    # in place on the chunk's metadata, so the chunks themselves no longer
    # carry None values afterwards.
    for key, value in doc.metadata.items():
        if value is None:
            doc.metadata[key] = "Not Available"

    # One record per chunk; "publish_date" is read from the "publishDate" key.
    data_list.append(
        {
            "title": doc.metadata["title"],
            "content": doc.page_content,
            "source": doc.metadata["source"],
            "publisher": doc.metadata["publisher"],
            "validity": doc.metadata["validity"],
            "publish_date": doc.metadata["publishDate"],
        }
    )

# Build the frame once from all records instead of growing it row by row.
dat = pd.DataFrame(data_list, columns=dat_columns)

In [None]:
# Write the claims table to disk.
# NOTE(review): despite the .csv extension the file is tab-separated, and the
# row index is written as an unnamed first column (to_csv default), so it has
# to be read back with sep="\t" and index_col=0.
dat.to_csv("fullfactallclaimstest.csv", sep="\t", encoding="utf-8")