In [36]:
!pip install tiktoken
import tiktoken
import torch
import numpy as np
import torch.nn as nn

from torch.nn import functional as F




In [37]:
torch.manual_seed(256)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

block_size        = 40      ## N tokens in sequence
batch_size        = 64
max_iters         = 6000
eval_interval     = 500
learning_rate     = 0.0003
eval_iters        = 300
vocab_size        = 65

## every id for a given token is embedded to vector of this size
n_embd            = 512
n_head            = 8         ## 8 attention heads
n_layer           = 6         ## 6 eoncoder layers
dropout           = 0.2


In [38]:
from google.colab import files
uploaded = files.upload()

with open('/content/Trained_newsroom_dataset.csv', 'r', encoding='utf-8') as f:
    text = f.read().split('.')



Saving Trained_newsroom_dataset.csv to Trained_newsroom_dataset (2).csv


In [39]:
# Initialize tiktoken encoder/decoder
tokenizer = tiktoken.get_encoding("cl100k_base")  # using the GPT-3 base encoding
encode = tokenizer.encode
decode = tokenizer.decode



In [40]:
text = ''

input_file2 = '/content/purdue_news_all_pages-2024.csv'

with open(input_file2, 'r', encoding='utf-8') as f:
    text = f.read()
print(f"Loaded text length: {len(text)}")
print(f"Sample content: {text[:500]}")
# Tokenize the text
data = torch.tensor(encode(text), dtype=torch.long)


Loaded text length: 79748
Sample content: title,link,date
Purdue scientist expecting new world to reveal itself to Mars rover,https://www.purdue.edu/newsroom/2024/Q4/purdue-scientist-expecting-new-world-to-reveal-itself-to-mars-rover,"December 3, 2024"
Participants in the Purdue Ukrainian Scholars Initiative to be featured in annual panel discussion,https://www.purdue.edu/newsroom/2024/Q4/participants-in-the-purdue-ukrainian-scholars-initiative-to-be-featured-in-annual-panel-discussion,"December 3, 2024"
"Cement grows stronger, more res


In [41]:
print("length of data in letter or characters")
len(text)


length of data in letter or characters


79748

In [42]:
list(set(text))


['5',
 'J',
 'x',
 '\xa0',
 'j',
 'q',
 '‘',
 'è',
 'U',
 '$',
 'B',
 '.',
 'L',
 'I',
 'p',
 'g',
 '7',
 ' ',
 ';',
 'S',
 'u',
 ':',
 'P',
 'K',
 'N',
 'c',
 'r',
 '4',
 '/',
 '2',
 '3',
 'e',
 '&',
 'Q',
 't',
 'R',
 'O',
 ',',
 '?',
 '9',
 's',
 'a',
 'l',
 'H',
 'w',
 'M',
 'A',
 'h',
 'T',
 'G',
 '6',
 '-',
 '—',
 'i',
 'E',
 'v',
 '0',
 '8',
 'k',
 'z',
 'b',
 'C',
 'Y',
 '"',
 'd',
 'o',
 'f',
 '\n',
 'm',
 'F',
 '’',
 'V',
 'W',
 '–',
 '%',
 'n',
 '1',
 'D',
 'y']

In [43]:
the_chars  = sorted(     list(set(text))     )

vocab_size = len( the_chars )      ## 65

print(  len(the_chars)  )

print(  ''.join(the_chars)  )


79

 "$%&,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYabcdefghijklmnopqrstuvwxyz è–—‘’


In [44]:
stoi = { ch:i for i, ch in enumerate(the_chars) }
itos = { i:ch for i, ch in enumerate(the_chars) }



In [45]:
print( stoi )
print( itos )



{'\n': 0, ' ': 1, '"': 2, '$': 3, '%': 4, '&': 5, ',': 6, '-': 7, '.': 8, '/': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '?': 22, 'A': 23, 'B': 24, 'C': 25, 'D': 26, 'E': 27, 'F': 28, 'G': 29, 'H': 30, 'I': 31, 'J': 32, 'K': 33, 'L': 34, 'M': 35, 'N': 36, 'O': 37, 'P': 38, 'Q': 39, 'R': 40, 'S': 41, 'T': 42, 'U': 43, 'V': 44, 'W': 45, 'Y': 46, 'a': 47, 'b': 48, 'c': 49, 'd': 50, 'e': 51, 'f': 52, 'g': 53, 'h': 54, 'i': 55, 'j': 56, 'k': 57, 'l': 58, 'm': 59, 'n': 60, 'o': 61, 'p': 62, 'q': 63, 'r': 64, 's': 65, 't': 66, 'u': 67, 'v': 68, 'w': 69, 'x': 70, 'y': 71, 'z': 72, '\xa0': 73, 'è': 74, '–': 75, '—': 76, '‘': 77, '’': 78}
{0: '\n', 1: ' ', 2: '"', 3: '$', 4: '%', 5: '&', 6: ',', 7: '-', 8: '.', 9: '/', 10: '0', 11: '1', 12: '2', 13: '3', 14: '4', 15: '5', 16: '6', 17: '7', 18: '8', 19: '9', 20: ':', 21: ';', 22: '?', 23: 'A', 24: 'B', 25: 'C', 26: 'D', 27: 'E', 28: 'F', 29: 'G', 30: 'H', 31: 'I', 32: 'J', 33: '

In [46]:
encode = lambda s: [ stoi[c]          for c in s   ]

encode("bahh")


[48, 47, 54, 54]

In [47]:
decode = lambda l: ''.join(   itos[i] for i in l   )

decode([40, 39, 46, 46])



'RQYY'

In [48]:
data = torch.tensor(   encode(text), dtype=torch.long   )

print( data )



tensor([66, 55, 66,  ..., 14,  2,  0])


In [49]:
n          = int(   0.9*len(data)   )

train_data = data[:n]
val_data   = data[n:]


In [50]:
def get_batch(split):
    if split == "train":
        data = train_data
    else:
        data = val_data

    ix = torch.randint(   len(data) - block_size, (batch_size,)   )

    x  = torch.stack(    [  data[   i : i+block_size ]     for i in ix ]    )
    y  = torch.stack(    [  data[ i+1 : i+1+block_size ]   for i in ix ]    )

    x, y = x.to(device), y.to(device)

    return x, y


In [51]:
temp_batch_size = 4
temp_block_size = 16

## select random starting points for the 4 sentences
ix = torch.randint(
            len(data) - block_size,
            (temp_batch_size,)
)

print( ix )



for index_temp in ix:
    print(  data[index_temp]  )


tensor([  268, 36154, 31199, 37681])
tensor(66)
tensor(12)
tensor(66)
tensor(50)


In [52]:
x  = torch.stack(
    [ data[   i : i+  temp_block_size ]   for i in ix ]

)

y  = torch.stack(
    [ data[ i+1 : i+1+ temp_block_size ]  for i in ix ]
)

print(x)
print(y)


tensor([[66, 61,  1, 48, 51,  1, 52, 51, 47, 66, 67, 64, 51, 50,  1, 55],
        [12, 10, 12, 15,  7, 54, 51, 47, 58, 66, 54,  7, 62, 58, 47, 60],
        [66, 61, 62,  7, 15,  7, 52, 64, 61, 59,  7, 62, 67, 64, 50, 67],
        [50, 55, 65, 49, 55, 62, 58, 55, 60, 47, 64, 71,  1, 47, 62, 62]])
tensor([[61,  1, 48, 51,  1, 52, 51, 47, 66, 67, 64, 51, 50,  1, 55, 60],
        [10, 12, 15,  7, 54, 51, 47, 58, 66, 54,  7, 62, 58, 47, 60, 65],
        [61, 62,  7, 15,  7, 52, 64, 61, 59,  7, 62, 67, 64, 50, 67, 51],
        [55, 65, 49, 55, 62, 58, 55, 60, 47, 64, 71,  1, 47, 62, 62, 64]])


In [53]:
@torch.no_grad()    ## for efficient processing
def estimate_loss():
    out = {}
    model.eval()   ## set to no training
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  ## back to training
    return out


In [54]:
## NN Architectures



class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()

        self.key   = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.query = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.value = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]

        tril_def = torch.tril( torch.ones(block_size, block_size) )  ## [40, 40]

        self.register_buffer(
                  'tril',
                  tril_def
               )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        B, T, E = x.shape   ## [batch_size, 40, 512]

        k = self.key(   x )            ## k = (B, T, 64)
        q = self.query( x )            ## q = (B, T, 64)

        E2 = 64     ## I think this is 64 and not 512
        ## (B, T, E) @ (B, E, T)  -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * E2 ** -0.5

        wei = wei.masked_fill(
                      self.tril[:T, :T] == 0,
                      float('-inf')
        )

        ## (B, T, T)
        wei = F.softmax( wei, dim= -1 )         ## (B, T, T)
        wei = self.dropout(   wei   )

        ## perform weighted aggregation of values

        v   = self.value(  x  )   ## x = (B, 40, E)
        out = wei @ v             ## (B, T, T) @ (B, T, 64) -> (B, T, 64)

        return out


In [55]:
class FeedForward(nn.Module):

    def __init__(self, n_embd):         ## 512

        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),      ## [512, 4*512]
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),      ## [4*512, 512]
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


In [56]:
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):    ## (8, 64)
        super().__init__()
        self.heads = nn.ModuleList(  [ Head(head_size) for _ in range(num_heads) ] )
        self.proj  = nn.Linear(n_embd, n_embd)   ## 512, 512
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat(   [ h(x) for h in self.heads ], dim = -1   )
        out = self.proj(  out   )
        out = self.dropout(   out   )
        return out


In [57]:

class Block(nn.Module):

    def __init__(self, n_embd, n_head):     ## (512, 8)
        super().__init__()
        head_size = n_embd // n_head        ## 64
        self.sa   = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward( n_embd)    ## 512
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(     self.ln1(x)      )
        x = x + self.ffwd(   self.ln2(x)      )
        return x


In [58]:
class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)   ## [65, 512]
        self.pos_emb_table = nn.Embedding(block_size, n_embd)     ## [block, 512]

        self.blocks = nn.Sequential(
                *[   Block(n_embd, n_head=n_head) for _ in range(n_layer)    ]
        )

        self.ln_f    = nn.LayerNorm(  n_embd    )
        self.lm_ffw_head = nn.Linear(n_embd, vocab_size)  ## [512, 65] # FFW Layer

    def forward(self, idx, targets=None):
        B, T = idx.shape     ## (Batch, 40)
        ## ids and targets are both (B, T) tensors of integers

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.pos_emb_table(torch.arange(T, device=device))

        x = tok_emb + pos_emb    ## [B, T, E] or [64, 40, 512]

        ## This is the architecture
        x = self.blocks(  x  )   ## (B, T, E)
        x = self.ln_f(    x  )   ## (B, T, E)   ## norm
        logits = self.lm_ffw_head(x)         ## [B, 40, 65]

        if targets is None:
            loss = None
        else:
            B, T, E  = logits.shape
            logits  = logits.view( B*T, E)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):    ## idx is (B, T)
        for _ in range(max_new_tokens):
            ## crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)    ## ## get preds
            logits = logits[:, -1, :]    ## focus on last one (B, E)
            probs = F.softmax(logits, dim= -1)    ## (B, E) get probs
            idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1) selected
            idx = torch.cat(  (idx, idx_next), dim=1  )   ## (B, T+1) append sample to running sequence
        return idx


In [59]:
model   = GPTModel()

m       = model.to(device)

optimizer = torch.optim.Adam(  m.parameters(), lr=learning_rate   )



In [60]:
for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    ## eval the loss
    logits, loss = m(xb, yb)

    optimizer.zero_grad(set_to_none=True)   ## zero out
    loss.backward()
    optimizer.step()



step 0: train loss 4.6155, val loss 4.6082
step 500: train loss 0.9434, val loss 1.4807
step 1000: train loss 0.5123, val loss 1.6217
step 1500: train loss 0.3491, val loss 1.8562
step 2000: train loss 0.2954, val loss 2.0876
step 2500: train loss 0.2712, val loss 2.2831
step 3000: train loss 0.2565, val loss 2.3121
step 3500: train loss 0.2474, val loss 2.4133
step 4000: train loss 0.2399, val loss 2.4684
step 4500: train loss 0.2344, val loss 2.5626
step 5000: train loss 0.2315, val loss 2.5907
step 5500: train loss 0.2281, val loss 2.6477


In [62]:
import torch
import pandas as pd
from newspaper import Article

# Load the CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# Fetch the article text using the link
def fetch_article_text(link):
    article = Article(link)
    article.download()
    article.parse()
    return article.text

# Summarize the article
def summarize_article(text, max_length=150):
    from transformers import pipeline  # Import pipeline here
    # Specify the model you want to use here
    # You can replace this with any summarization model from Hugging Face
    model_name = "sshleifer/distilbart-cnn-12-6"

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    summarizer = pipeline("summarization", model=model_name, device=device)

    # Truncate the text if it's too long
    max_input_length = summarizer.model.config.max_position_embeddings  # Get max input length
    if len(summarizer.tokenizer.encode(text)) > max_input_length:
        text = text[:max_input_length]  # Truncate to max length

    summary = summarizer(text, max_length=max_length, min_length=30, do_sample=False)
    return summary[0]['summary_text']

# Search and summarize by title
def get_summary_by_title(csv_file_path, title):
    # Load data
    df = load_csv(csv_file_path)
    # Search for the title
    article_info = df[df['title'] == title]
    if article_info.empty:
        return "Title not found in the dataset."

    link = article_info.iloc[0]['link']
    # Fetch and summarize the article
    try:
        article_text = fetch_article_text(link)
        summary = summarize_article(article_text)
        return summary
    except Exception as e:
        return f"Error fetching or summarizing article: {str(e)}"

# Example usage
csv_file_path = "/content/purdue_news_all_pages-2024.csv"
title = "Purdue scientist expecting new world to reveal itself to Mars rover"
print(get_summary_by_title(csv_file_path, title))


Token indices sequence length is longer than the specified maximum sequence length for this model (1483 > 1024). Running this sequence through the model will result in indexing errors


 NASA's Mars 2020 rover is only weeks away from emerging from the 28-mile-wide Jezero Crater to explore new terrain . The crater rim is like the edge of the world, and it feels like we’re so close to going over the edge, says planetary scientist Briony Horgan .


In [63]:
!pip install lxml_html_clean



In [64]:
!pip install newspaper3k # Install the missing library with pip



In [65]:
import torch
import pandas as pd
from newspaper import Article
from transformers import pipeline  # Import pipeline at the top

# Load the CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# Fetch the article text using the link
def fetch_article_text(link):
    article = Article(link)
    article.download()
    article.parse()
    return article.text

# Summarize the article
def summarize_article(text, word_count=100):
    # Specify the model you want to use here
    # You can replace this with any summarization model from Hugging Face
    model_name = "sshleifer/distilbart-cnn-12-6"

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    summarizer = pipeline("summarization", model=model_name, device=device)

    # Estimate max_length based on word_count
    # Assuming ~1.3 tokens per word
    token_count = int(word_count * 1.3)
    max_length = token_count
    min_length = int(token_count * 0.8)  # Ensure some flexibility for shorter summaries

    # Truncate the text if it's too long
    max_input_length = summarizer.model.config.max_position_embeddings  # Get max input length
    if len(summarizer.tokenizer.encode(text)) > max_input_length:
        text = text[:max_input_length]  # Truncate to max length

    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']

# Search and summarize by title
def get_summary_by_title(csv_file_path, title):
    # Load data
    df = load_csv(csv_file_path)
    # Search for the title
    article_info = df[df['title'] == title]
    if article_info.empty:
        return "Title not found in the dataset."

    link = article_info.iloc[0]['link']
    # Fetch and summarize the article
    try:
        article_text = fetch_article_text(link)
        summary = summarize_article(article_text, word_count=100)
        return summary
    except Exception as e:
        return f"Error fetching or summarizing article: {str(e)}"

# Main program
csv_file_path = "/content/purdue_news_all_pages-2024.csv"
print("Enter the title of the article you want to summarize:")
title = input("> ")  # Taking user input for the title
summary = get_summary_by_title(csv_file_path, title)
print("\nSummary:")
print(summary)


Enter the title of the article you want to summarize:
> Participants in the Purdue Ukrainian Scholars Initiative to be featured in annual panel discussion

Summary:
 The hourlong event will be in the East Faculty Lounge, on the Purdue Memorial Union’s second floor . The discussion will include a brief overview of the Purdue Ukrainian Scholars Initiative . The program is one of the first and largest of similar university-led programs in the United States . This year Purdue hosts 15 scholars, who are continuing their research here . The event is free and open to the public (registration required), and will be on Tuesday (Dec. 10) at 4:30 p.m. The discussion is led by Vijay Raghunathan, vice president for global partnerships .


In [1]:
import torch
import pandas as pd
from newspaper import Article
from transformers import pipeline  # Import pipeline at the top
from googletrans import Translator

# Load the CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# Fetch the article text using the link
def fetch_article_text(link):
    article = Article(link)
    article.download()
    article.parse()
    return article.text

# Summarize the article
def summarize_article(text, word_count=100):
    model_name = "sshleifer/distilbart-cnn-12-6"

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    summarizer = pipeline("summarization", model=model_name, device=device)

    # Estimate max_length based on word_count
    token_count = int(word_count * 1.3)
    max_length = token_count
    min_length = int(token_count * 0.8)

    # Truncate the text if it's too long
    max_input_length = summarizer.model.config.max_position_embeddings
    if len(summarizer.tokenizer.encode(text)) > max_input_length:
        text = text[:max_input_length]

    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']

# Search and summarize by title
def get_summary_by_title(csv_file_path, title):
    # Load data
    df = load_csv(csv_file_path)

    # Search for the title
    article_info = df[df['title'] == title]
    if article_info.empty:
        return "Title not found in the dataset."

    link = article_info.iloc[0]['link']

    # Fetch and summarize the article
    try:
        article_text = fetch_article_text(link)
        summary = summarize_article(article_text, word_count=100)
        return summary
    except Exception as e:
        return f"Error fetching or summarizing article: {str(e)}"

# Translate text
def translate_text(text, src_lang='en', dest_lang='es'):
    translator = Translator()
    translation = translator.translate(text, src=src_lang, dest=dest_lang)
    return translation.text

# Main program
csv_file_path = "/content/purdue_news_all_pages-2024.csv"
print("Enter the title of the article you want to summarize:")
title = input("> ")  # Taking user input for the title
summary = get_summary_by_title(csv_file_path, title)

if "Error" not in summary and "not found" not in summary:
    print("\nSummary:")
    print(summary)

    # Translate the summary
    translated_summary = translate_text(summary, src_lang='en', dest_lang='es')
    print("\nTranslated Summary (Spanish):")
    print(translated_summary)
else:
    print("\n", summary)


Enter the title of the article you want to summarize:
> Purdue scientist expecting new world to reveal itself to Mars rover


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Token indices sequence length is longer than the specified maximum sequence length for this model (1483 > 1024). Running this sequence through the model will result in indexing errors



Summary:
 NASA's Mars 2020 rover is only weeks away from emerging from the 28-mile-wide Jezero Crater to explore new terrain . The crater rim is like the edge of the world, and it feels like we’re so close to going over the edge, says Purdue University planetary scientist Briony Horgan . Horgan anticipates some of the oldest rocks yet as NASA mission prepares to emerge from the crater . It's a point in the mission Horgan set her sights on following the rover’s landing four years ago .

Translated Summary (Spanish):
El Rover 2020 de la NASA está a solo unas semanas de emerger del cráter Jezero de 28 millas de ancho para explorar un nuevo terreno.El borde del cráter es como el borde del mundo, y parece que estamos tan cerca de ir al límite, dice el científico planetario de la Universidad de Purdue, Briony Horgan.Horgan anticipa algunas de las rocas más antiguas hasta ahora mientras la misión de la NASA se prepara para emerger del cráter.Es un punto en la Misión de la Misión, Horgan, se 

In [74]:
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.12.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->goog

In [72]:
import requests
from bs4 import BeautifulSoup
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import io
import pandas as pd

# Load the CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# Fetch the main image from the URL
def fetch_image_from_url(article_url):
    try:
        response = requests.get(article_url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # Look for 'og:image' meta tag
            image_url = soup.find("meta", property="og:image")
            if image_url and image_url["content"]:
                return image_url["content"]
    except Exception as e:
        print(f"Error fetching image URL from {article_url}: {e}")
    return None

# Fetch and process the image
def fetch_and_describe_image(image_url):
    try:
        response = requests.get(image_url, stream=True, timeout=10)
        if response.status_code == 200:
            image = Image.open(io.BytesIO(response.content))

            # Load BLIP model for generating image descriptions
            processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

            # Preprocess image
            inputs = processor(image, return_tensors="pt")

            # Generate caption
            caption = model.generate(**inputs)
            return processor.decode(caption[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error processing image: {e}")
    return "No description available"

# Fetch the link from the dataset and generate the description
def get_image_and_description_by_title(csv_file_path, title):
    # Load data
    df = load_csv(csv_file_path)

    # Search for the title
    article_info = df[df['title'] == title]
    if article_info.empty:
        return "Title not found in the dataset."

    link = article_info.iloc[0]['link']
    print(f"Fetched link: {link}")

    # Fetch the image and generate description
    try:
        image_url = fetch_image_from_url(link)
        if image_url:
            print(f"Fetched image URL: {image_url}")
            description = fetch_and_describe_image(image_url)
            return {
                "image_url": image_url,
                "description": description
            }
        else:
            return "No image found for this article."
    except Exception as e:
        return f"Error processing article: {str(e)}"

# Example usage
csv_file_path = "/content/purdue_news_all_pages-2024.csv"
title = input("Enter the title of the article: ")
result = get_image_and_description_by_title(csv_file_path, title)

# Output the result
if isinstance(result, dict):
    print(f"Image URL: {result['image_url']}")
    print(f"Description: {result['description']}")
else:
    print(result)


Enter the title of the article: Purdue scientist expecting new world to reveal itself to Mars rover
Fetched link: https://www.purdue.edu/newsroom/2024/Q4/purdue-scientist-expecting-new-world-to-reveal-itself-to-mars-rover
Fetched image URL: https://www.purdue.edu/newsroom/wp-content/uploads/2024/12/BrionyHorgan-PerseveranceOG-scaled.jpg




Image URL: https://www.purdue.edu/newsroom/wp-content/uploads/2024/12/BrionyHorgan-PerseveranceOG-scaled.jpg
Description: a woman standing next to a large object
