In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

KeyboardInterrupt: 

In [3]:
import transformers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [4]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


In [5]:
# Path Dataset
TRAIN_PATH = r"C:\Users\kumar\OneDrive\Desktop\train.csv"
TEST_PATH = "C:\\Users\\kumar\\OneDrive\\Desktop\\test.csv"

In [6]:
# Load Dataset
train_df = pd.read_csv(TRAIN_PATH, header=None, names=["class", "title", "description"])
test_df = pd.read_csv(TEST_PATH, header=None, names=["class", "title", "description"])

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_df=train_df[1:]
train_df.head()

Unnamed: 0,class,title,description
1,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
2,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
3,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
4,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
5,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [9]:
# Split the DataFrame
ttrain_df, train_df = train_test_split(train_df,
    test_size=0.20,  # 20% test data
    stratify=train_df["class"],  # Stratify based on the 'target' column
    random_state=42  # For reproducibility
)

In [10]:
train_df.shape
train_df["class"].value_counts()

class
3    6000
2    6000
1    6000
4    6000
Name: count, dtype: int64

In [11]:
test_df=test_df[1:]
test_df.head()

Unnamed: 0,class,title,description
1,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
2,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
3,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
4,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
5,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [12]:
# Gabungkan title dan description
train_df["text"] = train_df["title"].fillna("") + ". " + train_df["description"].fillna("")
test_df["text"] = test_df["title"].fillna("") + ". " + test_df["description"].fillna("")

In [13]:
# Encode label
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["class"])
test_df["label"] = label_encoder.transform(test_df["class"])

In [14]:
train_df.head()

Unnamed: 0,class,title,description,text,label
112487,3,Wall Street's Designs on '05? A Merger Boom,After years of self-doubt and housecleaning in...,Wall Street's Designs on '05? A Merger Boom. A...,2
109164,2,Dodgers sign Ledee to two-year deal,The Los Angeles Dodgers signed Ricky Ledee to ...,Dodgers sign Ledee to two-year deal. The Los A...,1
6639,3,Olympians pursuit of marketing gold begins lon...,"For millions of Americans, gymnast Paul Hamm #...",Olympians pursuit of marketing gold begins lon...,2
3997,3,New Device: Flying Robot,Seiko Epson hopes the tiny robot will help in ...,New Device: Flying Robot. Seiko Epson hopes th...,2
39523,1,Spanish laureate 'was informer',Prize-winning Spanish writer Camilo Jose Cela...,Spanish laureate 'was informer'. Prize-winning...,0


In [15]:
train_df=train_df[["text","label"]]

In [16]:
train_df["tokenized_text"] = train_df["text"].apply(lambda x: tokenizer(x, truncation=True))

In [17]:
for key in train_df["tokenized_text"].iloc(0):
    print(len(key))

3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3


In [18]:
from datasets import Dataset
df=train_df[["tokenized_text","label"]]
# Unpack nested dictionary into separate columns
df["input_ids"] = df["tokenized_text"].apply(lambda x: x["input_ids"])
df["attention_mask"] = df["tokenized_text"].apply(lambda x: x["attention_mask"])

# Drop the original column if no longer needed
df = df.drop(columns=["tokenized_text"])

train_data_ = Dataset.from_pandas(df)


In [19]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
import numpy as np
import evaluate
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [20]:
#1 represents World, 2 represents Sports, 3 represents Business and 4 represents Sci/Tech.

In [20]:
id2label = {0: "World", 1: "Sports",2: "Business",3: "Science"}
label2id = {val:key for key,val in id2label.items()}

In [21]:
label2id

{'World': 0, 'Sports': 1, 'Business': 2, 'Science': 3}

In [22]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
training_args = TrainingArguments(
    output_dir="test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_,
    eval_dataset=train_data_,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.134,0.074289,0.977125
2,0.11,0.037234,0.989458
3,0.073,0.015834,0.996167
4,0.0303,0.008602,0.997792
5,0.0148,0.005292,0.998875
6,0.0133,0.002681,0.999458
7,0.0034,0.001176,0.999625
8,0.0003,0.000766,0.999792


TrainOutput(global_step=12000, training_loss=0.040606718289355435, metrics={'train_runtime': 9173.5866, 'train_samples_per_second': 20.93, 'train_steps_per_second': 1.308, 'total_flos': 9544566034823040.0, 'train_loss': 0.040606718289355435, 'epoch': 8.0})

In [None]:
#model trained ,don't do any thing to above code 

In [3]:
#extracting text by performing OCR

def create_global():
    global image_path
    image_path = r"C:\Users\kumar\OneDrive\Pictures\Screenshots\Screenshot 2025-02-06 120823.png"

create_global()

from PIL import Image
import pytesseract

pytesseract.pytesseract.tesseract_cmd=r'C:\Users\kumar\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

#image path


def extract_text_from_image(image_path):
    # Open the image using PIL
    img = Image.open(image_path)
    
    # Use pytesseract to extract text
    text = pytesseract.image_to_string(img)
    
    return text
text=extract_text_from_image(image_path)

In [2]:
#Alternative if we don't have image ,input text
text=input("enter text")

KeyboardInterrupt: Interrupted by user

In [4]:
text=extract_text_from_image(image_path)
print(text)

The demand for data engineers to build and scale cutting-edge Al models is
expected to skyrocket as the excitement around its potential continues to
surge. Despite plenty of discussion in recent years around Al replacing a
vast number of jobs — experienced data engineers are crucial to developing

intelligent Al models for the future.

Generative Al models require a vast amount of data pipelines to be
developed and maintained. For data engineers, the challenges and
expectations to develop robust Al models will also increase alongside
demand, with Al rapidly becoming an extension of human intelligence and

how we live right now.



In [5]:
# classification

from transformers import pipeline

def classify_text(text):
    classifier = pipeline("text-classification", model="test/checkpoint-12000")
    output =classifier(text)
    return output
print(classify_text(text))


  from .autonotebook import tqdm as notebook_tqdm





Device set to use cuda:0


[{'label': 'Science', 'score': 0.9999731779098511}]


In [6]:
import ollama

def summarize_text(text, category):
    prompt = f"You are a highly trained professional news anchor in the field of {category}. Summarize the following news in less than 80 words: {text}"

    summary= ollama.chat(model="deepseek-r1:8b", messages=[{"role": "user", "content": prompt}])
    
    return summary["message"]["content"]  # Extracts the model's response


In [7]:
def process_image_and_summarize(image_path):
    # Step 1: Extract text from image
    extracted_text = extract_text_from_image(image_path)
    
    # Step 2: Classify the text
    category = classify_text(extracted_text)
    
    # Step 3: Summarize the text based on the category
    summary = summarize_text(extracted_text, category)
    
    return summary
summary = process_image_and_summarize(image_path)


print(summary)


Device set to use cuda:0


<think>
Okay, so I'm trying to figure out how to summarize this news article about data engineers and AI models. The user provided a response that's under 80 words, but they want me to walk through my thought process as if I'm a novice.

Alright, first, the article is talking about a surge in demand for data engineers because of AI's potential. It mentions that even though people often talk about AI replacing jobs, experienced data engineers are still crucial because they build and scale these models. Then it goes into generative AI needing lots of data pipelines, so the challenges for data engineers increase as demand rises.

So, I need to capture all that in a concise summary. Let me break it down:

1. The main point is the skyrocketing demand for data engineers.
2. They are essential despite talk about job replacement by AI.
3. Generative AI needs robust models which require data pipelines.
4. This creates more challenges and expectations for data engineers.

I should make sure to i