In [None]:
import json
import pandas as pd
from tqdm import tqdm
import time
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Load credentials and prompt templates from JSON files in the working directory.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform's locale default.
with open("authentification.json", "r", encoding="utf-8") as f:
    # AUTH_KEYS is read later for its "hf_token" entry.
    AUTH_KEYS = json.load(f)

with open("prompts.json", "r", encoding="utf-8") as f:
    # PROMPTS supplies the "summary_prompt" and "structuration_prompt" templates.
    PROMPTS = json.load(f)

In [None]:
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")


# Checkpoint identifier and the access token handed to both loaders.
model_name = "meta-llama/Llama-3.2-3B-instruct"
hf_token = AUTH_KEYS["hf_token"]

TOKENIZER = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# device_map="auto" leaves the placement of the weights to the library.
MODEL = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    device_map="auto",
)

In [None]:
def extract_json(text):
    """Extract every flat JSON object embedded in ``text``.

    Candidates are brace-delimited spans that contain no nested braces. Each
    candidate is first parsed as-is, so well-formed objects keep their native
    types (numbers, ``true``/``false``/``null``). Only when that fails is a
    repair attempted: values that follow ``":`` and start with a letter are
    treated as unquoted strings, stripped of surrounding whitespace and quoted.
    JSON literals are left untouched by the repair.

    Args:
        text: Arbitrary text that may contain JSON objects.

    Returns:
        A list of dicts, one per candidate that could be parsed, in order of
        appearance. Candidates that still fail after the repair are dropped.
    """
    # Flat objects only: a brace pair with no other brace in between.
    json_pattern = r'\{[^{}]*\}'
    # A value right after `":` that begins with a letter, i.e. is not quoted.
    bare_value_pattern = r'(":\s*)([A-Za-z][^",\n}]*)'
    json_literals = ("true", "false", "null")

    def quote_bare_value(found):
        # Keep real JSON literals; quote (and escape) everything else.
        value = found.group(2).strip()
        if value in json_literals:
            return found.group(1) + value
        return found.group(1) + json.dumps(value)

    valid_jsons = []
    for candidate in re.findall(json_pattern, text):
        try:
            json_obj = json.loads(candidate)
        except json.JSONDecodeError:
            # Not valid as written: retry once with bare values quoted.
            repaired = re.sub(bare_value_pattern, quote_bare_value, candidate)
            try:
                json_obj = json.loads(repaired)
            except json.JSONDecodeError:
                continue
        valid_jsons.append(json_obj)

    return valid_jsons


def extract_summary(text_arg):
    """Return the stripped text that follows the first "YOUR SUMMARY:" marker.

    When the marker does not occur, the whole input is returned, stripped.
    """
    _, marker_found, tail = text_arg.partition("YOUR SUMMARY:")
    remainder = tail if marker_found else text_arg
    return remainder.strip()


def extract_output(text_arg):
    """Return the stripped text that follows the first "YOUR OUTPUT:" marker.

    When the marker does not occur, the whole input is returned, stripped.
    """
    _, marker_found, tail = text_arg.partition("YOUR OUTPUT:")
    remainder = tail if marker_found else text_arg
    return remainder.strip()

def summarize(chunk,device=DEVICE,tokenizer=TOKENIZER,model=MODEL):
    """Generate a summary of ``chunk["text"]`` with the language model.

    The text is inserted into the "summary_prompt" template, the model
    generates up to 512 new tokens, and the decoded sequence is cut at the
    "YOUR SUMMARY:" marker by ``extract_summary``.

    Args:
        chunk: Mapping with a "text" entry holding the document text.
        device: Device the tokenized inputs are moved to.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Causal language model used for generation.

    Returns:
        The stripped text after the first "YOUR SUMMARY:" marker in the
        decoded output, or the whole decoded output if the marker is absent.
    """
    prompt = PROMPTS["summary_prompt"].format(text=chunk["text"])
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Deterministic greedy decoding. The previous do_sample=True with
    # temperature=1e-12 emulated this by dividing the logits by a near-zero
    # value, which is numerically fragile (notably in half precision).
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)
    # `past_key_values` is a generation argument, not a decode argument; dropped.
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return extract_summary(output)

def structure(summary,device=DEVICE,tokenizer=TOKENIZER,model=MODEL):
    """Turn a free-text summary into structured records with the language model.

    The summary is inserted into the "structuration_prompt" template, the model
    generates up to 512 new tokens, the decoded sequence is cut at the
    "YOUR OUTPUT:" marker by ``extract_output``, and the remainder is scanned
    for JSON objects by ``extract_json``.

    Args:
        summary: Summary text to structure.
        device: Device the tokenized inputs are moved to.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Causal language model used for generation.

    Returns:
        The list returned by ``extract_json`` for the model's answer; empty
        when no parseable object was produced.
    """
    prompt = PROMPTS["structuration_prompt"].format(summary=summary)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Deterministic greedy decoding. The previous do_sample=True with
    # temperature=1e-12 emulated this by dividing the logits by a near-zero
    # value, which is numerically fragile (notably in half precision).
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)
    # `past_key_values` is a generation argument, not a decode argument; dropped.
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return extract_json(extract_output(output))

def extract_data(full_texts,output_file=None):
    """Summarize and structure every chunk, appending the records to a CSV.

    For each chunk the pipeline runs ``summarize`` then ``structure``. Every
    returned record is tagged with the chunk's "source_type", reduced to the
    known columns, and appended to ``output_file``. All rows are written with
    the same fixed column set and order, so records that lack a field get an
    empty cell instead of shifting the remaining values. The header row is
    emitted with the first batch that is actually written.

    Args:
        full_texts: Iterable of chunk mappings. Each needs "text" and
            "source_type"; "document_id" is used in messages when present.
        output_file: Path passed to ``DataFrame.to_csv`` in append mode. With
            the default ``None`` pandas returns the CSV text instead of
            writing it, and that text is discarded here.

    Raises:
        RuntimeError: Any RuntimeError whose message does not mention
            "out of memory". Out-of-memory errors and all other exceptions
            are reported on stdout and the chunk is skipped.

    Note:
        The file is opened in append mode, so calling this again with the
        same path adds another header and more rows to the existing file.
    """
    columns = ["vector","location","country","date","year","source_type"]
    header_written = False
    # Loop-invariant, so set once.
    # NOTE(review): the tokenizer calls in summarize/structure do not enable
    # truncation, so this setting presumably has no effect - confirm.
    TOKENIZER.truncation_side = "left"
    for chunk in tqdm(full_texts):
        try:
            summary = summarize(chunk)
            output = structure(summary)
            if not output:  # No parseable record for this chunk
                continue
            data = []
            for item in output:
                item["source_type"] = chunk["source_type"]
                data.append({k: item[k] for k in columns if k in item})
            # Pin the column set and order so every appended batch lines up
            # with the header, even when some records miss fields.
            df = pd.DataFrame(data, columns=columns)
            df.to_csv(output_file, mode="a", index=False, header=not header_written)
            # Only flipped after a successful write, so a failed first write
            # does not leave the file without a header.
            header_written = True
        except RuntimeError as e:  # OutOfMemoryError
            if 'out of memory' in str(e).lower():
                print(f"Out of memory error on chunk {chunk.get('document_id', 'unknown')}, skipping...")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                continue
            else:
                raise
        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"Error processing chunk {chunk.get('document_id', 'unknown')}: {str(e)}, skipping...")
            continue

In [None]:
# Load the chunks to process. extract_data iterates over this object and reads
# "text" and "source_type" (and, when present, "document_id") from each element.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform's locale default.
with open("input.json", "r", encoding="utf-8") as f:
    inputs = json.load(f)

In [None]:
# Run the extraction over the loaded chunks; results are appended to the CSV file.
extract_data(full_texts=inputs, output_file="output_dataset.csv")