In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
import fitz  # PyMuPDF

# Location of the source PDF, relative to the notebook's working directory.
file = "../Data/PC_US_Elections_clean.pdf"

# Pull the plain text out of every page; full_text holds one string per page,
# in page order. The context manager closes the document afterwards.
with fitz.open(file) as doc:
    full_text = [page.get_text() for page in doc]

In [3]:
# Echo the extracted per-page strings as the cell output (the whole list is shown).
full_text

['INSTITUTIONAL EQUITY RESEARCH \n \nPage | 1 | PHILLIPCAPITAL INDIA RESEARCH \nDISCLAIMER FOR U.S. BASED INVESTORS. The Agent of PhillipCapital (India) Pvt. Ltd. in the United Sta\nExchange Commission. The activities of PhillipCapital (India) Pvt. Ltd. in the United States will be affected \naccordance with the Services Agreement entered into between PhillipCapital (India) Pvt. Ltd. and Marco Po\nclientservices@mpsecurities.com ; 1-347-745-6448. \nPowered by EQUITEC \nUS Elections and India \nNeutral to long-term positive for India  \n \nINDIA | STRATEGY/MACRO | THEMATIC \n \n \n \n24 October 2024 \nThe U.S. presidential election in 2024 is seeing an intense contest between Kamala Harris and Donald Tru\nglobal superpower, these elections will have a significant impact: A Kamala Harris win will largely be consid\ncurrent regime while Trump?s win will bring lasting changes in international relations, trade, inflation, and c\ntrends. As per exit polls, swing states are showing only a mar

In [None]:
# All page texts joined into one string, separated by single spaces.
# NOTE(review): document_text is not referenced by any later cell visible in this notebook.
document_text = " ".join(full_text)

# Hand-written question/answer pairs, keyed "Q1".."Q40"; every entry is a dict
# with a "question" and an "answer" string.
# NOTE(review): some answers use the typographic apostrophe (’) while the rest
# use the ASCII one (') — normalise if exact string matching matters downstream.
# NOTE(review): Q16 speaks of "Trump and Harris' respective regimes"; presumably
# Biden's term is meant — verify against the source PDF.
qa_dict = {
    "Q1": {
        "question": "When are the 2024 US presidential elections scheduled?",
        "answer": "The elections are scheduled for November 5, 2024."
    },
    "Q2": {
        "question": "Who are the two primary candidates for the 2024 US presidential elections?",
        "answer": "Kamala Harris (Democratic Party) and Donald Trump (Republican Party)."
    },
    "Q3": {
        "question": "What is the required number of electoral votes to win the presidency?",
        "answer": "A candidate needs at least 270 out of 538 electoral votes to win."
    },
    "Q4": {
        "question": "What are 'swing states,' and why are they significant?",
        "answer": "Swing states are states with a negligible difference in vote share between candidates. They significantly influence the election outcome."
    },
    "Q5": {
        "question": "Name some key swing states mentioned in the text.",
        "answer": "Arizona, Georgia, Michigan, Nevada, North Carolina, Pennsylvania, and Wisconsin."
    },
    "Q6": {
        "question": "What is the Democrats' policy on food costs and housing?",
        "answer": "Democrats aim to reduce food costs, restrict price increases, and provide incentives for first-time homebuyers."
    },
    "Q7": {
        "question": "What are the Republicans' proposals for corporate taxes?",
        "answer": "Republicans propose lowering the corporate tax rate to 20% and making deductions for local and state taxes permanent."
    },
    "Q8": {
        "question": "How do the two parties differ on trade with China?",
        "answer": "Democrats focus on targeted tariffs on Chinese technology and investments, while Republicans advocate for broad tariffs (10-20%) on Chinese goods and revoking China's 'most favored nation' trade status."
    },
    "Q9": {
        "question": "What are the Republican plans for healthcare reforms?",
        "answer": "Republicans aim to accelerate efforts to privatize Medicare and reduce outpatient care payments."
    },
    "Q10": {
        "question": "How does the Democratic Party approach clean energy?",
        "answer": "The Democratic Party emphasizes green energy initiatives and renewable energy development."
    },
    "Q11": {
        "question": "How might Trump's win impact Indian IT companies?",
        "answer": "Trump's anti-immigration stance could challenge Indian IT companies, but they may adapt by hiring more locals and relying on subcontractors."
    },
    "Q12": {
        "question": "What is the likely impact on the energy sector if Trump wins?",
        "answer": "Trump's policies favor increased US crude oil and natural gas production, which could lower energy costs globally and benefit Indian consumers and industries."
    },
    "Q13": {
        "question": "How could defence partnerships evolve under Harris or Trump?",
        "answer": "Harris is expected to emphasize deeper defense cooperation with India, while Trump's approach may be more transactional, focusing on arms sales and strategic interests."
    },
    "Q14": {
        "question": "What is the expected impact on Indian pharmaceutical exports under both candidates?",
        "answer": "While both aim to lower healthcare costs, rising competition could challenge Indian generic drugs. However, export volumes to the US are likely to remain steady."
    },
    "Q15": {
        "question": "How would higher tariffs on Chinese imports benefit Indian industries?",
        "answer": "Higher tariffs on Chinese goods could benefit Indian textiles, auto components, and consumer electronics as global companies look for alternative suppliers."
    },
    "Q16": {
        "question": "How did the US and Indian equities perform during previous administrations?",
        "answer": "Both US and Indian equities have rallied during Trump and Harris' respective regimes."
    },
    "Q17": {
        "question": "How does Harris' policy approach compare to Trump's regarding emerging markets?",
        "answer": "Harris' win is expected to maintain continuity and stability, while Trump’s policies could lead to de-globalization, negatively impacting emerging markets."
    },
    "Q18": {
        "question": "How would a Trump presidency affect India's trade competitiveness with the US?",
        "answer": "Higher tariffs on Chinese goods would strengthen India’s trade competitiveness in the US market."
    },
    "Q19": {
        "question": "What is a positive geopolitical outcome for India regardless of the election result?",
        "answer": "Both candidates support an anti-China stance, which could benefit India strategically and economically."
    },
    "Q20": {
        "question": "What are the broader implications of Trump's softer stance on Russia for India?",
        "answer": "Trump's softer stance on Russia aligns with India's broader strategic interests, potentially facilitating better cooperation."
    },
    "Q21": {
        "question": "How would Kamala Harris' victory impact immigration policy?",
        "answer": "Harris is likely to promote a more lenient immigration policy compared to Trump’s restrictive stance."
    },
    "Q22": {
        "question": "What is Trump's stance on corporate taxation?",
        "answer": "Trump advocates for tax cuts for corporations and individuals, aiming to stimulate economic growth."
    },
    "Q23": {
        "question": "What is one of the economic goals of the Biden administration?",
        "answer": "One of Biden's economic goals is to reduce income inequality by taxing the wealthy more heavily."
    },
    "Q24": {
        "question": "What energy-related policy has been central to Biden's presidency?",
        "answer": "Biden has focused on transitioning the US towards renewable energy sources and reducing carbon emissions."
    },
    "Q25": {
        "question": "How do Trump and Biden's trade policies differ regarding China?",
        "answer": "Trump has advocated for tariffs and trade wars with China, while Biden is more focused on multilateral trade agreements."
    },
    "Q26": {
        "question": "What is Biden's stance on the Paris Climate Agreement?",
        "answer": "Biden rejoined the Paris Climate Agreement as part of his commitment to combat climate change."
    },
    "Q27": {
        "question": "What is one major feature of the Republican Party's foreign policy?",
        "answer": "The Republican Party often favors a more isolationist foreign policy, focusing on American interests."
    },
    "Q28": {
        "question": "What impact could Biden's infrastructure plan have on the economy?",
        "answer": "Biden's infrastructure plan is expected to create jobs and stimulate economic growth through investments in roads, bridges, and clean energy."
    },
    "Q29": {
        "question": "What would be the likely effect of a Trump administration on international alliances?",
        "answer": "Trump's foreign policy may reduce US involvement in international alliances, preferring bilateral agreements instead."
    },
    "Q30": {
        "question": "What could a Biden presidency mean for healthcare reform?",
        "answer": "Biden is likely to expand healthcare access, building on the Affordable Care Act and pushing for more comprehensive coverage."
    },
    "Q31": {
        "question": "How might a Trump presidency impact climate change policy?",
        "answer": "Trump's administration is likely to prioritize fossil fuel production and roll back environmental regulations."
    },
    "Q32": {
        "question": "What would Kamala Harris' victory likely mean for social justice initiatives?",
        "answer": "Harris is expected to prioritize social justice, including addressing racial inequality and reforming the criminal justice system."
    },
    "Q33": {
        "question": "What is Trump's stance on foreign trade agreements?",
        "answer": "Trump prefers bilateral trade deals, such as the USMCA, and has been critical of multilateral agreements like the TPP."
    },
    "Q34": {
        "question": "How would Trump's trade policies affect global supply chains?",
        "answer": "Trump’s trade policies could disrupt global supply chains due to tariffs and protectionist measures."
    },
    "Q35": {
        "question": "What is a likely effect of a Biden presidency on the technology sector?",
        "answer": "Biden is likely to regulate the technology sector more strictly, especially regarding data privacy and monopolistic practices."
    },
    "Q36": {
        "question": "How does Harris' policy on climate change differ from Trump's?",
        "answer": "Harris supports aggressive climate action, including rejoining the Paris Agreement and promoting green energy, while Trump downplays climate change concerns."
    },
    "Q37": {
        "question": "What could a Trump victory mean for the future of NATO?",
        "answer": "Trump has historically been critical of NATO and may push for a reduced US commitment to the alliance."
    },
    "Q38": {
        "question": "How would a Biden presidency likely impact healthcare insurance costs?",
        "answer": "Biden aims to lower healthcare insurance costs through government subsidies and expanded Medicaid coverage."
    },
    "Q39": {
        "question": "What is a significant challenge for Trump in the 2024 election?",
        "answer": "A significant challenge for Trump is his polarizing reputation and controversies from his previous presidency."
    },
    "Q40": {
        "question": "How would a Harris presidency affect US relations with Europe?",
        "answer": "Harris is expected to restore and strengthen US relations with European allies, emphasizing cooperation on climate change, trade, and security."
    }
}


In [None]:
from datasets import Dataset  # NOTE: rebinds `Dataset`, replacing the torch.utils.data class imported in the first cell

# Flatten qa_dict into one {"question", "answer"} record per entry
# (the "Q<n>" keys themselves are dropped).
qa_list = [
    {field: entry[field] for field in ("question", "answer")}
    for entry in qa_dict.values()
]
qa_df = pd.DataFrame(qa_list)

# Wrap the frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(qa_df)

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split reproducible.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]



In [None]:
# Spot-check a single record (index 20) of the full, unsplit dataset.
dataset[20]

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Checkpoint shared by the tokenizer and the extractive-QA model.
QA_CHECKPOINT = "distilbert-base-uncased-distilled-squad"

# Use the GPU when one is present; otherwise stay on the CPU instead of
# raising from a hard-coded .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT).to(device)




In [None]:
def tokenize_qa(example):
    """Tokenize question/answer pairs and label the answer's token span.

    Each question is encoded as the first sequence and its answer as the
    second, padded or truncated to 512 tokens. ``start_positions`` and
    ``end_positions`` are the indices of the first and last answer tokens
    inside that encoded input, so special tokens and the question tokens that
    precede the answer are accounted for.

    Args:
        example: Either one record (``{"question": str, "answer": str}``) or a
            batch as passed by ``Dataset.map(..., batched=True)``
            (``{"question": [str, ...], "answer": [str, ...]}``).

    Returns:
        For a batch: the tokenizer output with list-valued
        ``start_positions`` / ``end_positions`` added (one entry per record).
        For a single record: a plain dict holding that record's token lists
        and integer positions.

    Note:
        Relies on the module-level ``tokenizer`` exposing ``sequence_ids`` on
        its output, which is a feature of "fast" tokenizers.
    """
    questions, answers = example["question"], example["answer"]
    single_record = isinstance(questions, str)
    if single_record:
        questions, answers = [questions], [answers]

    # Plain Python lists (no return_tensors): Dataset.map stores the columns
    # and a later set_format(type="torch") performs the tensor conversion.
    inputs = tokenizer(
        questions,
        answers,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    start_positions, end_positions = [], []
    for row in range(len(questions)):
        # sequence_ids tags each token: 0 = question, 1 = answer,
        # None = special or padding token.
        answer_tokens = [
            idx for idx, seq_id in enumerate(inputs.sequence_ids(row)) if seq_id == 1
        ]
        if answer_tokens:
            start_positions.append(answer_tokens[0])
            end_positions.append(answer_tokens[-1])
        else:
            # Empty answer, or answer truncated away entirely: point both
            # labels at position 0 so they stay inside the sequence.
            start_positions.append(0)
            end_positions.append(0)

    if single_record:
        record = {key: values[0] for key, values in inputs.items()}
        record["start_positions"] = start_positions[0]
        record["end_positions"] = end_positions[0]
        return record

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply tokenization
train_dataset = train_dataset.map(tokenize_qa, batched=True)
eval_dataset = eval_dataset.map(tokenize_qa, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # --- where artefacts are written ---
    output_dir="./results",
    logging_dir="./logs",
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluation, checkpointing, logging cadence ---
    evaluation_strategy="epoch",
    save_steps=10,
    save_total_limit=2,
    logging_steps=10,
)

# Bundle the model, the tokenized splits and the tokenizer into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# Environment check: print the installed accelerate version.
# The install command below is kept commented out for reference only.
# !pip install 'accelerate>=0.26.0'
import accelerate
print(accelerate.__version__)


In [None]:
def preprocess_data(example):
    """Tokenize SQuAD-style records into fixed-length windows with span labels.

    Questions and contexts are encoded together; contexts longer than the
    384-token budget are split into overlapping windows (stride 128), so one
    input record can yield several output rows. For every window the answer's
    character span is converted to token indices.

    Args:
        example: A batch with ``"question"`` and ``"context"`` (lists of
            strings) and ``"answers"`` (list of dicts with ``"text"`` and
            ``"answer_start"`` lists), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (without ``offset_mapping`` and
        ``overflow_to_sample_mapping``) plus ``start_positions`` and
        ``end_positions``, one entry per window. A window is labelled with the
        CLS token index for both positions when the record has no answer or
        the answer does not lie completely inside that window.

    Note:
        Relies on the module-level ``tokenizer`` supporting offset mappings
        and ``sequence_ids``, which are features of "fast" tokenizers.
    """
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",  # Truncate context if too long
        max_length=384,           # Maximum length of the input
        stride=128,               # Stride for overlapping context
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    # Maps each produced window back to the index of the record it came from.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    # Only needed for labelling; removed so it is not returned.
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions, end_positions = [], []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        answers = example["answers"][sample_mapping[i]]

        # Context tokens are the ones tagged 1. Question, special and padding
        # tokens are excluded: their offsets are not context positions and
        # would derail the search.
        context_tokens = [
            idx for idx, seq_id in enumerate(tokenized.sequence_ids(i)) if seq_id == 1
        ]

        # Default label: "no answer in this window".
        start_token = end_token = cls_index

        if context_tokens and len(answers["answer_start"]) > 0:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            first_ctx, last_ctx = context_tokens[0], context_tokens[-1]

            # Label the span only when the whole answer is inside this window.
            if offsets[first_ctx][0] <= start_char and offsets[last_ctx][1] >= end_char:
                # Last context token that starts at or before the answer start.
                start_token = first_ctx
                while start_token < last_ctx and offsets[start_token + 1][0] <= start_char:
                    start_token += 1
                # First context token that ends at or after the answer end.
                end_token = last_ctx
                while end_token > start_token and offsets[end_token - 1][1] >= end_char:
                    end_token -= 1

        start_positions.append(start_token)
        end_positions.append(end_token)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

# Process Train and Validation Data
# Runs preprocess_data over the "train" and "validation" splits, dropping the
# original columns so only the tokenizer outputs and span labels remain.
# NOTE(review): the `dataset` built in the earlier cells of this notebook is a
# single Dataset with only `question` and `answer` columns, so the
# "train"/"validation" lookups and the `context`/`answers` fields read by
# preprocess_data are not available from it. This cell presumably expects a
# SQuAD-style DatasetDict — confirm which dataset was intended.
train_data = dataset["train"].map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)
val_data = dataset["validation"].map(preprocess_data, batched=True, remove_columns=dataset["validation"].column_names)


In [None]:
# Build a "Question: ...\nAnswer:" prompt from record 20 of `dataset`, leaving
# the answer open for a model to complete.
prompt = f"Question: {dataset[20]['question']}\nAnswer:"

In [None]:
from transformers import LlamaTokenizer, LlamaForCausalLM

# Checkpoint shared by the tokenizer and the causal language model.
VICUNA_CHECKPOINT = "lmsys/vicuna-7b-v1.1"

# Use the GPU when one is present; otherwise stay on the CPU instead of
# raising from a hard-coded .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: this rebinds the notebook-level `tokenizer` and `model` names used by
# the earlier QA cells.
tokenizer = LlamaTokenizer.from_pretrained(VICUNA_CHECKPOINT)
model = LlamaForCausalLM.from_pretrained(VICUNA_CHECKPOINT).to(device)


In [None]:
!pip uninstall tensorflow keras -y


In [None]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer, TrainingArguments, Trainer
# Load the base DistilBERT checkpoint with a question-answering head.
# This rebinds the notebook-level `tokenizer` and `model` names.
# NOTE(review): unlike the "-distilled-squad" checkpoint loaded earlier, this
# base checkpoint presumably has no fine-tuned QA head, so the head would start
# untrained — confirm before using it for inference.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

In [None]:
!pip uninstall keras
!pip install keras==2.11.0


In [None]:
!pip install tf-keras

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login: the access token is entered in the
# prompt at run time and must never be stored in the notebook.
# SECURITY: an access token was previously pasted below this call as a comment.
# It has been removed here; treat that token as leaked and revoke/rotate it on
# the Hub.
notebook_login()

In [None]:
# Rebuild the flat question/answer records from qa_dict (the "Q<n>" keys are dropped).
qa_list = [
    {field: entry[field] for field in ("question", "answer")}
    for entry in qa_dict.values()
]
qa_df = pd.DataFrame(qa_list)

# Wrap the frame as a Hugging Face Dataset. This relies on `Dataset` being the
# `datasets` class imported in an earlier cell, not the torch one.
dataset = Dataset.from_pandas(qa_df)

# Sanity check: show the first record.
print(dataset[0])

In [None]:
# Spot-check a single record (index 23) of the rebuilt dataset.
dataset[23]

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments


In [None]:
# Load the tokenizer of the "t5-small" checkpoint.
# This rebinds the notebook-level `tokenizer` name used by earlier cells.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
