In [1]:
import numpy as np
import pandas as pd
import requests
import re
import openai

from collections import Counter
from typing import List, Dict

import torch
import torch.nn as nn

from consts import *

In [2]:
df = pd.read_csv('kns_csv_files/kns_committee.csv')
df = df[df['KnessetNum'] >= 25]
df = df[df['CategoryID'].isin([MONEY_COM_CATEGORY_ID, DEFENSE_COM_CATEGORY_ID, LAW_ORDER_COM_CATEGORY_ID, MESADERET_COM_CATEGORY_ID, KNESSET_COM_CATEGORY_ID])]
commitee_ids = df['CommitteeID'].to_list()

In [3]:
def get_meeting_protocol_text(text_path):
    # Define the URL to fetch
    base_url = 'https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/'
    # Send GET request to the URL
    response = requests.get(base_url + text_path)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        # Retrieve the content of the file
        return response.text
    else:
        raise ValueError(f"Failed to retrieve content. Status code: {response.status_code}")


In [4]:
com_session_df = pd.read_csv('kns_csv_files/kns_committeesession.csv')
com_session_df = com_session_df[com_session_df['CommitteeID'].isin(commitee_ids)]

com_session_df.dropna(subset=['text_parsed_filename'], inplace=True)
text_paths = com_session_df['text_parsed_filename'].to_list()
texts = [get_meeting_protocol_text(path) for path in text_paths]
agg_scores = [rate_aggressiveness(text) for text in texts]



NameError: name 'rate_aggressiveness' is not defined

In [None]:
knesset_members_df = pd.read_csv('kns_csv_files/kns_person.csv')
first_names, last_names = knesset_members_df['FirstName'].to_list(), knesset_members_df['LastName'].to_list()
knesset_members = [' '.join([first_name, last_name]) for first_name, last_name in zip(first_names, last_names)]

warnings = {mem: [0, 0, 0] for mem in knesset_members}

# handle members with a middle name or a nickname
new_first_names, new_last_names = [], []

for fn, ln in zip(first_names, last_names):
    names = re.findall('\w+', fn)
    
    for name in names:
        warnings[name + ' ' + ln] = warnings[fn + ' ' + ln]
        new_first_names.append(name)
        new_last_names.append(ln)

# update first and last names
first_names = new_first_names
last_names = new_last_names

knesset_members = [' '.join([first_name, last_name]) for first_name, last_name in zip(first_names, last_names)]

In [None]:
def get_meeting_warnings(text, warnings, knesset_members) -> None:
    """
    Return warnings from the meeting protocol text.

    Parameters
    ----------
    text : str
        Meeting protocol text.

    warnings: Dict[str, List[int]]
        Number of warnings for each Knesset member.

    knesset_members: List[str]
        List of Knesset members.
    """

    # find all warnings
    matches = re.findall(WARNING_REGEX, text, flags=re.MULTILINE)
    print(len(matches))
    for i, match in enumerate(matches):
        print(f'match #{i}:')
        print(match)
        sentences = match.split('\n')
        first_sentence, last_sentence = sentences[0], sentences[-1]
        for kns_member in knesset_members:
            if kns_member in first_sentence:
                word2idx = {'ראש': 0, 'שני': 1, 'שליש': 2}
                for word, idx in word2idx.items():
                    if word in last_sentence:
                        warnings[kns_member][idx] += 1
                        break
    
    

In [51]:
def filter_protocol_sentences(text: str) -> str:
    ind = re.search("<< יור >>", text)
    txt2 = text[ind.span()[0]:]
    txt2 = re.sub("<<.*","", txt2)
    txt2 = re.sub(">>.*","", txt2)
    txt2 = re.sub("-", " ", txt2)
    txt2 = re.sub("\n\s+","\n", txt2)
    txt2 = re.sub(" +"," ", txt2)
    return txt2

In [54]:
with open('protocols/2159679.txt', 'r', encoding='utf-8') as f:
    text = f.read()
filtered_text = filter_protocol_sentences(text)
with open('filtered_protocols/2159679.txt', 'w', encoding='utf-8') as f:
    f.write(filtered_text)

In [40]:
def rate_aggressiveness(text):
    filtered_text = filter_protocol_sentences(text)
    lines = filtered_text.split('\n')

    # Filter non-conversation related lines
    lines = [line for line in lines if line.strip() != '' and '>' not in line and '<' not in line]
    print(lines)

    # Prompt Chat-GPT to rate the aggressiveness of the text
    scores = []
    query = 'דרג את מידת האגרסיביות של השיחה (תשובה מספרית בלבד! מ-1 עד 5 כש-5 זה אגרסיבי מאוד)\n'

    prompt = query
    n_tokens = len(prompt.split(' '))

    for line in lines:
        print('prompt:', prompt)
        print('n_tokens:', n_tokens)
        print('curr line:', line)

        num_new_tokens = len(line.split(' '))
        print('num new tokens:', num_new_tokens)
        if n_tokens + num_new_tokens < CHATGPT_MAX_TOKENS:
            # Maximum amount of tokens not yet exceeded, can add another sentence
            prompt = prompt + line + '\n'
            n_tokens += num_new_tokens
        else:
            # Maximum amount of tokens exceeded, send the accumulated prompt to ChatGPT
            print('prompting', prompt)
            ans = prompt_chatgpt(prompt)
            score = int(re.findall('\d', ans)[0])
            print('got score of', score)
            scores.append(score)

            prompt = query + line + '\n'
            n_tokens = len(prompt.split(' '))
        
    if n_tokens > len(query.split(' ')):
        ans = prompt_chatgpt(prompt)
        score = int(re.findall('\d', ans)[0])
        print('got score of', score)
        scores.append(score)

    print(scores)
    return np.mean(scores)
    

## AlephBert Transfer Learning

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [40]:
alephbert_tokenizer = AutoTokenizer.from_pretrained('onlplab/alephbert-base')
alephbert = AutoModelForSequenceClassification.from_pretrained('onlplab/alephbert-base', num_labels=2)

# Freeze the weights of the model
for param in list(alephbert.parameters())[:-1]:
    param.requires_grad = False

Some weights of the model checkpoint at onlplab/alephbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at onlplab/alephbert-base

In [30]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        return text, label
    
    def __len__(self):
        return len(self.texts)

In [31]:
from sklearn.model_selection import train_test_split
agg_scores_df = pd.read_csv('agg_score_bin.csv')
agg_scores_df.dropna(inplace=True, ignore_index=True)

In [32]:
train_set, test_set = train_test_split(agg_scores_df, test_size=0.2)

# Reset the index of each DataFrame
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

agg_train = Dataset(train_set['text'], train_set['score'])
agg_test = Dataset(test_set['text'], test_set['score'])

train_dataloader = torch.utils.data.DataLoader(agg_train, batch_size=16, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(agg_test, batch_size=16, shuffle=False)

In [42]:
from tqdm import tqdm

In [48]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
alephbert.to(device)

# Set optimizer and loss function
optimizer = torch.optim.Adam(alephbert.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Train model
alephbert.train()
for epoch in range(25):
    print('epoch', epoch)
    for texts, scores in tqdm(train_dataloader):
        optimizer.zero_grad()
        scores = scores.long().to(device)
        inputs = alephbert_tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        outputs = alephbert(**inputs, return_dict=False)[0]

        loss = criterion(outputs, scores)
        loss.backward()
        optimizer.step()

    


  0%|          | 0/23 [00:00<?, ?it/s]

epoch 0


100%|██████████| 23/23 [01:51<00:00,  4.86s/it]
  0%|          | 0/23 [00:00<?, ?it/s]

epoch 1


100%|██████████| 23/23 [01:29<00:00,  3.91s/it]
  0%|          | 0/23 [00:00<?, ?it/s]

epoch 2


 22%|██▏       | 5/23 [00:17<00:59,  3.28s/it]

In [None]:
# Evaluate model
alephbert.eval()
preds_df = pd.DataFrame(columns=['predictions', 'scores'])

total = 0
correct = 0

with torch.no_grad():
    for texts, scores in test_dataloader:
        inputs = alephbert_tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        scores = scores.to(device)

        outputs = alephbert(**inputs, return_dict=False)[0]
        _, predicted = torch.max(outputs.data, 1)
        total += scores.size(0)
        correct += (predicted == scores).sum().item()

        # outputs = outputs.reshape((16,))

        # agg score is between 1 to 7
        # outputs = np.round(outputs)
        
        d = {'predictions': [o.item() for o in predicted], 'scores': [s.item() for s in scores]}
        df = pd.DataFrame.from_dict(d)
        preds_df = pd.concat([preds_df, df], ignore_index=True, copy=False)

print('total accuracy on test set:', correct/total)

total accuracy on test set: 0.5869565217391305


In [47]:
preds_df.describe()

Unnamed: 0,predictions,scores
count,92,92
unique,2,2
top,0,0
freq,81,49


## AlephBert Regression Fine Tuning

In [None]:
agg_scores_df.describe()

In [70]:
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])

In [71]:
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [75]:
df['c'] = df['c'].apply(lambda x: int(x>=5))

In [76]:
df

Unnamed: 0,a,b,c
0,1,2,0
1,4,5,1


Unnamed: 0,text,score
0,"התחלנו, בוקר טוב - - -\n",1.0
1,"אפשר לקרוא את החומר, אני לא הספקתי לקרוא. \n",4.0
2,"אז בבקשה, יש לכם את החומרים וגם שלחתי לכם את ז...",3.0
3,"יש לנו עכשיו, את תתני לנו לקרוא?\n",5.0
4,"בטח, בטח, בבקשה, תוכלו לקרוא - - -\n",1.0
...,...,...
455,"אה, אוסאמה, אני נותן לך לדבר.\n",3.0
456,"תודה, תודה ביטן שאתה נותן לי לדבר.\n",3.0
457,מאה אחוז. \n,1.0
458,אתם מתנהגים עם טריקים ועם שטיקים. \n,5.0


In [81]:
agg_scores_df.to_csv('agg_score.csv')