# Claim Sentence Query
___

This model is based on:

```Bibtex
@inproceedings{levyUnsupervisedCorpuswideClaim2017,
  title = {Unsupervised Corpus-Wide Claim Detection},
  author = {Levy, Ran and Gretz, Shai and Sznajder, Benjamin and Hummel, Shay and Aharonov, Ranit and Slonim, Noam},
  date = {2017},
  doi = {10.18653/v1/w17-5110},
}
```

Parameters:
- Threshold for the retrieval score: a retrieved sentence is labelled as a claim only if its score is above this threshold.

In [23]:
import os
import json

from config import CLAIM_LEXICON_PATH, INDEX_PATH, PYSERINI_PATH

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

from src.searcher import convert_data, create_index
from src.dataset import load_dataset
from src.evaluation import confusion_matrix_plot

from config import CLAIM_LEXICON_PATH, PROJECT_NAME, DATASETS

import wandb

In [24]:
# --- Experiment configuration ---
# Entry of the project-level DATASETS mapping that this run is evaluated on.
dataset = DATASETS["dataset_2014"]

MODEL_NAME = "CSQ"  # reported to the tracker and used in the confusion-matrix title
FALSE_CLASS_BALANCE = 1.0  # forwarded to load_dataset(false_class_balance=...)
TRACKING = True  # when True, the run is initialised and logged via wandb

### 0. Load data

In [25]:
# Resolve the dataset file from the selected dataset entry, then load the split.
dataset_file = os.path.join(dataset["base_path"], dataset["data"])
X_train, X_test, y_train, y_test = load_dataset(
    dataset_path=dataset_file,
    false_class_balance=FALSE_CLASS_BALANCE,
)

In [26]:
# Load the claim lexicon: one entry per line of the file at CLAIM_LEXICON_PATH.
with open(CLAIM_LEXICON_PATH, "r") as lexicon_file:
    # Iterate line by line and drop blank entries. The previous
    # read().split("\n") left an empty string in the list whenever the file
    # ended with a newline (or contained blank lines), and that empty term
    # was then added to every search query.
    claim_lexicon = [line.strip() for line in lexicon_file if line.strip()]

In [27]:
# Hand the test sentences to convert_data, which writes its output under
# PYSERINI_PATH (presumably the document collection that is indexed in the
# next step -- confirm in src.searcher).
test_sentences = X_test["Sentence"]
convert_data(test_sentences, data_path=PYSERINI_PATH)

[Errno 17] File exists: 'data/pyserini'


### 1. Setup index

In [28]:
# Build the index from the data under PYSERINI_PATH and keep the returned
# searcher for the retrieval step below.
searcher = create_index(
    data_path=PYSERINI_PATH,
    index_path=INDEX_PATH,
    language="english",
)

[Errno 17] File exists: 'data/pyserini/index'
2021-12-22 14:23:46,025 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2021-12-22 14:23:46,033 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2021-12-22 14:23:46,033 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: data/pyserini
2021-12-22 14:23:46,033 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2021-12-22 14:23:46,033 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2021-12-22 14:23:46,033 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 1
2021-12-22 14:23:46,033 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Stemmer: porter
2021-12-22 14:23:46,033 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Keep stopwords? false
2021-12-22 14:23:46,033 INFO  [main] index

### 2. Search index

In [29]:
if TRACKING:
    # Run metadata reported to the tracker. The training-related fields are
    # None because this retrieval model has no training step in this notebook.
    run_config = {
        "model": MODEL_NAME,
        "setup": "pyserini index base config",
        "dataset": dataset["name"],
        "train_data_size": len(X_train),
        "validation_data_size": 0,
        "test_data_size": len(X_test),
        "batch_size": None,
        "learning_rate": None,
        "epochs": None,
        "false_class_balance": FALSE_CLASS_BALANCE,
    }
    wandb.init(project=PROJECT_NAME, config=run_config)

[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [30]:
# Tunable parameters of the retrieval step.
SCORE_THRESHOLD = 4  # a hit must score strictly above this to be labelled a claim
MAX_HITS = 1000  # maximum number of hits requested per query

# Every test sentence starts out as "not a claim"; sentences retrieved with a
# high enough score are flipped to True below.
predicted = {idx: False for idx in X_test.index}
unmatched_ids = set()  # hit ids that do not correspond to any test sentence

for main_concept in X_train["Article"].unique():
    # Query terms: the word "that", the words of the main concept and the
    # whole claim lexicon.
    should = ["that"] + main_concept.split(" ") + claim_lexicon

    hits = searcher.search(" ".join(should), k=MAX_HITS)

    for hit in hits:
        if hit.score > SCORE_THRESHOLD:
            doc_id = json.loads(hit.raw)["id"]
            if doc_id in predicted:
                predicted[doc_id] = True
            else:
                # Do not add new keys: a hit from outside the current test
                # set (e.g. left over in a pre-existing index directory)
                # would otherwise make y_pred longer than y_test.
                unmatched_ids.add(doc_id)

if unmatched_ids:
    print(f"Ignored {len(unmatched_ids)} hit id(s) that are not in X_test.index")

# Built once, after all queries have run, and in X_test order so that it lines
# up element by element with y_test (also defined when there are no articles).
y_pred = [predicted[idx] for idx in X_test.index]

### 3. Evaluate results

In [31]:
# Per-class precision / recall / F1 of the predictions on the test labels.
report = classification_report(y_test.to_list(), y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.54      0.58      0.56       147
        True       0.55      0.51      0.53       147

    accuracy                           0.54       294
   macro avg       0.54      0.54      0.54       294
weighted avg       0.54      0.54      0.54       294



In [32]:
# Scalar test metrics; the label list is materialised once and shared.
y_true = y_test.to_list()

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

In [33]:
if TRACKING:
    # Log all test metrics in a single call so they are recorded at the same
    # step. Separate wandb.log calls each advance the step counter, which
    # spread this one evaluation over four consecutive steps.
    wandb.log({
        "test_f1": f1,
        "test_recall": recall,
        "test_precision": precision,
        "test_accuracy": accuracy,
    })
    wandb.finish()

0,1
test_accuracy,▁
test_f1,▁
test_precision,▁
test_recall,▁

0,1
test_accuracy,0.54422
test_f1,0.52817
test_precision,0.54745
test_recall,0.5102


In [None]:
# Plot the confusion matrix of the test predictions for this model.
plot_title = f"{MODEL_NAME} confusion matrix"
confusion_matrix_plot(
    y_test,
    y_pred,
    label=[False, True],
    title=plot_title,
)