# Claim Sentence Query
___

This model is based on:

```Bibtex
@inproceedings{levyUnsupervisedCorpuswideClaim2017,
  title = {Unsupervised Corpus-Wide Claim Detection},
  author = {Levy, Ran and Gretz, Shai and Sznajder, Benjamin and Hummel, Shay and Aharonov, Ranit and Slonim, Noam},
  date = {2017},
  doi = {10.18653/v1/w17-5110},
}
```

Parameter:
- Threshold for the retrieval score: a sentence is predicted to be a claim only if its retrieval score is above this value

In [1]:
import json
import os

from config import CLAIM_LEXICON_PATH, DATA_PATH, INDEX_PATH, PYSERINI_PATH

import pandas as pd
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

from src.searcher import convert_data, create_index, build_query
from src.dataset import load_dataset

from config import PROJECT_NAME, DATASET

import wandb

"-ip=127.0.0.1" is not a valid option
 -bm25.accurate               : Boolean switch to use AccurateBM25Similarity (computes accurate
                                document lengths). (default: false)
 -collection [class]          : Collection class in package 'io.anserini.collection'.
 -es                          : Indexes into Elasticsearch. (default: false)
 -es.batch [n]                : Elasticsearch batch index requests size. (default: 1000)
 -es.bulk [n]                 : Elasticsearch max bulk requests size in bytes. (default: 80000000)
 -es.connectTimeout [ms]      : Elasticsearch (low level) REST client connect timeout (in ms).
                                (default: 600000)
 -es.hostname [host]          : Elasticsearch host. (default: localhost)
 -es.index [name]             : Elasticsearch index name.
 -es.password [password]      : Elasticsearch password. (default: changeme)
 -es.poolSize [num]           : Elasticsearch client pool size. (default: 10)
 -es.port [port] 

In [None]:
# Name under which this model is reported in the experiment-tracking config.
MODEL_NAME: str = "CSQ"
# Master switch for Weights & Biases: when False, no run is initialised and no metrics are logged.
TRACKING: bool = False

### 0. Load data

In [16]:
# Unpack the four objects returned by load_dataset(). Later cells read the
# "Sentence" and "Article" columns of X_train and call y_train.to_list().
# NOTE(review): this cell's saved execution count (16) is out of sequence with
# the surrounding cells — re-run the notebook top to bottom to confirm it still works.
X_train, X_test, y_train, y_test = load_dataset()

In [4]:
# Read the claim lexicon file in one go, then break it into entries on "\n".
with open(CLAIM_LEXICON_PATH, "r") as lexicon_file:
    lexicon_text = lexicon_file.read()
claim_lexicon = lexicon_text.split("\n")

In [5]:
# Hand the training sentences to convert_data, targeting PYSERINI_PATH — the
# same directory create_index is later pointed at as its data_path.
# NOTE(review): presumably this writes one JSON document per sentence carrying an
# "id" field (the search cell reads json.loads(hit.raw)["id"]) — confirm in src.searcher.
convert_data(X_train["Sentence"], data_path=PYSERINI_PATH)

### 1. Setup index

In [6]:
# Build the index at INDEX_PATH from the documents under PYSERINI_PATH and keep
# the returned object; the search cell calls its .search(query, k=...) method.
searcher = create_index(data_path=PYSERINI_PATH, index_path=INDEX_PATH, language="english")

2021-12-13 16:35:28,294 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2021-12-13 16:35:28,296 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2021-12-13 16:35:28,297 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: data/pyserini
2021-12-13 16:35:28,298 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2021-12-13 16:35:28,299 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2021-12-13 16:35:28,300 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 1
2021-12-13 16:35:28,300 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Stemmer: porter
2021-12-13 16:35:28,301 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Keep stopwords? false
2021-12-13 16:35:28,302 INFO  [main] index.IndexCollection (IndexCollection.java:654) - 

### 2. Search index

In [None]:
if TRACKING:
    # Run metadata recorded with the W&B run. This retrieval model is not
    # trained, so the optimisation hyper-parameters are stored as None and
    # the validation split size as 0.
    run_config = {
        "model": MODEL_NAME,
        "dataset": DATASET,
        "train_data_size": len(X_train),
        "validation_data_size": 0,
        "test_data_size": len(X_test),
        "batch_size": None,
        "learning_rate": None,
        "epochs": None,
    }
    wandb.init(project=PROJECT_NAME, config=run_config)

In [7]:
# One flag per training row, keyed by the X_train index label; every sentence
# starts out as "not a claim" until the search step flips it.
predicted = dict.fromkeys(X_train.index, False)

In [8]:
# Retrieval settings for the claim sentence query.
SCORE_THRESHOLD = 5  # a hit must score strictly above this to be labelled a claim
MAX_HITS = 1000  # upper bound on the number of hits requested per query

for main_concept in X_train["Article"].unique():
    # Build one bag-of-words query per article topic: the marker word "that",
    # the words of the topic itself, and every entry of the claim lexicon.
    should = ["that"] + main_concept.split(" ") + claim_lexicon

    # Search the index. The call carries no per-article filter, so each query
    # can return sentences from any article.
    hits = searcher.search(" ".join(should), k=MAX_HITS)

    # Parse results. `scores` holds the scores of the current query only and is
    # kept for ad-hoc inspection, e.g.
    # pd.DataFrame(scores).plot(xlabel="position", ylabel="score").
    scores = []
    for hit in hits:
        if hit.score > SCORE_THRESHOLD:  # threshold for acceptable results
            # hit.raw is parsed as JSON; its "id" value is the key into `predicted`.
            # NOTE(review): an id that is not already a key of `predicted` would add
            # a new entry and make Y_pred longer than y_train — presumably ids match
            # X_train.index; confirm in convert_data.
            doc_id = json.loads(hit.raw)["id"]
            predicted[doc_id] = True
        scores.append(hit.score)

# Flatten the flags once, after all queries have run. Flags are only ever set
# to True, so a sentence flagged by any query stays flagged. Building the list
# here (rather than on every iteration) also defines Y_pred when there are no
# articles to query.
Y_pred = list(predicted.values())

### 3. Evaluate results

In [9]:
# Per-class precision / recall / F1 of the retrieval-based predictions against
# the training labels (Y_pred is aligned with the rows of X_train).
train_labels = y_train.to_list()
report_text = classification_report(train_labels, Y_pred)
print(report_text)

              precision    recall  f1-score   support

       False       0.55      0.81      0.66      1234
        True       0.64      0.34      0.44      1234

    accuracy                           0.58      2468
   macro avg       0.60      0.58      0.55      2468
weighted avg       0.60      0.58      0.55      2468



In [None]:
# Y_pred holds one flag per X_train row (it is built from X_train.index), so it
# has to be scored against y_train — the same pairing the classification report
# uses. Scoring it against y_test either fails on mismatched lengths or compares
# unrelated rows.
# NOTE(review): the tracking cell logs these values under "test_*" keys although
# they describe the training split — consider renaming the keys.
y_true = y_train.to_list()

f1 = f1_score(y_true, Y_pred)
recall = recall_score(y_true, Y_pred)
precision = precision_score(y_true, Y_pred)
accuracy = accuracy_score(y_true, Y_pred)

In [None]:
if TRACKING:
    # Log all four metrics in a single call so they are recorded on the same
    # W&B step; separate wandb.log calls each advance the step counter and
    # would spread one evaluation over four steps.
    wandb.log({
        'test_f1': f1,
        'test_recall': recall,
        'test_precision': precision,
        'test_accuracy': accuracy,
    })
