In [None]:
# Environment setup cell: installs Haystack and pygraphviz, then downloads and
# starts a local Elasticsearch 7.9.2 server as a background process.

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest master of Haystack (with the `colab` extra)
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

# Install pygraphviz (the graphviz development package is installed first via apt)
# NOTE(review): presumably required by the Pipeline.draw() call further down - confirm.
!apt install libgraphviz-dev
!pip install pygraphviz

# In Colab / No Docker environments: Start Elasticsearch from source
# Download the tarball quietly, unpack it, and hand the directory to the `daemon` user.
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT

# Launch Elasticsearch as a child process. stderr is merged into stdout, and the
# child switches to uid 1 before exec (preexec_fn) instead of running as the notebook user.
# NOTE(review): the stdout pipe is never read in the visible cells - confirm the
# server cannot stall once the pipe buffer fills up.
es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started (fixed 30-second pause; there is no readiness check)
! sleep 30

In [None]:
from haystack.utils import print_answers, fetch_archive_from_http, convert_files_to_dicts, clean_wiki_text, launch_es
from haystack.pipelines import Pipeline, RootNode
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import (
    ElasticsearchRetriever,
    DensePassageRetriever,
    FARMReader,
    TransformersQueryClassifier,
    SklearnQueryClassifier,
)

# Download and prepare data - 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert the downloaded files to dicts containing documents that can be indexed
# to our datastore (cleaned with clean_wiki_text, split into paragraphs).
got_dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Initialize DocumentStore and index documents
# NOTE(review): launch_es() is called even though the setup cell already starts
# Elasticsearch from the extracted tarball - confirm whether both are needed.
launch_es()
document_store = ElasticsearchDocumentStore()
# Clear the store before writing so the documents are indexed from scratch
# (presumably avoids duplicates on re-run - verify against the Haystack docs).
document_store.delete_documents()
document_store.write_documents(got_dicts)

# Initialize Sparse retriever (not referenced again in the visible cells)
es_retriever = ElasticsearchRetriever(document_store=document_store)

# Initialize dense retriever and compute embeddings for the indexed documents;
# update_existing_embeddings=False is passed so existing embeddings are left alone.
dpr_retriever = DensePassageRetriever(document_store)
document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

# Extractive QA reader used as the last node of the pipeline built below.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

## Question vs Statement Classifier


In [None]:
# Build the pipeline: Query -> DPR retriever -> question/statement classifier.
# Only the classifier's "output_1" edge (the edge treated as "question" in the
# cells below) is connected to the QA reader; the other edge has no consumer.
transformer_question_classifier = Pipeline()
transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
transformer_question_classifier.add_node(
    component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"),
    name="QueryClassifier",
    inputs=["DPRRetriever"],
)
transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
transformer_question_classifier.draw("question_classifier.png")

# A question is routed through the classifier to the QA reader, so this run
# shows reader output. (The heading used to say "DPR Results", which
# mislabelled it.)
res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?")
print("QA Reader Results" + "\n" + "=" * 15)
print_answers(res_1, details="minimum")

# A statement never reaches the reader, so only the DPR retrieval output is
# shown. (The heading used to say "ES Results", but this pipeline contains no
# Elasticsearch retriever.)
res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.")
print("DPR Results" + "\n" + "=" * 15)
print_answers(res_2, details="minimum")

In [None]:
# Here we create the question vs statement query classifier and try it on a
# few sample queries.
# TransformersQueryClassifier is already imported from haystack.nodes in the
# imports cell, so the extra import from a different module path
# (haystack.pipelines) has been dropped.

queries = [
    "Lord Eddard was the father of Arya Stark.",
    "Jon Snow was filmed in United Kingdom.",
    "who is the father of arya stark?",
    "Which country was jon snow filmed in?",
    "Did Pope Francis Just Pave the Way for Women Priests?"
]

question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")

for query in queries:
    result = question_classifier.run(query=query)
    # The second element of the run() result names the output edge that was
    # taken; "output_1" is treated as "question", anything else as "statement".
    category = "question" if result[1] == "output_1" else "statement"

    print(f"Query: {query}, raw_output: {result}, class: {category}")

Apply the classifier to our dataframe


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on the mounted Drive; the relative file names
# used by the cells below are resolved against this directory.
%cd /content/drive/MyDrive/Clean CS224N folder

In [None]:
import pandas as pd

# Load the SCBAll data, drop rows with missing values, and strip the
# '#StopClickbait' tag from the summaries.
# TODO: check what preprocessing is needed (Beicheng uses the pruned data set).
df = (
    pd.read_csv('SCBAll.csv')
    .dropna()  # remove rows containing None/NaN
    .assign(summary=lambda frame: frame['summary'].str.replace('#StopClickbait', ''))
)
df

In [None]:
# Label every title in `df` as "question" or "statement".
#
# The loop walks the frame's real index labels rather than range(786): after
# dropna() the labels may have gaps, and the row count need not be 786.
# Values are read and written with .at, because chained assignment such as
# df['classifier'][ind] = ... may write to a temporary copy and be lost.
df['classifier'] = ''
unclassified = []  # (index label, error) for titles the classifier rejected
for ind in df.index:
    try:
        query = df.at[ind, 'title']
        result = question_classifier.run(query=query)
        # "output_1" is the edge treated as "question"; anything else is a statement.
        category = "question" if result[1] == "output_1" else "statement"
        df.at[ind, 'classifier'] = category
    except Exception as err:
        # Best effort: a single bad title must not abort the pass, but the
        # failure is recorded instead of being swallowed silently.
        unclassified.append((ind, repr(err)))

if unclassified:
    print(f"{len(unclassified)} title(s) could not be classified; first failure: {unclassified[0]}")


In [None]:
# Display the frame, now including the 'classifier' column.
df

In [None]:
# Save the labelled frame to an Excel workbook in the current directory.
df.to_excel("output_class.xlsx")

In [None]:
# Count how many titles received each label.
df['classifier'].value_counts()

In [None]:
# Share of statements vs. questions among the classified titles.
# The counts are read from the 'classifier' column instead of being typed in
# by hand (previously 590 and 141), so the figures stay correct when the data
# or the classifier changes.
class_counts = df['classifier'].value_counts()
n_statements = int(class_counts.get('statement', 0))
n_questions = int(class_counts.get('question', 0))
n_classified = n_statements + n_questions
if n_classified:
    print("statements= "+str(n_statements/n_classified))
    print("question= "+str(n_questions/n_classified))
else:
    # Avoid a ZeroDivisionError when no title has been labelled yet.
    print("No titles have been classified yet.")

See whether the extractive (extr) or the abstractive (abs) model performs better for each task!

In [None]:
# Load the Longformer model outputs, which were previously saved to the
# Excel file output_ext.xlsx.
longformer_df = pd.read_excel(r'output_ext.xlsx')

In [None]:
# Label every title in `longformer_df` as "question" or "statement".
#
# The loop walks the frame's real index labels rather than a hard-coded
# range(786), so it stays correct if the row count or the index changes.
# Values are read and written with .at, because chained assignment such as
# longformer_df['classifier'][ind] = ... may write to a temporary copy and be lost.
longformer_df['classifier'] = ''
unclassified = []  # (index label, error) for titles the classifier rejected
for ind in longformer_df.index:
    try:
        query = longformer_df.at[ind, 'title']
        result = question_classifier.run(query=query)
        # "output_1" is the edge treated as "question"; anything else is a statement.
        category = "question" if result[1] == "output_1" else "statement"
        longformer_df.at[ind, 'classifier'] = category
    except Exception as err:
        # Best effort: a single bad title must not abort the pass, but the
        # failure is recorded instead of being swallowed silently.
        unclassified.append((ind, repr(err)))

if unclassified:
    print(f"{len(unclassified)} title(s) could not be classified; first failure: {unclassified[0]}")


In [None]:
# Display the Longformer frame, now including the 'classifier' column.
longformer_df

In [None]:
# Drop incomplete rows, then split the frame by predicted class:
# *_S holds the statements, *_Q the questions.
longformer_df = longformer_df.dropna()
longformer_df_S = longformer_df.loc[longformer_df['classifier'].eq("statement")]
longformer_df_Q = longformer_df.loc[longformer_df['classifier'].eq("question")]

In [None]:
# Number of statement rows, then number of question rows (one per line).
print(len(longformer_df_S), len(longformer_df_Q), sep="\n")


BERTScore for longformer_df_S (statements) and longformer_df_Q (questions)

In [None]:
# Install the packages used by the BERTScore evaluation below.
!pip install bert-score
!pip install torch

In [None]:
from bert_score import score
import numpy as np
import torch

In [None]:
# Mean BERTScore precision / recall / F for the statement subset, then for the
# question subset. The 'summary' column is passed as the first argument of
# score() and the extracted answers as the second.
# NOTE(review): only the recall tensor is NaN-filtered before averaging -
# confirm whether precision and F should be treated the same way.
for heading, tmp in (
    ("Longformer Statements", longformer_df_S),
    ("Longformer Questions", longformer_df_Q),
):
    summaries = list(map(str, tmp['summary'].tolist()))
    extracted = list(map(str, tmp["ext answer val_Sq1"].tolist()))
    Pb, Rb, Fb = score(summaries, extracted, lang='en')
    recall_without_nan = Rb[~torch.isnan(Rb)]
    print(heading)
    print("Precision: "+str(torch.mean(Pb)))
    print("Recall: "+str(torch.mean(recall_without_nan)))
    print("Fbert: "+str(torch.mean(Fb)))

ROUGE scores for the Longformer questions (Q) and statements (S)

In [None]:
!pip install rouge/requirements.txt
!pip install rouge-score

In [None]:
import numpy as np
from rouge_score import rouge_scorer


def report_rouge(frame, target_col='summary', prediction_col='ext answer val_Sq1', variants=('1', '2', 'L')):
    """Print the mean ROUGE precision, recall and F-measure for one data frame.

    Args:
        frame: DataFrame that holds both text columns.
        target_col: column passed as the first argument of ``RougeScorer.score``.
        prediction_col: column passed as the second argument of ``RougeScorer.score``.
        variants: ROUGE variants to report; each one is appended to ``'rouge'``.

    Returns:
        dict mapping each metric name (e.g. ``'rougeL'``) to a dict with the
        per-row ``precision`` / ``recall`` / ``fmeasure`` lists.

    NOTE(review): the original comments called 'summary' the hypothesis and
    'ext answer val_Sq1' the reference; the argument order is unchanged here -
    confirm which column is the gold text.
    """
    targets = [str(value) for value in frame[target_col].tolist()]
    predictions = [str(value) for value in frame[prediction_col].tolist()]
    all_results = {}
    for variant in variants:
        metric = 'rouge' + variant
        print("Rouge" + variant)
        scorer = rouge_scorer.RougeScorer([metric])
        results = {'precision': [], 'recall': [], 'fmeasure': []}
        for target, prediction in zip(targets, predictions):
            # Deliberately not named `score`: a top-level `score` variable would
            # overwrite bert_score's `score` function imported earlier and
            # break the BERTScore cell when it is run again.
            pair_score = scorer.score(target, prediction)[metric]
            results['precision'].append(pair_score.precision)
            results['recall'].append(pair_score.recall)
            results['fmeasure'].append(pair_score.fmeasure)
        print("results['precision']" + str(np.mean(results['precision'])))
        print("results['recall']" + str(np.mean(results['recall'])))
        print("results['fmeasure']" + str(np.mean(results['fmeasure'])))
        all_results[metric] = results
    return all_results


# Statements first, then questions (same order as before). The return values
# are assigned so the cell does not echo the raw per-row lists.
rouge_longformer_S = report_rouge(longformer_df_S)
rouge_longformer_Q = report_rouge(longformer_df_Q)

# Repeat the analysis for T5 (epoch 20)

In [None]:
import pandas as pd
# Load the predictions file (presumably the T5 outputs - confirm).
# NOTE(review): this rebinds `df`, which earlier held the SCBAll data.
df = pd.read_csv('predictions.csv')

In [None]:
# `t5_df` is a second name for the same DataFrame object as `df` (no copy is
# made), so changes made through one name are visible through the other.
t5_df =df
t5_df

In [None]:
# Label every title in `t5_df` as "question" or "statement".
#
# The labels are written into t5_df. The previous version wrote them into
# longformer_df (a copy-paste slip), which left t5_df['classifier'] empty and
# overwrote the Longformer labels.
# The loop walks the frame's real index labels rather than a hard-coded
# range(786), and uses .at because chained assignment such as
# t5_df['classifier'][ind] = ... may write to a temporary copy and be lost.
t5_df['classifier'] = ''
unclassified = []  # (index label, error) for titles the classifier rejected
for ind in t5_df.index:
    try:
        query = t5_df.at[ind, 'title']
        result = question_classifier.run(query=query)
        # "output_1" is the edge treated as "question"; anything else is a statement.
        category = "question" if result[1] == "output_1" else "statement"
        t5_df.at[ind, 'classifier'] = category
    except Exception as err:
        # Best effort: a single bad title must not abort the pass, but the
        # failure is recorded instead of being swallowed silently.
        unclassified.append((ind, repr(err)))

if unclassified:
    print(f"{len(unclassified)} title(s) could not be classified; first failure: {unclassified[0]}")
