<a href="https://colab.research.google.com/github/hkayesh/causal-qa/blob/master/Causal_QA_(using_Causal_Net).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import packages

In [0]:
import re
import nltk
import string
import warnings
import pandas as pd
import networkx as nx
import numpy as np

from gensim.utils import tokenize, lemmatize
from nltk.corpus import stopwords

from google.colab import drive 
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix

In [4]:
# installing pattern package erquired for gensim's lemmatize() 
!pip install git+git://github.com/pattern3/pattern.git

Collecting git+git://github.com/pattern3/pattern.git
  Cloning git://github.com/pattern3/pattern.git to /tmp/pip-req-build-__vm84do
  Running command git clone -q git://github.com/pattern3/pattern.git /tmp/pip-req-build-__vm84do
Building wheels for collected packages: pattern
  Building wheel for pattern (setup.py) ... [?25l[?25hdone
  Created wheel for pattern: filename=pattern-2.6-py2.py3-none-any.whl size=18553736 sha256=b7452a231a6b74605b07490ced30b61bf189f083598806ada6ed943f36167832
  Stored in directory: /tmp/pip-ephem-wheel-cache-vfevb7ee/wheels/42/86/32/4c2c2365f5f4247ff44ae48bb2290f4fb024b2d2a48bf52a32
Successfully built pattern


In [5]:
# Only the stopword corpus is downloaded; it backs stopwords.words('english'),
# which Preprocessor.lemmatize_doc passes to lemmatize(). The other corpora stay disabled.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Mount Google Drive

In [6]:
# Mount Google Drive at /content/gdrive (interactive: asks for an authorization code).
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


### Declare Variables

In [0]:
# Root of the CausalQA folder inside the Google Drive mount (/content/gdrive).
# Every path is absolute so that none of them depends on the notebook's
# current working directory.
CAUSAL_QA_ROOT = '/content/gdrive/My Drive/Research Data/CausalQA/'

DATASET_DIR = CAUSAL_QA_ROOT + 'datasets/'
CAUSAL_TUPLES_DIR = CAUSAL_QA_ROOT + 'causalTuples/'
CAUSAL_NET_FILE = CAUSAL_QA_ROOT + 'causal_net/causal_net_100k.pickle'
CAUSAL_PAIRS_1M_ARTICLES_FILE = CAUSAL_QA_ROOT + 'causal_pairs_1M_articles.csv'

### Load background dataset

In [0]:
# Accumulator for background (cause, effect) phrase pairs.
bg_tuples = []

# Disabled alternative loader: reads '-->'-separated rows from the *.argsC files
# under CAUSAL_TUPLES_DIR and strips '_TAG' suffixes (regex '_[A-Z]+') before lower-casing.
# causal_tuples_files = ['nyt_mar30_combo.argsC', 'apw_mar30_combo.argsC', 'afp_mar30_combo.argsC', 'ltw_mar30_combo.argsC', 'simpleWiki_mar19b_combo.argsC', 'xin_mar30_combo.argsC']
# for causal_tuples_file in causal_tuples_files:
#   with open(CAUSAL_TUPLES_DIR + causal_tuples_file, 'r') as rows:
#     for index, row in enumerate(rows):
#       splits = row.split('-->', 2)
#       cause_phrase = re.sub('_[A-Z]+', '', splits[0]).strip().lower()
#       effect_phrase = re.sub('_[A-Z]+', '', splits[1]).strip().lower()

#       bg_tuples.append((cause_phrase, effect_phrase))

# len(bg_tuples)

In [9]:
# Load only the first 100,000 rows of the causal-pairs CSV.
# NOTE(review): error_bad_lines is deprecated in newer pandas releases in favour
# of on_bad_lines — confirm the pandas version before upgrading.
causal_pairs_df = pd.read_csv(
    CAUSAL_PAIRS_1M_ARTICLES_FILE,
    nrows=100000,
    lineterminator='\n',
    error_bad_lines=False,
)
print(causal_pairs_df.shape)

# Append (sequence_a, sequence_b) pairs to the background tuples.
# Re-running this cell appends the same rows again.
bg_tuples += zip(causal_pairs_df['sequence_a'], causal_pairs_df['sequence_b'])
 

(100000, 3)


### Prepare Causal Network 

In [0]:
class Preprocessor():
  """
  Text preprocessor whose steps are enabled by name.

  :param params: collection of step names to enable; only 'lemmatize' is
                 recognised. Omitting it (or passing None) enables nothing.
  """

  def __init__(self, params=None):
    # None replaces the former mutable default (`list()`), which was shared between calls.
    self.lemmatize = params is not None and 'lemmatize' in params
    # English stopword set; built on first use and reused for every later document.
    self._stopwords = None

  def preprocess(self, document):
    """
    Run the enabled preprocessing operations on the input string.

    :param document: a string to be preprocessed.
    :return string: processed string with surrounding whitespace removed
    """
    processed_doc = self.lemmatize_doc(document) if self.lemmatize else document

    return processed_doc.strip()

  def lemmatize_doc(self, document):
    """
    Apply lemmatization on each word of a string.

    Tokens returned by lemmatize() are decoded as ASCII and cut at the first
    '/' to discard the POS tag; tokens that are not valid ASCII are dropped.

    :param document: a string
    :return string: the lemmatized words joined by single spaces
    """
    if self._stopwords is None:
      # Loading the corpus for every document was loop-invariant work.
      self._stopwords = set(stopwords.words('english'))

    processed_tokens = []
    for lemma_token in lemmatize(document, stopwords=self._stopwords):
      try:
        token = lemma_token.decode('ascii').split('/')[0]  # Discard POS tags
      except UnicodeDecodeError:
        # Deliberate best effort: skip tokens that cannot be decoded as ASCII.
        continue
      processed_tokens.append(token)

    return ' '.join(processed_tokens)

class CausalNetGenerator():
  """Builds a directed, frequency-weighted token graph from (cause, effect) phrase pairs."""

  def __init__(self):
    self.preprocessor = Preprocessor(['lemmatize'])

  def clean_up_text(self, causal_pairs):
    """
    Preprocess both phrases of every pair and drop pairs left with an empty side.

    :param causal_pairs: iterable of (cause phrase, effect phrase)
    :return list: (processed cause, processed effect) tuples
    """
    cleaned = []
    for pair in causal_pairs:
      cause_text = self.preprocessor.preprocess(pair[0])
      effect_text = self.preprocessor.preprocess(pair[1])

      if not cause_text or not effect_text:
        continue
      cleaned.append((cause_text, effect_text))

    return cleaned

  def get_causal_token_pairs(self, causal_pair_phrase):
    """
    Pair every token of the cause phrase with every token of the effect phrase.

    :param causal_pair_phrase: (cause phrase, effect phrase)
    :return list: (cause token, effect token) tuples, cause-major order
    """
    cause_tokens = list(tokenize(causal_pair_phrase[0]))
    effect_tokens = list(tokenize(causal_pair_phrase[1]))

    return [(cause, effect) for cause in cause_tokens for effect in effect_tokens]

  def create_or_update_directed_causal_graph(self, causal_pairs, graph=None):
    """
    Add the token pairs of all phrase pairs to a directed graph, counting
    repeated cause->effect edges in the edge attribute 'freq'.

    :param causal_pairs: iterable of (cause phrase, effect phrase)
    :param graph: existing graph to update; a new nx.DiGraph is created if None
    :return: the created or updated graph
    """
    if graph is None:
      graph = nx.DiGraph()

    for phrase_pair in self.clean_up_text(causal_pairs):
      for cause_token, effect_token in self.get_causal_token_pairs(phrase_pair):
        if graph.has_successor(cause_token, effect_token):
          graph[cause_token][effect_token]['freq'] += 1
        else:
          # add_edge creates any missing endpoint (cause first, then effect).
          graph.add_edge(cause_token, effect_token, freq=1)

    return graph

# Build the token-level causal graph from all background pairs
# (each phrase is lemmatized before its tokens are paired).
causal_net_generator = CausalNetGenerator()
causal_net = causal_net_generator.create_or_update_directed_causal_graph(bg_tuples)

# Persist the graph to CAUSAL_NET_FILE, the path later handed to CausalStrengthCalculator.
nx.write_gpickle(causal_net, CAUSAL_NET_FILE)


### Causal Strength Calculation

In [0]:

class CausalStrengthCalculator:
    """
    Scores candidate (cause, effect) phrase pairs against a pickled causal graph
    whose edges carry a 'freq' count.

    Attributes set on construction:
      causal_net -- the graph loaded from `causal_net_file`
      N          -- number of nodes in the graph
      M          -- sum of 'freq' over all edges
    """

    def __init__(self, causal_net_file):
        """
        :param causal_net_file: path of a graph written with nx.write_gpickle
        """
        self.causal_net_file = causal_net_file
        # NOTE: read_gpickle unpickles the file; only load graphs from a trusted source.
        self.causal_net = nx.read_gpickle(self.causal_net_file)
        self.N = len(self.causal_net.nodes())
        self.M = sum(edge[2]['freq'] for edge in self.causal_net.edges(data=True))

    def get_prior_probas(self, i_c, j_e):
        """
        Estimate the probabilities used by the causal-strength formula.

        :param i_c: cause token
        :param j_e: effect token
        :return dict: 'p_of_i_c' (outgoing freq of i_c / M), 'p_of_j_e' (incoming
                      freq of j_e / M) and 'p_of_i_c_and_j_e' (freq of the edge
                      i_c -> j_e / N); each is 0 when the node or edge is absent
        """
        prior_probas = {
            'p_of_i_c': 0,
            'p_of_j_e': 0,
            'p_of_i_c_and_j_e': 0
        }

        # An existing edge implies that both endpoints exist.
        if self.causal_net.has_edge(i_c, j_e):
            prior_probas['p_of_i_c_and_j_e'] = self.causal_net[i_c][j_e]['freq'] / self.N

        if self.causal_net.has_node(i_c):
            number_of_pairs_with_i_c = sum(
                self.causal_net[i_c][successor]['freq']
                for successor in self.causal_net.successors(i_c)
            )
            prior_probas['p_of_i_c'] = number_of_pairs_with_i_c / self.M

        if self.causal_net.has_node(j_e):
            number_of_pairs_with_j_e = sum(
                self.causal_net[predecessor][j_e]['freq']
                for predecessor in self.causal_net.predecessors(j_e)
            )
            prior_probas['p_of_j_e'] = number_of_pairs_with_j_e / self.M

        return prior_probas

    def get_causal_strength(self, i_c, j_e, alpha=0.66, cs_lambda=0.5):
        """
        Combine the necessity and sufficiency strengths of i_c -> j_e.

        :param i_c: cause token
        :param j_e: effect token
        :param alpha: exponent applied to the marginal probability in each term
        :param cs_lambda: weight of the necessity term; (1 - cs_lambda) weights sufficiency
        :return: cs_nec ** cs_lambda * cs_suf ** (1 - cs_lambda), or 0 when either
                 marginal probability is 0
        """
        cs_of_i_c_and_j_e = 0
        prior_probas = self.get_prior_probas(i_c, j_e)
        p_cause = prior_probas['p_of_i_c']
        p_effect = prior_probas['p_of_j_e']
        p_joint = prior_probas['p_of_i_c_and_j_e']

        if p_cause > 0 and p_effect > 0:
            cs_nec_of_i_c_and_j_e = p_joint / ((p_cause ** alpha) * p_effect)
            cs_suf_of_i_c_and_j_e = p_joint / (p_cause * (p_effect ** alpha))
            cs_of_i_c_and_j_e = (cs_nec_of_i_c_and_j_e ** cs_lambda) * (cs_suf_of_i_c_and_j_e ** (1 - cs_lambda))

        return cs_of_i_c_and_j_e

    def get_causality_score(self, candidate_causal_pair):
        """
        Score a phrase pair as the sum of token-pair causal strengths divided by
        the total number of tokens in both phrases.

        :param candidate_causal_pair: (cause phrase, effect phrase)
        :return: the normalised score; 0 when neither phrase yields a token
        """
        # NOTE(review): tokens are looked up exactly as tokenize() returns them, while
        # the graph built in this notebook holds lemmatized tokens — confirm whether
        # candidates should be normalised the same way.
        T_1 = list(tokenize(candidate_causal_pair[0]))
        T_2 = list(tokenize(candidate_causal_pair[1]))

        token_count = len(T_1) + len(T_2)
        if token_count == 0:
            # Previously a ZeroDivisionError (e.g. two empty phrases).
            return 0

        total_causal_strength = sum(
            self.get_causal_strength(i_c, j_e) for i_c in T_1 for j_e in T_2
        )

        return total_causal_strength / token_count

# causal_strength_calculator = CausalStrengthCalculator(CAUSAL_NET_FILE)

# causal_strength_calculator.get_causality_score(('blindness', 'disease'))

### Load Evaluation Datasets

In [0]:
# Paths of the evaluation CSV files, all located directly under DATASET_DIR.
semeval_file_path = DATASET_DIR + 'semeval-benchmark-v1.csv'
risk_file_path = DATASET_DIR + 'risk-models-benchmark-v1.csv'
nato_sfa_file_path = DATASET_DIR + 'nato-sfa-benchmark-v1.csv'
ce_me_file_path = DATASET_DIR + 'ce-me-benchmark-v1.csv'
ce_twitter_file_path = DATASET_DIR + 'twitter-causal-dataset.csv'

In [13]:
# Read the file as headerless and name its columns cause, effect, label.
sem_eval_df = pd.read_csv(semeval_file_path, names=['cause', 'effect', 'label'], header=None)
sem_eval_df.head()

Unnamed: 0,cause,effect,label
0,dwarf,emission,causal
1,disease,blindness,causal
2,women,accident,causal
3,reading,rage,causal
4,snowstorm,losses,causal


In [14]:
# Show six rows drawn from position 865 onward; random_state fixes the sample.
# NOTE(review): 865 presumably marks where the non_causal rows start — confirm against the file.
sem_eval_df[865:].sample(n=6, random_state=1)

Unnamed: 0,cause,effect,label
1465,protein,researchers,non_causal
921,rocks,pile,non_causal
1349,copper,tissue,non_causal
949,article,criticisms,non_causal
1524,drum,ear,non_causal
1226,work,difficulties,non_causal


In [15]:
# Read the file as headerless and name its columns cause, effect, label.
risk_df = pd.read_csv(risk_file_path, names=['cause', 'effect', 'label'], header=None)
# Raw strings: '\s', '\[' and '\d' are invalid escape sequences in an ordinary
# string literal and trigger a warning on current Python versions.
risk_df['cause'] = risk_df['cause'].apply(lambda x: re.sub(r'\s\[\d+\]$', '', x))  # remove ' [n]' from the end
risk_df['effect'] = risk_df['effect'].apply(lambda x: re.sub(r'\s\[\d+\]$', '', x))  # remove ' [n]' from the end
risk_df.head()

Unnamed: 0,cause,effect,label
0,new competitors,increasing profits for our clients,causal
1,new competitors,"increased speed efficiency, and lower cost",causal
2,"increased speed efficiency, and lower cost",increasing profits for our clients,causal
3,"increased speed efficiency, and lower cost",increasing profits for our clients,causal
4,changing market driving the needs for new busi...,increased importance of bundling products,causal


In [16]:
# Read the file as headerless and name its columns cause, effect, label.
nato_sfa_df = pd.read_csv(nato_sfa_file_path, names=['cause', 'effect', 'label'], header=None)
nato_sfa_df.head()

Unnamed: 0,cause,effect,label
0,Increased global inequality,Migration,causal
1,Natural disasters,Unavailability of national military assets due...,causal
2,Increasingly connected human networks,An increasing need to understand human networks,causal
3,Fractured and/or polarized societies,Instability and civil war,causal
4,Fractured and/or polarized societies,Instability along NATO’s border causing large-...,causal


In [17]:
# Read the file as headerless and name its columns cause, effect, label.
ce_me_df = pd.read_csv(ce_me_file_path, names=['cause', 'effect', 'label'], header=None)
ce_me_df.head()

Unnamed: 0,cause,effect,label
0,A strong dollar and a low oil price,profits of multinationals have dropped by 25%,causal
1,country increases imports,country decreasing balance of trade,causal
2,plunge in the value of local currency,country decreasing foreign reserves,causal
3,increase in national debt,decrease in real annual economic growth,causal
4,increase in the demand for a currency,rise in the exchange rate,causal


In [18]:
# This file is read with its own header row (no explicit column names are given).
ce_twitter_df = pd.read_csv(ce_twitter_file_path)
# Rename the 'not-causal' label to 'non_causal'; every other value is kept as is.
ce_twitter_df['label'] = ce_twitter_df['label'].replace('not-causal', 'non_causal')
ce_twitter_df.head()

Unnamed: 0,cause,effect,label
0,i ned to be front and centre,it’s al about me,non_causal
1,families truly suport girl-child,we can se that sky to is not the limit,causal
2,blinding youth with pelet guns was a #comonwea...,india would win a gold every hour,non_causal
3,they were so intolerant to an individuals thou...,they cant telecast #comonwealthgames2018 medal...,causal
4,you can't wait until then,you can watch it here:,causal


### Perform Evaluation

In [23]:
# Module-level calculator used by the decision functions below; loads the graph from CAUSAL_NET_FILE.
causal_strength_calculator = CausalStrengthCalculator(CAUSAL_NET_FILE)

# Example call:
# causal_strength_calculator.get_causality_score(('blindness', 'disease'))

def get_causality_decisions_extended(candidate_pairs):
  """
  Label each candidate pair by comparing its causality score in both directions.

  For a pair (x, y) the scores of x->y and y->x are normalised to sum to 1; the
  label is 1 when the x->y share is strictly larger and 0 otherwise (including
  ties and pairs whose two scores sum to 0).

  :param candidate_pairs: iterable of (cause phrase, effect phrase)
  :return: 1-D numpy integer array of 0/1 labels, empty for empty input
  """
  preds = []

  for candidate_pair in candidate_pairs:
    x_may_cause_y_score = causal_strength_calculator.get_causality_score((candidate_pair[0], candidate_pair[1]))
    y_may_cause_x_score = causal_strength_calculator.get_causality_score((candidate_pair[1], candidate_pair[0]))

    pred = (0.0, 0.0)

    # Named `score_total` rather than `sum`, which shadowed the builtin.
    score_total = x_may_cause_y_score + y_may_cause_x_score
    if score_total > 0:
      # Index 1 holds the x->y share so that argmax == 1 means "x causes y".
      pred = (y_may_cause_x_score / score_total, x_may_cause_y_score / score_total)
    preds.append(pred)

  # reshape(-1, 2) keeps the array two-dimensional when there are no candidates,
  # where argmax over axis 1 used to raise.
  pred_matrix = np.asarray(preds, dtype=float).reshape(-1, 2)

  return np.argmax(pred_matrix, axis=1)

def get_causality_decisions(candidate_pairs):
  """
  Label each candidate pair as causal when its causality score is positive.

  :param candidate_pairs: iterable of (cause phrase, effect phrase)
  :return list: 1 where the cause->effect score is greater than 0, else 0
  """
  return [
    1 if causal_strength_calculator.get_causality_score((pair[0], pair[1])) > 0 else 0
    for pair in candidate_pairs
  ]

def evaluate(candidate_pairs, gold_labels):
  """
  Predict a label for every candidate pair and score the predictions.

  :param candidate_pairs: iterable of (cause phrase, effect phrase)
  :param gold_labels: list of 0/1 gold labels, one per candidate pair
  :return dict: true_positive, false_positive, precision, recall, f1_score,
                accuracy and auc; auc is NaN when gold_labels has a single class
  """
  predictions = get_causality_decisions(candidate_pairs)
  # predictions = get_causality_decisions_extended(candidate_pairs)

  # labels=[0, 1] forces a 2x2 matrix; without it a dataset in which only one
  # class occurs yields a 1x1 matrix and the four-way unpacking fails.
  tn, fp, fn, tp = confusion_matrix(gold_labels, predictions, labels=[0, 1]).ravel()

  # roc_auc_score raises when only one class is present in the gold labels.
  # It is computed here from hard 0/1 predictions, not from scores.
  if len(set(gold_labels)) > 1:
    auc = roc_auc_score(gold_labels, predictions)
  else:
    auc = float('nan')

  scores = {
    'true_positive': tp,
    'false_positive': fp,
    'precision': precision_score(gold_labels, predictions),
    'recall': recall_score(gold_labels, predictions),
    'f1_score': f1_score(gold_labels, predictions),
    'accuracy': accuracy_score(gold_labels, predictions),
    'auc': auc
  }

  return scores

# Suppress RuntimeWarnings for the evaluation run.
warnings.filterwarnings("ignore", category=RuntimeWarning)

dataset_names = ['SemEval', 'NATO-SFA', 'Risk Models', 'CE Pairs', 'Twitter']
dataset_dfs = [sem_eval_df, nato_sfa_df, risk_df, ce_me_df, ce_twitter_df]

# Printed columns: dataset, true positives, false positives, accuracy, precision, recall, F1-score, AUC
row_format = '{}, {}, {}, {:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}'
score_columns = ('true_positive', 'false_positive', 'accuracy', 'precision', 'recall', 'f1_score', 'auc')

for dataset_df, dataset_name in zip(dataset_dfs, dataset_names):
  causes = dataset_df['cause'].tolist()
  effects = dataset_df['effect'].tolist()
  assert len(causes) == len(effects)

  candidate_pairs = zip(causes, effects)
  # 'causal' is the positive class; every other label value counts as 0.
  labels = [int(label == 'causal') for label in dataset_df['label'].tolist()]

  scores = evaluate(candidate_pairs, labels)
  print(row_format.format(dataset_name, *(scores[column] for column in score_columns)))

SemEval, 79, 50, 0.5168, 0.6124, 0.0913, 0.1590, 0.5168
NATO-SFA, 20, 16, 0.5339, 0.5556, 0.3390, 0.4211, 0.5339
Risk Models, 320, 328, 0.4900, 0.4938, 0.7960, 0.6095, 0.4900
CE Pairs, 99, 93, 0.5188, 0.5156, 0.6188, 0.5625, 0.5187
Twitter, 232, 192, 0.5426, 0.5472, 0.5054, 0.5255, 0.5427
