<a href="https://colab.research.google.com/github/kanalive/notebooks/blob/main/RBA_data_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from datetime import datetime
import pandas as pd
import os


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Configs


In [None]:
# Folder containing the statement .txt files that the notebook reads.
directory_path = "/content/drive/MyDrive/Colab Notebooks/RBAStatements/"
# Cosine-similarity cut-off passed to compare_docs: sentence pairs scoring
# above it count as "similar".
similarity_threshold = 0.6
# Backward-compatible alias: the misspelled name is the one other cells read.
similarity_thredhold = similarity_threshold

# Functions

In [None]:
def preprocess(doc):
    """Split a document into sentences and build a normalised form of each.

    Each sentence is tokenised, lower-cased, stripped of non-alphanumeric
    tokens and English stop words, and stemmed with the Porter stemmer.

    Args:
        doc: The document text.

    Returns:
        A tuple ``(sentences, preprocessed)``: the sentences as produced by
        ``sent_tokenize`` and, index-aligned with them, the space-joined
        stemmed tokens of each sentence.
    """
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    sentences = sent_tokenize(doc)

    preprocessed = []
    for sent in sentences:
        # Lower-case BEFORE the stop-word test so that capitalised stop words
        # (e.g. a sentence-initial "The") are filtered out as well.
        lowered = (w.lower() for w in word_tokenize(sent))
        kept = [ps.stem(w) for w in lowered if w.isalnum() and w not in stop_words]
        preprocessed.append(' '.join(kept))

    return sentences, preprocessed

def compare_docs(document1, document2, similarity_threshold=0.5):
    """Compare two documents sentence by sentence using TF-IDF cosine similarity.

    Args:
        document1: Text of the first document.
        document2: Text of the second document.
        similarity_threshold: Pairs scoring strictly above this value are
            reported as similar; a sentence whose scores against every
            sentence of the other document are all strictly below it is
            reported as added (document2) or removed (document1).

    Returns:
        A tuple ``(similar_sentences, added_sentences, removed_sentences)``
        where ``similar_sentences`` is a list of
        ``(sentence1, sentence2, score)`` tuples and the other two are lists
        of original sentences.
    """
    print("comparing document:")
    print(document1[:20])
    print(document2[:20])

    # Preprocessing
    original_sentences1, sentences1 = preprocess(document1)
    original_sentences2, sentences2 = preprocess(document2)

    # With no sentences on one side there is nothing to match: everything in
    # document2 is "added" and everything in document1 is "removed". Returning
    # early also avoids fitting the vectorizer on an empty corpus.
    if not sentences1 or not sentences2:
        return [], list(original_sentences2), list(original_sentences1)

    # Vectorization: fit on both documents so they share one vocabulary.
    tfidf = TfidfVectorizer().fit_transform(sentences1 + sentences2)
    count1 = len(sentences1)

    # One call yields the whole (len(sentences1) x len(sentences2)) score
    # matrix instead of one call per sentence pair.
    scores = cosine_similarity(tfidf[:count1], tfidf[count1:])

    similar_sentences = [
        (original_sentences1[i], original_sentences2[j], scores[i, j])
        for i in range(count1)
        for j in range(len(sentences2))
        if scores[i, j] > similarity_threshold
    ]

    # Identify added and removed sentences
    added_sentences = [
        sentence
        for j, sentence in enumerate(original_sentences2)
        if (scores[:, j] < similarity_threshold).all()
    ]
    removed_sentences = [
        sentence
        for i, sentence in enumerate(original_sentences1)
        if (scores[i, :] < similarity_threshold).all()
    ]

    return similar_sentences, added_sentences, removed_sentences


In [None]:
def read_txt_file_from_drive(file_path):
    """Return the contents of a ``.txt`` file.

    Args:
        file_path: Path to the file.

    Returns:
        The file's text, or an empty string when ``file_path`` does not end
        with ``.txt``.
    """
    if not file_path.endswith(".txt"):
        return ""

    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_txt_files_from_drive(directory):
    """Read every ``.txt`` file found directly inside ``directory``.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A list with the text of each ``.txt`` file, ordered by filename.
    """
    contents = []

    # os.listdir returns entries in arbitrary order; sort for a reproducible result.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory, filename)
        # Explicit encoding so the result does not depend on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            contents.append(file.read())

    return contents

In [None]:
def get_date(filename):
    """Parse the date encoded in a ``YYYYMMDD.txt``-style filename.

    The last four characters (the ``.txt`` extension) are dropped and the
    remainder is parsed with the ``%Y%m%d`` format.
    """
    stem = filename[:-4]
    parsed = datetime.strptime(stem, '%Y%m%d')
    return parsed

# Read RBA Statements data & compare month by month

In [None]:
def compare_two_statements(file1_name, file2_name):
    """Compare two statement files and return the results as DataFrames.

    Both files are read from ``directory_path`` and compared with
    ``compare_docs`` using the notebook-level ``similarity_thredhold``.

    Args:
        file1_name: Filename of the first statement, parseable by ``get_date``.
        file2_name: Filename of the second statement, parseable by ``get_date``.

    Returns:
        A tuple ``(ssdf, asdf, rsdf)`` of DataFrames holding the similar,
        added and removed sentences. Each frame also carries
        ``comparison_date_1`` and ``comparison_date_2`` columns with the dates
        parsed from the two filenames.
    """
    # os.path.join works whether or not directory_path ends with a separator.
    file1 = read_txt_file_from_drive(os.path.join(directory_path, file1_name))
    file2 = read_txt_file_from_drive(os.path.join(directory_path, file2_name))

    similar_sentences, added_sentences, removed_sentences = compare_docs(
        file1, file2, similarity_thredhold
    )

    comparison_date_1 = get_date(file1_name)
    comparison_date_2 = get_date(file2_name)

    ssdf = pd.DataFrame(similar_sentences, columns=["Sentence1", "Sentence2", "SimilarityScore"])
    asdf = pd.DataFrame(added_sentences, columns=["AddedSentence"])
    rsdf = pd.DataFrame(removed_sentences, columns=["RemovedSentence"])

    # Stamp every frame with both dates; all "_1" columns are added before
    # the "_2" columns so the column order matches what callers expect.
    for frame in (ssdf, asdf, rsdf):
        frame["comparison_date_1"] = comparison_date_1
    for frame in (ssdf, asdf, rsdf):
        frame["comparison_date_2"] = comparison_date_2

    return ssdf, asdf, rsdf

In [None]:
# os.listdir returns entries in arbitrary order, so keep only the statement
# files and sort them by the date encoded in their names; otherwise
# "consecutive" files are not consecutive statements.
filenames = sorted(
    (name for name in os.listdir(directory_path) if name.endswith(".txt")),
    key=get_date,
)

sdf_columns = ["Sentence1", "Sentence2", "SimilarityScore", "ComparisonDate1","ComparisonDate2"]
adf_columns = ["AddedSentence", "ComparisonDate1","ComparisonDate2"]
rdf_columns = ["RemovedSentence", "ComparisonDate1","ComparisonDate2"]

# Collect the per-pair frames and concatenate once at the end instead of
# growing the result frames inside the loop.
similar_frames = []
added_frames = []
removed_frames = []

for earlier_name, later_name in zip(filenames, filenames[1:]):
    ssdf, asdf, rsdf = compare_two_statements(earlier_name, later_name)
    ssdf.columns = sdf_columns
    asdf.columns = adf_columns
    rsdf.columns = rdf_columns

    similar_frames.append(ssdf)
    added_frames.append(asdf)
    removed_frames.append(rsdf)

# Fall back to empty frames with the expected columns when there are fewer
# than two statements (pd.concat rejects an empty list).
similar_df = pd.concat(similar_frames) if similar_frames else pd.DataFrame(columns=sdf_columns)
added_df = pd.concat(added_frames) if added_frames else pd.DataFrame(columns=adf_columns)
removed_df = pd.concat(removed_frames) if removed_frames else pd.DataFrame(columns=rdf_columns)



comparing document:
7 November 2007
At i
5 December 2007
At i
comparing document:
5 December 2007
At i
2 December 2008
At i
comparing document:
2 December 2008
At i
4 November 2008
At i
comparing document:
4 November 2008
At i
7 October 2008
At it
comparing document:
7 October 2008
At it
5 August 2008
At its
comparing document:
5 August 2008
At its
2 September 2008
At 
comparing document:
2 September 2008
At 
1 July 2008
At its m
comparing document:
1 July 2008
At its m
3 June 2008
At its m
comparing document:
3 June 2008
At its m
6 May 2008
At its me
comparing document:
6 May 2008
At its me
1 April 2008
At its 
comparing document:
1 April 2008
At its 
4 March 2008
At its 
comparing document:
4 March 2008
At its 
1 December 2009
At i
comparing document:
1 December 2009
At i
5 February 2008
At i
comparing document:
5 February 2008
At i
3 November 2009
At i
comparing document:
3 November 2009
At i
6 October 2009
At it
comparing document:
6 October 2009
At it
1 September 2009
At 
comparin

In [None]:
# Similar-sentence pairs for the comparison whose first statement is dated 2008-02-05.
target_date = datetime.strptime("2008-02-05", "%Y-%m-%d")
similar_df[similar_df["ComparisonDate1"] == target_date]


Unnamed: 0,Sentence1,Sentence2,SimilarityScore,ComparisonDate1,ComparisonDate2
0,"5 February 2008\nAt its meeting today, the Boa...","3 November 2009\nAt its meeting today, the Boa...",0.606418,2008-02-05,2009-11-03


In [None]:
# Sentences added in the comparison whose first statement is dated 2008-02-05.
target_date = datetime.strptime("2008-02-05", "%Y-%m-%d")
added_df[added_df["ComparisonDate1"] == target_date]


Unnamed: 0,AddedSentence,ComparisonDate1,ComparisonDate2
0,The global economy has resumed growth.,2008-02-05,2009-11-03
1,With economic policy settings likely to remain...,2008-02-05,2009-11-03
2,The expansion is generally expected to be mode...,2008-02-05,2009-11-03
3,Prospects for Australia's Asian trading partne...,2008-02-05,2009-11-03
4,"Growth in China has been very strong, which is...",2008-02-05,2009-11-03
5,"For Australia's trading partner group, growth ...",2008-02-05,2009-11-03
6,Sentiment in global financial markets is much ...,2008-02-05,2009-11-03
7,"Nonetheless, the state of balance sheets in so...",2008-02-05,2009-11-03
8,Economic conditions in Australia have been str...,2008-02-05,2009-11-03
9,Some spending has probably been brought forwar...,2008-02-05,2009-11-03


# Compile all sentences and assign a hawkish/dovish score

In [None]:
# Despite the name, each element is the full text of one .txt file, not a
# single sentence; splitting into sentences happens in a later cell.
list_sentences = read_txt_files_from_drive(directory_path)

In [None]:
# Sentence-split every document, keeping both the original sentences and
# their preprocessed forms (one inner list per document).
list_s = []
list_preprocessed_s = []
for s in list_sentences:
    sentences, preprocessed = preprocess(s)
    list_s.append(sentences)
    list_preprocessed_s.append(preprocessed)

# Flatten to a single list of sentences across all documents.
merged_list = [item for sublist in list_s for item in sublist]
# dict.fromkeys de-duplicates while keeping first-seen order. list(set(...))
# yields an order that can change between kernel sessions, which would make
# the row indices of any DataFrame built from it irreproducible.
distinct_list = list(dict.fromkeys(merged_list))

In [None]:
# One row per distinct sentence, in a single "Sentence" column.
sentence_columns = ["Sentence"]
rba_statement_senteces = pd.DataFrame(data=distinct_list, columns=sentence_columns)

In [None]:
# Case-insensitive search for sentences mentioning "wage growth"; missing
# values count as non-matches.
mentions_wage_growth = rba_statement_senteces['Sentence'].str.contains('wage growth', case=False, na=False)
df_filtered = rba_statement_senteces[mentions_wage_growth]
df_filtered

Unnamed: 0,Sentence
110,"Wage growth remains low, however, and this is ..."
590,"Notwithstanding the improving labour market, w..."
627,"Wage growth remains subdued in most countries,..."
1136,Wage growth remains slow and this is likely to...
1489,"Consistent with this, the rate of wage growth ..."
1622,"Against this, however, wage growth remains low..."
1865,"This is likely to continue for a while yet, al..."
1973,"However, wage growth remains low."
2162,Wage growth remains slow.
2809,"Wage growth remains low in most countries, as ..."
