<a href="https://colab.research.google.com/github/kanalive/notebooks/blob/main/RBA_data_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Functions

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from datetime import datetime
import pandas as pd

def preprocess(doc):
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    sentences = sent_tokenize(doc)
    preprocessed = [ ' '.join(ps.stem(w.lower()) for w in word_tokenize(sent) if not w in stop_words and w.isalnum()) for sent in sentences]
    return sentences, preprocessed

def compare_docs(document1, document2, similarity_threshold=0.5):
    print("comparing document:")
    print(document1[:20])
    print(document2[:20])
    # Preprocessing
    original_sentences1, sentences1 = preprocess(document1)
    original_sentences2, sentences2 = preprocess(document2)

    # Vectorization
    vectorizer = TfidfVectorizer().fit_transform(sentences1 + sentences2)

    # Compute similarity
    similar_sentences = []
    similarity_scores = {}

    for i in range(len(sentences1)):
        for j in range(len(sentences2)):
            similarity = cosine_similarity(vectorizer[i:i+1], vectorizer[len(sentences1)+j:len(sentences1)+j+1])
            similarity_scores[(original_sentences1[i], original_sentences2[j])] = similarity[0][0]
            if similarity > similarity_threshold:
                similar_sentences.append((original_sentences1[i], original_sentences2[j], similarity[0][0]))

    # Identify added and removed sentences
    added_sentences = [s for s in original_sentences2 if all(similarity_scores.get((sent1, s), 0) < similarity_threshold for sent1 in original_sentences1)]
    removed_sentences = [s for s in original_sentences1 if all(similarity_scores.get((s, sent2), 0) < similarity_threshold for sent2 in original_sentences2)]

    return similar_sentences, added_sentences, removed_sentences


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
def read_txt_file_from_drive(file_path):
    text = ""

    if file_path.endswith(".txt"):
      with open(file_path, 'r') as file:
          text = file.read()

    return text

In [53]:
# Function to extract date from filename
def get_date(filename):
    date_str = filename[:-4] # Remove the .txt part
    return datetime.strptime(date_str, '%Y%m%d')

#Read RBA Statements data & Compare

In [54]:
directory_path = "/content/drive/MyDrive/Colab Notebooks/RBAStatements/"

In [59]:
def compare_two_statements(file1_name, file2_name):

  file1 = read_txt_file_from_drive(directory_path + file1_name)
  file2 = read_txt_file_from_drive(directory_path + file2_name)

  similar_sentences, added_sentences, removed_sentences = compare_docs(file1, file2,0.6)

  comparison_date_1 = get_date(file1_name)
  comparison_date_2 = get_date(file2_name)

  ssdf = pd.DataFrame(similar_sentences, columns= ["Sentence1", "Sentence2", "SimilarityScore"])
  asdf = pd.DataFrame(added_sentences, columns=["AddedSentence"])
  rsdf = pd.DataFrame(removed_sentences, columns= ["RemovedSentence"])

  ssdf["comparison_date_1"] = comparison_date_1
  asdf["comparison_date_1"] = comparison_date_1
  rsdf["comparison_date_1"] = comparison_date_1

  ssdf["comparison_date_2"] = comparison_date_2
  asdf["comparison_date_2"] = comparison_date_2
  rsdf["comparison_date_2"] = comparison_date_2

  return ssdf, asdf, rsdf

In [56]:
import os

filenames = os.listdir(directory_path)

def get_date(filename):
    date_str = filename[:-4] # Remove the .txt part
    return datetime.strptime(date_str, '%Y%m%d')

# Sort the list of filenames
filenames.sort(key=get_date)

print(filenames)


['20071107.txt', '20071205.txt', '20080205.txt', '20080304.txt', '20080401.txt', '20080506.txt', '20080603.txt', '20080701.txt', '20080805.txt', '20080902.txt', '20081007.txt', '20081104.txt', '20081202.txt', '20090203.txt', '20090303.txt', '20090407.txt', '20090505.txt', '20090602.txt', '20090707.txt', '20090804.txt', '20090901.txt', '20091006.txt', '20091103.txt', '20091201.txt', '20100202.txt', '20100302.txt', '20100406.txt', '20100504.txt', '20100601.txt', '20100706.txt', '20100803.txt', '20100907.txt', '20101005.txt', '20101102.txt', '20101207.txt', '20110201.txt', '20110301.txt', '20110405.txt', '20110503.txt', '20110607.txt', '20110705.txt', '20110802.txt', '20110906.txt', '20111004.txt', '20111101.txt', '20111206.txt', '20120207.txt', '20120306.txt', '20120403.txt', '20120501.txt', '20120605.txt', '20120703.txt', '20120807.txt', '20120904.txt', '20121002.txt', '20121106.txt', '20121204.txt', '20130205.txt', '20130305.txt', '20130402.txt', '20130507.txt', '20130604.txt', '201307

In [65]:
sdf_columns = ["Sentence1", "Sentence2", "SimilarityScore", "ComparisonDate1","ComparisonDate2"]
adf_columns = ["AddedSentence", "ComparisonDate1","ComparisonDate2"]
rdf_columns = ["RemovedSentence", "ComparisonDate1","ComparisonDate2"]

similar_df = pd.DataFrame(columns=sdf_columns)
added_df = pd.DataFrame(columns=adf_columns)
removed_df = pd.DataFrame(columns=rdf_columns)

for i, filename in enumerate(filenames):
    if i<len(filenames)-1:
      ssdf, asdf, rsdf = compare_two_statements(filename, filenames[i+1])
      ssdf.columns = sdf_columns
      asdf.columns = adf_columns
      rsdf.columns = rdf_columns

      similar_df = pd.concat([similar_df, ssdf])
      added_df = pd.concat([added_df, asdf])
      removed_df = pd.concat([removed_df, rsdf])



comparing document:
7 November 2007
At i
5 December 2007
At i
comparing document:
5 December 2007
At i
5 February 2008
At i
comparing document:
5 February 2008
At i
4 March 2008
At its 
comparing document:
4 March 2008
At its 
1 April 2008
At its 
comparing document:
1 April 2008
At its 
6 May 2008
At its me
comparing document:
6 May 2008
At its me
3 June 2008
At its m
comparing document:
3 June 2008
At its m
1 July 2008
At its m
comparing document:
1 July 2008
At its m
5 August 2008
At its
comparing document:
5 August 2008
At its
2 September 2008
At 
comparing document:
2 September 2008
At 
7 October 2008
At it
comparing document:
7 October 2008
At it
4 November 2008
At i
comparing document:
4 November 2008
At i
2 December 2008
At i
comparing document:
2 December 2008
At i
3 February 2009
At i
comparing document:
3 February 2009
At i
3 March 2009
At its 
comparing document:
3 March 2009
At its 
7 April 2009
At its 
comparing document:
7 April 2009
At its 
5 May 2009
At its me
comparin

In [67]:
similar_df[similar_df["ComparisonDate1"] == datetime.strptime("2008-02-05", "%Y-%m-%d")]


Unnamed: 0,Sentence1,Sentence2,SimilarityScore,ComparisonDate1,ComparisonDate2
0,"5 February 2008\nAt its meeting today, the Boa...","4 March 2008\nAt its meeting today, the Board ...",0.763774,2008-02-05,2008-03-04
1,CPI inflation on a year-ended basis picked up ...,"Inflation was high in 2007, with an annual CPI...",0.812197,2008-02-05,2008-03-04
2,Indicators of demand remained strong through t...,Labour market conditions remained strong into ...,0.677234,2008-02-05,2008-03-04
3,"In the short term, inflation is likely to rema...",Inflation is likely to remain relatively high ...,0.705629,2008-02-05,2008-03-04
4,The Board took careful note of recent events a...,The Board took account of events abroad and de...,0.748592,2008-02-05,2008-03-04
5,The world economy is slowing and it now appear...,The world economy is slowing and it appears li...,1.0,2008-02-05,2008-03-04
6,Recent trends in world commodity markets sugge...,"Recent trends in world commodity markets, howe...",0.73848,2008-02-05,2008-03-04
7,Having weighed both the international and dome...,Having weighed both the international and dome...,0.719334,2008-02-05,2008-03-04


In [68]:
added_df[added_df["ComparisonDate1"] == datetime.strptime("2008-02-05", "%Y-%m-%d")]


Unnamed: 0,AddedSentence,ComparisonDate1,ComparisonDate2
0,This adjustment was made in order to contain a...,2008-02-05,2008-03-04
1,Domestic demand grew at rates appreciably high...,2008-02-05,2008-03-04
2,Sentiment in global financial markets remains ...,2008-02-05,2008-03-04
3,Australian financial intermediaries are experi...,2008-02-05,2008-03-04
4,Some tightening in credit standards for more r...,2008-02-05,2008-03-04
5,There is tentative evidence that some moderati...,2008-02-05,2008-03-04
6,"The extent of that moderation is uncertain, ho...",2008-02-05,2008-03-04
7,"As the Board noted last month, a significant s...",2008-02-05,2008-03-04
8,"As a result of this and earlier actions, and r...",2008-02-05,2008-03-04
9,The Board will continue to evaluate prospects ...,2008-02-05,2008-03-04
