#### Reading News

In [1]:
# loading packages (pip/pip3 list for new packages)
import re
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import spacy
# python3 -m spacy download en_core_web_trf
nlp = spacy.load('en_core_web_trf') # Loading Spacy's English model | run python/python3 -m spacy download en_core_web_trf to download first
# try 'en_core_web_md' or 'en_core_web_lg' for higher accuracy

##### Task1: reading article by line

In [4]:
file_path = "sample.txt"

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        articles = file.readlines()[:]  # Change the number to adjust the sample size
        # for article in articles:
        #     print(article)
except FileNotFoundError:
    print(f"File '{file_path}' not found.")

print(f'{len(articles)} articles have been imported.')

1870 articles have been imported.


##### Task2: count articles which contain 'gold', 'silver'

In [18]:
# Find 'gold' & 'silver' and count articles
def string_matcher(article):
    article_lower = article.lower()
    contains_gold = "gold" in article_lower
    contains_silver = "silver" in article_lower
    return contains_gold, contains_silver

gold_count = 0
silver_count = 0

for i, text in enumerate(articles):
    contains_gold, contains_silver = string_matcher(text)
    # print(f'Article{i} contains gold: {contains_gold} and silver: {contains_silver}.')
    
    if contains_gold:
        gold_count += 1
        # print(text)
    if contains_silver:
        silver_count += 1
        # print(text)
print(f'Total articles containing gold: {gold_count}')
print(f'Total articles containing silver: {silver_count}')

Article0 contains gold: True and silver: False.
Article1 contains gold: True and silver: False.
Article2 contains gold: True and silver: False.
Article3 contains gold: True and silver: False.
Article4 contains gold: True and silver: False.
Article5 contains gold: True and silver: False.
Article6 contains gold: True and silver: False.
Article7 contains gold: True and silver: False.
Article8 contains gold: True and silver: False.
Article9 contains gold: True and silver: False.
Article10 contains gold: True and silver: False.
Article11 contains gold: True and silver: False.
Article12 contains gold: True and silver: False.
Article13 contains gold: True and silver: False.
Article14 contains gold: True and silver: False.
Article15 contains gold: True and silver: False.
Article16 contains gold: True and silver: False.
Article17 contains gold: True and silver: False.
Article18 contains gold: True and silver: False.
Article19 contains gold: True and silver: False.
Article20 contains gold: True 

By looking at the result, it seems many articles contains gold or silver.

##### Task3: Further excluding any misleading content with gold/silver

In [17]:
# Create dictionary of context-specific conditions
false_positives = {
    "gold": ["gold standard", "goldilocks"],
    "silver": ["silver lining"]
}

import string 

# Find 'gold' & 'silver' with context-specific checks
def string_matcher(article):
    article_lower = article.lower()
    contains_gold = False
    contains_silver = False
    gold_matched_terms_list = [] # list to store matched terms
    silver_matched_terms_list = [] # list to store matched terms

    # Check for "gold" with context-specific conditions
    if "gold" in article_lower:
        if any(phrase in article_lower for phrase in false_positives["gold"]):
            contains_gold = False
        else:
            contains_gold = True
            matched_terms = [term for term in article_lower.split() if term != "gold" and "gold" in term 
                                                                            and term not in false_positives["gold"]
                                                                            and not term.startswith(tuple(string.punctuation))
                                                                            and not term.endswith(tuple(string.punctuation))]
            gold_matched_terms_list.extend(matched_terms)

    # Check for "silver" with context-specific conditions
    if "silver" in article_lower:
        if any(phrase in article_lower for phrase in false_positives["silver"]):
            contains_silver = False
        else:
            contains_silver = True
            matched_terms = [term for term in article_lower.split() if term != "silver" and "silver" in term 
                                                                            and term not in false_positives["silver"]
                                                                            and not term.startswith(tuple(string.punctuation))
                                                                            and not term.endswith(tuple(string.punctuation))]
            silver_matched_terms_list.extend(matched_terms)

    return contains_gold, contains_silver, gold_matched_terms_list, silver_matched_terms_list


gold_count = 0
silver_count = 0

for i, text in enumerate(articles):
    contains_gold, contains_silver, gold_matched_terms_list, silver_matched_terms_list = string_matcher(text)
#     print(f'Article{i} contains gold: {contains_gold} and silver: {contains_silver}.')
    if contains_gold:
        gold_count += 1
        false_positives["gold"].extend(gold_matched_terms_list)

    if contains_silver:
        silver_count += 1
        false_positives["silver"].extend(silver_matched_terms_list)

print(f'Total articles containing gold: {gold_count}')
print(f'Total articles containing silver: {silver_count}')
print(f'Updated false positives for gold: {false_positives["gold"]}')
print(f'Updated false positives for silver: {false_positives["silver"]}')

Total articles containing gold: 596
Total articles containing silver: 102
Updated false positives for gold: ['gold standard', 'goldilocks', "gold's", 'anglogold', 'anglogold', 'ashgold', 'randgold', 'golden', 'goldman', 'gold/copper', 'gold-plated', '23.0275.gold', 'goldsmith', '“gold-tinted', '24kgoldn', 'https://www.theaustralian.com.au/business/dgo-gold-aiming-to-strike-a-balance-in-gold-exploration/news-story/231d9ce048bfd7d7f87545741008db15dgo', '“gold', 'australia.“gold', 'gold’s', 'gold.dgo', 'golder', 'discovergold', 'goldmine', 'https://www.theedgesingapore.com/capital/investing-ideas/rotate-back-technology-stocks-gold-says-dbs', '1.9%.gold', 'goldsbury', 'goldsbury', 'goldsbury', 'goldfarming', 'gold-backed', 'https://www.history.com/this-day-in-history/fdr-takes-united-states-off-gold-standard', 'goldberg', 'gold--in', 'goldwyn', 'goldwyns', 'goldstein', 'added.gold', "snip.http://www.cratergold.com.au/irm/content/polymetallic-project.aspx?rid=310&redirectcount=1they've", 'g

[]