In [None]:
from docx import Document
import pandas as pd
from typeguard.importhook import optimized_cache_from_source


def is_bold(run):
    """Return the bold setting of a text run.

    The value of ``run.bold`` is passed through unchanged, so whatever
    the run object stores there (including ``None``) is what the caller
    receives.
    """
    bold_setting = run.bold
    return bold_setting

def extract_articles(doc):
    """Split a DOCX document into articles and return them as a DataFrame.

    Paragraphs are classified in order:

    * a non-empty paragraph with at least one bold run starts a new article
      and becomes its title;
    * a paragraph starting with ``"By "`` is the author line;
    * a paragraph containing ``"GMT+8"`` or ``"Updated on"`` is the time
      line (only the part before ``"Updated on"`` is kept);
    * every other non-empty paragraph is appended to the description.

    An article is emitted only when it has a title, an author and a time;
    incomplete articles are dropped.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph exposes ``text``
            and ``runs``, and each run exposes ``bold``.

    Returns:
        pandas.DataFrame with columns ``Title``, ``Author``, ``Time`` and
        ``Description`` (description paragraphs joined with single spaces).
    """
    author_prefix = "By "
    titles, authors, times, descriptions = [], [], [], []
    current_title = current_author = current_time = None
    current_desc = []

    def store_article(title, author, time, desc_parts):
        # Only complete articles are kept.
        if title and author and time:
            titles.append(title)
            authors.append(author)
            times.append(time)
            descriptions.append(" ".join(desc_parts))

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            # Blank paragraphs carry no content. Skipping them up front also
            # stops a bold-but-empty paragraph from being taken for a title,
            # which would close the current article and orphan its body.
            continue

        # run.bold is truthy only when bold is explicitly switched on.
        if any(run.bold for run in paragraph.runs):
            store_article(current_title, current_author, current_time, current_desc)
            current_title = text
            current_author = current_time = None
            current_desc = []
        elif text.startswith(author_prefix):
            # Strip only the leading prefix; str.replace would also delete
            # any later "By " inside the byline itself.
            current_author = text[len(author_prefix):].strip()
        elif "GMT+8" in text or "Updated on" in text:
            current_time = text.split("Updated on")[0].strip()
        else:
            current_desc.append(text)

    # The last article has no following title to trigger its save.
    store_article(current_title, current_author, current_time, current_desc)

    return pd.DataFrame({
        "Title": titles,
        "Author": authors,
        "Time": times,
        "Description": descriptions,
    })

# Single-file run: parse one DOCX and write its articles to the combined workbook.
source_docx = "data/News Articles/Amy/3Mar25_News_Amy.docx"
combined_xlsx = "data/News Articles/Amy/Articles_combined.xlsx"

doc = Document(source_docx)
df = extract_articles(doc)

# index=False keeps the DataFrame's row index out of the spreadsheet.
df.to_excel(combined_xlsx, index=False)
print(f"Extracted {len(df)} articles. Excel file saved!")

In [None]:
import os
from docx import Document
import pandas as pd

def is_bold(run):
    """Return the bold setting of a text run.

    The value of ``run.bold`` is passed through unchanged, so whatever
    the run object stores there (including ``None``) is what the caller
    receives.
    """
    bold_setting = run.bold
    return bold_setting

def extract_articles(doc, doc_path=None):
    """Extract articles from a single DOCX file.

    Paragraphs are classified in order:

    * a non-empty paragraph with at least one bold run starts a new article
      and becomes its title;
    * a paragraph starting with ``"By "`` is the author line;
    * a paragraph containing ``"GMT+8"`` or ``"Updated on"`` is the time
      line (only the part before ``"Updated on"`` is kept);
    * every other non-empty paragraph is appended to the description.

    An article is emitted only when it has a title, an author and a time;
    incomplete articles are dropped.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph exposes ``text``
            and ``runs``, and each run exposes ``bold``.
        doc_path: Path of the file ``doc`` was loaded from. Only its base
            name is stored. When omitted, a module-level ``doc_path``
            variable is used if one exists (this is what callers that pass
            only ``doc`` relied on before); otherwise the source is ``None``.

    Returns:
        pandas.DataFrame with columns ``Title``, ``Author``, ``Time``,
        ``Description`` and ``Source File``.
    """
    if doc_path is None:
        # Backward compatibility: this function used to read the global
        # doc_path directly and raised NameError when it was not defined.
        doc_path = globals().get("doc_path")
    source_name = os.path.basename(doc_path) if doc_path else None

    author_prefix = "By "
    titles, authors, times, descriptions = [], [], [], []
    current_title = current_author = current_time = None
    current_desc = []

    def store_article(title, author, time, desc_parts):
        # Only complete articles are kept.
        if title and author and time:
            titles.append(title)
            authors.append(author)
            times.append(time)
            descriptions.append(" ".join(desc_parts))

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            # Blank paragraphs carry no content. Skipping them up front also
            # stops a bold-but-empty paragraph from being taken for a title,
            # which would close the current article and orphan its body.
            continue

        # run.bold is truthy only when bold is explicitly switched on.
        if any(run.bold for run in paragraph.runs):
            store_article(current_title, current_author, current_time, current_desc)
            current_title = text
            current_author = current_time = None
            current_desc = []
        elif text.startswith(author_prefix):
            # Strip only the leading prefix; str.replace would also delete
            # any later "By " inside the byline itself.
            current_author = text[len(author_prefix):].strip()
        elif "GMT+8" in text or "Updated on" in text:
            current_time = text.split("Updated on")[0].strip()
        else:
            current_desc.append(text)

    # The last article has no following title to trigger its save.
    store_article(current_title, current_author, current_time, current_desc)

    return pd.DataFrame({
        "Title": titles,
        "Author": authors,
        "Time": times,
        "Description": descriptions,
        # Track which file each article came from (one entry per article).
        "Source File": [source_name] * len(titles),
    })

# Main processing: parse every DOCX file in a folder and combine the results.
folder_path = "data/News Articles/Amy/Teams"  # Replace with your folder path
output_path = "data/News Articles/Amy/Articles_combined.xlsx"

# Collect one DataFrame per successfully parsed file and concatenate once at
# the end, instead of growing all_articles inside the loop.
frames = []

# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(folder_path)):
    # "~$name.docx" entries are Word lock files, not real documents.
    if not filename.endswith(".docx") or filename.startswith("~$"):
        continue
    # doc_path stays a module-level name because extract_articles reads it
    # to fill the "Source File" column.
    doc_path = os.path.join(folder_path, filename)
    try:
        print(f"Processing {filename}...")
        doc = Document(doc_path)
        df = extract_articles(doc)
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the batch.
        print(f"Error processing {filename}: {e}")
        continue
    frames.append(df)

all_articles = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save combined results
all_articles.to_excel(output_path, index=False)
# Report the number of files actually parsed, not every entry in the folder.
print(f"\nDone! Combined {len(all_articles)} articles from {len(frames)} files.")
print(f"Saved to: {output_path}")


In [11]:
import pandas as pd
import re
from datetime import datetime

# Reload the combined workbook of extracted articles.
data = pd.read_excel(io="data/News Articles/Amy/Articles_combined.xlsx")

# Raw "Time" strings as stored in the workbook (this is the real data, not a
# sample); clean_timestamp below converts them to ISO format.
timestamps = data["Time"]

def clean_timestamp(timestamp):
    """Convert a raw GMT+8 timestamp string to a naive ISO-8601 string.

    Two layouts are recognized, both of which must contain ``"GMT+8"``:

    * ``"2025.3.3 at GMT+8 07:02"`` (dotted date, 24-hour time)
    * ``"June 2, 2025 at 6:30 PM GMT+8"`` (month name, 12-hour time)

    Args:
        timestamp: Raw timestamp text. Non-string values (for example NaN
            from an empty spreadsheet cell) are rejected.

    Returns:
        ``"YYYY-MM-DDTHH:MM:SS"`` without a UTC offset, or ``None`` when the
        value is not a string, has no ``"GMT+8"`` marker, or cannot be
        parsed. A message is printed for every rejected value.
    """
    if not isinstance(timestamp, str):
        print(f"Error parsing '{timestamp}': expected a string, got {type(timestamp).__name__}")
        return None

    if "GMT+8" not in timestamp:
        # Previously this case only returned None because an unbound local
        # raised inside a catch-all except; reject it explicitly instead.
        print(f"Error parsing '{timestamp}': unsupported format (no 'GMT+8' marker)")
        return None

    try:
        if "." in timestamp:  # Format: "2025.3.3 at GMT+8 07:02"
            date_part, time_part = timestamp.split(" at GMT+8 ")
            year, month, day = date_part.split(".")
            time_obj = datetime.strptime(time_part, "%H:%M").time()
            dt_obj = datetime(int(year), int(month), int(day),
                              time_obj.hour, time_obj.minute)
        else:  # Format: "June 2, 2025 at 6:30 PM GMT+8"
            dt_str = timestamp.replace(" at ", " ").replace(" GMT+8", "")
            dt_obj = datetime.strptime(dt_str, "%B %d, %Y %I:%M %p")
    except ValueError as e:
        # Bad unpacking, int() failures, out-of-range dates and strptime
        # mismatches all surface as ValueError.
        print(f"Error parsing '{timestamp}': {e}")
        return None

    return dt_obj.isoformat()

# Scratch frame: original text next to its ISO-formatted version.
df = pd.DataFrame({"Original": timestamps})
df["Standardized"] = df["Original"].apply(clean_timestamp)

# Parse the ISO strings, label them as Singapore time, then shift to US Eastern.
singapore_times = pd.to_datetime(df["Standardized"]).dt.tz_localize("Asia/Singapore")
df["DateTime"] = singapore_times.dt.tz_convert("US/Eastern")

# Store the Eastern wall-clock time without timezone info on the main frame.
data['Datetime'] = df['DateTime'].dt.tz_localize(None)

# NOTE(review): this overwrites the workbook that was read above, and because
# index=False is not passed the row index is written as an extra column.
# Later cells slice `data` by column position, so confirm before changing it.
data.to_excel("data/News Articles/Amy/Articles_combined.xlsx")

In [39]:
# Python libraries

# 1.DistilBERT
from transformers import pipeline
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def DistilBERT(sentence):
    """Classify `sentence` with the DistilBERT SST-2 pipeline.

    Returns a ``(label, score)`` tuple where the label is 'POS' or 'NEG'
    for the pipeline's 'POSITIVE' / 'NEGATIVE' outputs and 'NEU' for
    anything else; the score is the pipeline's score for its top result.
    """
    top = classifier(sentence)[0]
    label = {'POSITIVE': 'POS', 'NEGATIVE': 'NEG'}.get(top['label'], 'NEU')
    return (label, top['score'])

# 2.Flair
from flair.data import Sentence
from flair.nn import Classifier
tagger = Classifier.load('sentiment')

def Flair(sentence):
    """Classify `sentence` with the Flair 'sentiment' classifier.

    Returns a ``(label, score)`` tuple where the label is 'POS' or 'NEG'
    for 'POSITIVE' / 'NEGATIVE' predictions and 'NEU' for anything else;
    the score is taken from the first predicted label.
    """
    flair_sentence = Sentence(sentence)
    tagger.predict(flair_sentence)  # attaches the predicted labels in place
    top = flair_sentence.labels[0]
    label = {'POSITIVE': 'POS', 'NEGATIVE': 'NEG'}.get(top.value, 'NEU')
    return (label, top.score)

# 3.FinBERT
classifier1 = pipeline("sentiment-analysis", model="ProsusAI/finbert")

def FinBERT(sentence):
    """Classify `sentence` with the FinBERT pipeline (`classifier1`).

    Returns a ``(label, score)`` tuple where the label is 'POS' or 'NEG'
    for the pipeline's 'positive' / 'negative' outputs and 'NEU' for
    anything else; the score is the pipeline's score for its top result.
    """
    # Use the FinBERT pipeline. This function previously called `classifier`
    # (the DistilBERT pipeline); that wrapper is checked against uppercase
    # 'POSITIVE'/'NEGATIVE' labels, which never match the lowercase
    # comparisons here, so every result came back as 'NEU'.
    top = classifier1(sentence)[0]
    label = {'positive': 'POS', 'negative': 'NEG'}.get(top['label'], 'NEU')
    return (label, top['score'])

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def Vader(sentence):
    """Return the 'compound' entry of VADER's polarity scores for `sentence`."""
    scores = analyzer.polarity_scores(sentence)
    return scores['compound']



In [40]:
# Vader yields a single score per text.
data['Vader Title Score'] = data['Title'].apply(Vader)
data['Vader Description Score'] = data['Description'].apply(Vader)

# Flair, DistilBERT and FinBERT each return a (label, score) tuple. Run every
# model once per text column and split the tuple afterwards; calling the model
# separately for the label and for the score doubled the inference work.
# Columns are created in the same order as before:
# "<model> Title Label", "<model> Title Score",
# "<model> Description Label", "<model> Description Score".
for model_name, model_fn in (('Flair', Flair), ('DistilBERT', DistilBERT), ('FinBERT', FinBERT)):
    for column in ('Title', 'Description'):
        results = data[column].apply(model_fn)
        data[f'{model_name} {column} Label'] = results.apply(lambda x: x[0])
        data[f'{model_name} {column} Score'] = results.apply(lambda x: x[1])

In [41]:
# Simple ML models

import nltk

# Fetch the NLTK resources used by the preprocessing step.
for nltk_resource in ('wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared helpers for preprocess(): a lemmatizer and the English stop-word
# list, held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize text for the bag-of-words models.

    Non-string input (including None) yields an empty string. Otherwise
    whitespace is collapsed, the text is lowercased and tokenized, and only
    tokens that are purely alphanumeric and not stop words are kept, each
    one lemmatized. Tokens containing any punctuation are dropped, so "%"
    does not survive even though purely numeric tokens do.

    Returns the kept tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    collapsed = ' '.join(text.split())
    kept = []
    for token in word_tokenize(collapsed.lower()):
        if token.isalnum() and token not in stop_words:
            kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Labelled headlines used to train the classical models below.
df1 = pd.read_csv("data/News Articles/headlines_with_sentiment (grok).csv")

# Headlines get the same preprocessing that is later applied to the articles.
df1['Headline'] = df1['Headline'].apply(preprocess)
X, y = df1['Headline'], df1['Sentiment_label']

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


def _count_tfidf_pipeline(estimator):
    """Return an unfitted CountVectorizer -> TfidfTransformer -> estimator pipeline."""
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('model', estimator),
    ])


# Every pipeline is fitted on the preprocessed headlines (X) and labels (y).

# 1.SVM (TF-IDF features in a single vectorizer step)
pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LinearSVC()),
])
pipeline_svc.fit(X, y)

# 2.KNN
pipeline_knn = _count_tfidf_pipeline(KNeighborsClassifier())
pipeline_knn.fit(X, y)

# 3.GBDT
pipeline_GBDT = _count_tfidf_pipeline(GradientBoostingClassifier())
pipeline_GBDT.fit(X, y)

# LR
pipeline_LR = _count_tfidf_pipeline(LogisticRegression())
pipeline_LR.fit(X, y)

# M NB
pipeline_NB = _count_tfidf_pipeline(MultinomialNB())
pipeline_NB.fit(X, y)

# RF
pipeline_RF = _count_tfidf_pipeline(RandomForestClassifier())
pipeline_RF.fit(X, y)

[nltk_data] Downloading package wordnet to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


0,1,2
,steps,"[('vect', ...), ('tfidf', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,norm,'l2'
,use_idf,True
,smooth_idf,True
,sublinear_tf,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [44]:
# Preprocess the article text the same way as the training headlines.
data['Title_processed'] = data['Title'].apply(preprocess)
data['Desc_processed'] = data['Description'].apply(preprocess)

# The SVC pipeline only contributes labels; no score column is produced for it.
data['SVC Title Label'] = pipeline_svc.predict(data['Title_processed'])
data['SVC Description Label'] = pipeline_svc.predict(data['Desc_processed'])

# For the remaining models store the predicted label and, as its score, the
# largest class probability. Column order: Title Label, Title Score,
# Description Label, Description Score, for each model in turn.
for prefix, fitted in (
    ('LR', pipeline_LR),
    ('NB', pipeline_NB),
    ('GBDT', pipeline_GBDT),
    ('RF', pipeline_RF),
    ('KNN', pipeline_knn),
):
    for field, processed_column in (('Title', 'Title_processed'), ('Description', 'Desc_processed')):
        data[f'{prefix} {field} Label'] = fitted.predict(data[processed_column])
        data[f'{prefix} {field} Score'] = fitted.predict_proba(data[processed_column]).max(axis=1)

In [45]:
# Write the scored articles back over the combined workbook.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra column; confirm that is intended before this file is read again.
data.to_excel(excel_writer="data/News Articles/Amy/Articles_combined.xlsx")

In [52]:
# Export the timestamp and every sentiment column, leaving out the raw
# article fields and the intermediate *_processed text.
#
# Select by column label rather than by position: the positional slices
# (6:21 and 23:) only line up when the reloaded workbook carries an extra
# leading index column, and silently pick the wrong columns otherwise.
# NOTE(review): the label ranges are inferred from the order in which the
# columns are created above ('Datetime' through the FinBERT scores, then
# 'SVC Title Label' onwards) -- confirm against the intended CSV layout.
new = data.loc[:, 'Datetime':'FinBERT Description Score'].copy()
new1 = data.loc[:, 'SVC Title Label':].copy()
n = pd.concat([new, new1], axis=1)
n.to_csv('data/News Articles/Amy/Article_sentiments.csv')