author: Edwina Hon Kai Xin

In [1]:
from de_classes.data_loader import data_loader
import json

file = "data/full_sentiments.json"

try:
    data = data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records")
except AttributeError:
    # Method 2: Alternative approach 
    print("Import failed. Trying alternative approach...")
    
    # Import the specific file directly
    import sys
    import os
    
    # Add the parent directory to the path if needed
    current_dir = os.path.dirname(os.getcwd())
    if current_dir not in sys.path:
        sys.path.append(current_dir)
    
    # Re-import with the full path
    import de_classes.data_loader
    # Force reload in case it was previously imported incorrectly
    import importlib
    importlib.reload(de_classes.data_loader)
    
    # Try again
    data = de_classes.data_loader.data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records with alternative method")


Successfully loaded 300 records


In [2]:
#insert only valid data
from de_classes.tweet_validation import TweetValidator

validator = TweetValidator()
valid_data = validator.filter_valid(data)
invalid_data = validator.report_invalid(data)

print(f"✅ Valid records: {len(valid_data)}")
print(f"❌ Invalid records: {len(invalid_data)}")

✅ Valid records: 299
❌ Invalid records: 1


## Declaring the connection to MongoDB Atlas

In [3]:
from de_classes.pymongo_utils import PyMongoUtils

# Initialize your connection
mongo_obj = PyMongoUtils()

# Define database and collection names
mongo_db = "twitter_sentiment"  
mongo_collection = "sentiment_analysis"

# Get or create collection
collection = mongo_obj.get_collection(mongo_db, mongo_collection)


In [4]:

from de_classes.sentiment_insertion import sentiment_insertion

try:
    # Insert data 
    inserter = sentiment_insertion(collection)
    inserter.insert_many(valid_data)
    print("Data inserted successfully to MongoDB Atlas!")
    
except Exception as e:
    print(f"Error connecting to MongoDB Atlas: {e}")
    print("Please check your internet connection and verify the connection string.")



Data inserted successfully to MongoDB Atlas!


# Querying Section

In [5]:
collection.create_index("sentiment")

'sentiment_1'

### Find by sentiment

In [6]:
from de_classes.sentiment_query import sentiment_query

# Initialize the query object
query = sentiment_query(collection)

In [7]:

try:

    # Get tweets that mentionining "keyword"
    tourism_tweets = query.search_tweets("tourism")
    print(f"Found {len(tourism_tweets)} tweets mentioning 'tourism'")
    
    # print a sample:
    for tweet in tourism_tweets[:99]:
        print(tweet)

except Exception as e:
    print(f"Error connecting to MongoDB Atlas: {e}")
    print("Unable to query data")

Found 2 tweets mentioning 'tourism'
{'_id': ObjectId('67f7e2f712f262f491ee0b75'), 'Tweet': 'According to Widiyanti, the tourism sector, classified as a service industry, is not subject to such tariffs and continues to generate foreign exchange.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}
{'_id': ObjectId('67fa090125466407db701a88'), 'Tweet': 'According to Widiyanti, the tourism sector, classified as a service industry, is not subject to such tariffs and continues to generate foreign exchange.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}


### Aggregate by Sentiment and Source

In [8]:
counter = query.count_by_sentiment_and_source()
print("Tweet count by sentiment and source:")
for entry in counter:
    print(f"{entry['_id']['sentiment']}: {entry['count']} tweets from {entry['_id']['source']}")


Tweet count by sentiment and source:
Neutral: 54 tweets from Free Malaysia Today
Negative: 54 tweets from malaysiakini.com
Negative: 50 tweets from New Straits Times
Neutral: 48 tweets from The Star
Positive: 48 tweets from Herald Malaysia
Positive: 46 tweets from theSun
Negative: 42 tweets from Free Malaysia Today
Negative: 36 tweets from theSun
Neutral: 34 tweets from Herald Malaysia
Negative: 26 tweets from The Star
Positive: 26 tweets from The Star
Positive: 26 tweets from New Straits Times
Positive: 26 tweets from malaysiakini.com
Neutral: 24 tweets from New Straits Times
Neutral: 20 tweets from malaysiakini.com
Neutral: 18 tweets from theSun
Negative: 16 tweets from Herald Malaysia
Positive: 4 tweets from Free Malaysia Today
Unknown: 1 tweets from Herald Malaysia


### Find by prediction

In [14]:
predicted_neutral = query.find_by_prediction(0)
print(f"Found {len(predicted_neutral)} tweets with prediction 0")

#only showing top 3
for doc in predicted_neutral[:]:
    print(doc)

Found 242 tweets with prediction 0
{'_id': ObjectId('67f7e2f712f262f491ee0b74'), 'Tweet': '#Singapore Lee: “This is going to affect our trade, it’s going to affect our economy, it’s going to affect our region, and it’s going to affect our future. And it’s not good news.”', 'prediction': 0.0, 'sentiment': 'Negative', 'name': 'The Star'}
{'_id': ObjectId('67f7e2f712f262f491ee0b75'), 'Tweet': 'According to Widiyanti, the tourism sector, classified as a service industry, is not subject to such tariffs and continues to generate foreign exchange.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}
{'_id': ObjectId('67f7e2f712f262f491ee0b77'), 'Tweet': 'Abdul Rais noted that the global economy is currently experiencing a "tariff war” and that the US remains a major trading partner for Malaysia.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}
{'_id': ObjectId('67f7e2f712f262f491ee0b78'), 'Tweet': '#NSTnation  "Our economy remains resilient," Anwar said in a televi

### Find by source name

In [15]:
from_the_star = query.find_by_source("The Star")
print(f"Found {len(from_the_star)} tweets from 'The Star'")

for doc in from_the_star[:]:
    print(doc)

Found 50 tweets from 'The Star'
{'_id': ObjectId('67f7e2f712f262f491ee0b74'), 'Tweet': '#Singapore Lee: “This is going to affect our trade, it’s going to affect our economy, it’s going to affect our region, and it’s going to affect our future. And it’s not good news.”', 'prediction': 0.0, 'sentiment': 'Negative', 'name': 'The Star'}
{'_id': ObjectId('67f7e2f712f262f491ee0b75'), 'Tweet': 'According to Widiyanti, the tourism sector, classified as a service industry, is not subject to such tariffs and continues to generate foreign exchange.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}
{'_id': ObjectId('67f7e2f712f262f491ee0b76'), 'Tweet': 'Abdul Rasheed said the central bank is still awaiting further details from the United States about the tariffs.', 'prediction': 2.0, 'sentiment': 'Positive', 'name': 'The Star'}
{'_id': ObjectId('67f7e2f712f262f491ee0b77'), 'Tweet': 'Abdul Rais noted that the global economy is currently experiencing a "tariff war” and that the US rem

### Count by sentiment

In [16]:
sentiment_summary = query.count_by_sentiment()
print("Tweet count by sentiment:")
for entry in sentiment_summary:
    print(f"{entry['_id']}: {entry['count']}")

Tweet count by sentiment:
Negative: 112
Neutral: 99
Positive: 88
Unknown: 1


### Count by source

In [17]:
source_summary = query.count_by_source()
print("Tweet count by news source:")
for entry in source_summary:
    print(f"{entry['_id']}: {entry['count']}")

Tweet count by news source:
The Star: 50
theSun: 50
Free Malaysia Today: 50
Herald Malaysia: 50
malaysiakini.com: 50
New Straits Times: 50
