author: Edwina Hon Kai Xin

In [1]:
from mongodb.data_loader import data_loader
import json

file = "data/full_sentiments.json"

try:
    data = data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records")
except AttributeError:
    print("Import failed. Trying alternative approach...")
    
    # Import the specific file directly
    import sys
    import os
    
    # Add the parent directory to the path if needed
    current_dir = os.path.dirname(os.getcwd())
    if current_dir not in sys.path:
        sys.path.append(current_dir)
    
    # Re-import with the full path
    import mongodb.data_loader
    # Force reload in case it was previously imported incorrectly
    import importlib
    importlib.reload(mongodb.data_loader)
    
    # Try again
    data = mongodb.data_loader.data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records with alternative method")


Successfully loaded 300 records


In [2]:
#insert only valid data
from mongodb.tweet_validation import TweetValidator

validator = TweetValidator()
valid_data = validator.filter_valid(data)
invalid_data = validator.report_invalid(data)

print(f"✅ Valid records: {len(valid_data)}")
print(f"❌ Invalid records: {len(invalid_data)}")

✅ Valid records: 300
❌ Invalid records: 0


## Declaring the connection to MongoDB Atlas

In [4]:
from mongodb.pymongo_utils import PyMongoUtils

# Initialize your connection
mongo_obj = PyMongoUtils()

# Define database and collection names
mongo_db = "news_sentiment"  
mongo_collection = "labeled_tweets"

# Get or create collection
collection = mongo_obj.get_collection(mongo_db, mongo_collection)


In [7]:

from mongodb.sentiment_insertion import sentiment_insertion

try:
    inserter = sentiment_insertion(collection)
    
    # Preprocess the data types
    cleaned_data = sentiment_insertion.preprocess_data(valid_data)

    # Now insert the cleaned data
    inserter.insert_many(cleaned_data)
    print("Data inserted successfully to MongoDB Atlas!")
    
except Exception as e:
    print(f"Error connecting to MongoDB Atlas: {e}")
    print("Please check your internet connection and verify the connection string.")

Data inserted successfully to MongoDB Atlas!


# Querying Section

In [8]:
collection.create_index("sentiment")

'sentiment_1'

In [9]:
from mongodb.sentiment_query import sentiment_query
import pprint

query = sentiment_query(collection)

### Find by sentiment

In [10]:
### Find by sentiment
predicted_neutral = query.find_by_prediction("0.0")
print(f"Found {len(predicted_neutral)} tweets with prediction 0.0")

#only showing top 3
for doc in predicted_neutral[:]:
    pprint.pprint(doc)
    print("\n")

Found 212 tweets with prediction 0.0
{'Location': 'Kuala Lumpur, Malaysia',
 'Time': 'Sat Apr 12 03:40:15 +0000 2025',
 'Tweet': '"Can we have the best of both worlds? Big, sporty, exciting, '
          'luxurious cars that are also economical to buy and run?\n'
          '\n'
          '"Simply put, yes." - Ravindran Kurusamy',
 '_id': ObjectId('67fdbb403a7d16cc0ecedcb6'),
 'followers_count': 1903879,
 'name': 'The Star',
 'prediction': '0.0',
 'sentiment': 'Positive',
 'topic': 'BusinessNewsTopic',
 'user_id': '22594051'}


{'Location': 'Kuala Lumpur, Malaysia',
 'Time': 'Sat Apr 12 03:30:16 +0000 2025',
 'Tweet': 'These toys have managed to transcend the traditional toy market, '
          'appealing to the emotional and psychological needs of young adults.',
 '_id': ObjectId('67fdbb403a7d16cc0ecedcb7'),
 'followers_count': 1903879,
 'name': 'The Star',
 'prediction': '0.0',
 'sentiment': 'Positive',
 'topic': 'BusinessNewsTopic',
 'user_id': '22594051'}


{'Location': 'Kuala Lumpur

### Find by prediction

In [11]:
predicted_neutral = query.find_by_prediction("0.0")
print(f"Found {len(predicted_neutral)} tweets with prediction 0.0")

for doc in predicted_neutral[:]:
    pprint.pprint(doc)
    print("\n")

Found 212 tweets with prediction 0.0
{'Location': 'Kuala Lumpur, Malaysia',
 'Time': 'Sat Apr 12 03:40:15 +0000 2025',
 'Tweet': '"Can we have the best of both worlds? Big, sporty, exciting, '
          'luxurious cars that are also economical to buy and run?\n'
          '\n'
          '"Simply put, yes." - Ravindran Kurusamy',
 '_id': ObjectId('67fdbb403a7d16cc0ecedcb6'),
 'followers_count': 1903879,
 'name': 'The Star',
 'prediction': '0.0',
 'sentiment': 'Positive',
 'topic': 'BusinessNewsTopic',
 'user_id': '22594051'}


{'Location': 'Kuala Lumpur, Malaysia',
 'Time': 'Sat Apr 12 03:30:16 +0000 2025',
 'Tweet': 'These toys have managed to transcend the traditional toy market, '
          'appealing to the emotional and psychological needs of young adults.',
 '_id': ObjectId('67fdbb403a7d16cc0ecedcb7'),
 'followers_count': 1903879,
 'name': 'The Star',
 'prediction': '0.0',
 'sentiment': 'Positive',
 'topic': 'BusinessNewsTopic',
 'user_id': '22594051'}


{'Location': 'Kuala Lumpur

### Find by source name

In [12]:
from_the_star = query.find_by_source("The Star")
print(f"Found {len(from_the_star)} tweets from 'The Star'")

for doc in from_the_star[:]:
    pprint.pprint(doc)
    print("\n")

Found 50 tweets from 'The Star'
{'Location': 'Kuala Lumpur, Malaysia',
 'Time': 'Sat Apr 12 03:40:15 +0000 2025',
 'Tweet': '"Can we have the best of both worlds? Big, sporty, exciting, '
          'luxurious cars that are also economical to buy and run?\n'
          '\n'
          '"Simply put, yes." - Ravindran Kurusamy',
 '_id': ObjectId('67fdbb403a7d16cc0ecedcb6'),
 'followers_count': 1903879,
 'name': 'The Star',
 'prediction': '0.0',
 'sentiment': 'Positive',
 'topic': 'BusinessNewsTopic',
 'user_id': '22594051'}


{'Location': 'Kuala Lumpur, Malaysia',
 'Time': 'Sat Apr 12 03:30:16 +0000 2025',
 'Tweet': 'These toys have managed to transcend the traditional toy market, '
          'appealing to the emotional and psychological needs of young adults.',
 '_id': ObjectId('67fdbb403a7d16cc0ecedcb7'),
 'followers_count': 1903879,
 'name': 'The Star',
 'prediction': '0.0',
 'sentiment': 'Positive',
 'topic': 'BusinessNewsTopic',
 'user_id': '22594051'}


{'Location': 'Kuala Lumpur, Mal

### Get tweets that mention "keyword"

In [13]:
tourism_tweets = query.search_tweets("Petrol")
print(f"Found {len(tourism_tweets)} tweets mentioning 'Petrol'")

for tweet in tourism_tweets[:]:
    pprint.pprint(tweet)
    print("\n")

Found 1 tweets mentioning 'Petrol'
{'Location': 'Malaysia',
 'Time': 'Sat Apr 12 00:12:48 +0000 2025',
 'Tweet': '#NSTnation  Petrol station company fined RM22,000 for operating '
          'machinery without certificate',
 '_id': ObjectId('67fdbb403a7d16cc0ecedcbd'),
 'followers_count': 813966,
 'name': 'New Straits Times',
 'prediction': '0.0',
 'sentiment': 'Neutral',
 'topic': 'BusinessNewsTopic',
 'user_id': '55186601'}




### Aggregate by Sentiment and Source

In [14]:
counter = query.count_by_sentiment_and_source()
print("Tweet count by sentiment and source:")
for entry in counter:
    pprint.pprint(f"{entry['_id']['sentiment']}: {entry['count']} tweets from {entry['_id']['source']}")

Tweet count by sentiment and source:
'Neutral: 28 tweets from Free Malaysia Today'
'Negative: 22 tweets from The Star'
'Negative: 21 tweets from malaysiakini.com'
'Positive: 20 tweets from heraldmalaysia'
'Negative: 20 tweets from theSun'
'Negative: 19 tweets from New Straits Times'
'Neutral: 18 tweets from malaysiakini.com'
'Positive: 17 tweets from theSun'
'Neutral: 17 tweets from New Straits Times'
'Positive: 17 tweets from The Star'
'Negative: 15 tweets from heraldmalaysia'
'Neutral: 15 tweets from heraldmalaysia'
'Positive: 14 tweets from New Straits Times'
'Negative: 13 tweets from Free Malaysia Today'
'Neutral: 12 tweets from theSun'
'Neutral: 11 tweets from The Star'
'Positive: 11 tweets from malaysiakini.com'
'Positive: 9 tweets from Free Malaysia Today'
'Negative: 1 tweets from thxeSun'


### Count by sentiment

In [8]:
sentiment_summary = query.count_by_sentiment()
print("Tweet count by sentiment:")
for entry in sentiment_summary:
    pprint.pprint(f"{entry['_id']}: {entry['count']}")

Tweet count by sentiment:
'Negative: 111'
'Neutral: 101'
'Positive: 88'


### Count by source

In [9]:
source_summary = query.count_by_source()
print("Tweet count by news source:")
for entry in source_summary:
    pprint.pprint(f"{entry['_id']}: {entry['count']}")

Tweet count by news source:
'The Star: 50'
'New Straits Times: 50'
'malaysiakini.com: 50'
'Free Malaysia Today: 50'
'heraldmalaysia: 50'
'theSun: 49'
'thxeSun: 1'


### Find by location

In [15]:
location = "Malaysia"
from_location = query.find_by_location(location)
print(f"Found {len(from_location)} tweets from '{location}'")
for doc in from_location:
    pprint.pprint(doc)
    print("\n")

Found 200 tweets from 'Malaysia'
{'Location': 'Malaysia',
 'Time': 'Sat Apr 12 02:03:31 +0000 2025',
 'Tweet': '#NSTnation  Pedal to the metal: Man accidentally accelerates, '
          'crashes into bank',
 '_id': ObjectId('67fb35370222f19f0c7951e0'),
 'followers_count': '813966',
 'name': 'New Straits Times',
 'prediction': '0.0',
 'sentiment': 'Negative',
 'topic': 'BusinessNewsTopic',
 'user_id': '55186601'}


{'Location': 'Malaysia',
 'Time': 'Sat Apr 12 00:30:01 +0000 2025',
 'Tweet': '#NSTcolumnists  Profiteers have flooded social media with fake news '
          'and bogus videos since a powerful earthquake devastated Myanmar '
          'last month, exploiting the chaos with clickbait that can reap tens '
          'of thousands in ad revenues.',
 '_id': ObjectId('67fb35370222f19f0c7951e1'),
 'followers_count': '813966',
 'name': 'New Straits Times',
 'prediction': '0.0',
 'sentiment': 'Negative',
 'topic': 'BusinessNewsTopic',
 'user_id': '55186601'}


{'Location': 'Malaysia'

### Find tweets within date range

In [4]:
start_date = "2025-04-01"
end_date = "2025-04-09"
tweets_in_range = query.find_by_date_range(start_date, end_date)
print(f"Found {len(tweets_in_range)} tweets between {start_date} and {end_date}")
for doc in tweets_in_range:
    pprint.pprint(doc)
    print("\n")

Found 16 tweets between 2025-04-01 and 2025-04-09
{'Location': 'Malaysia',
 'Time': 'Tue Apr 08 16:30:00 +0000 2025',
 'Tweet': 'Dozens of Palestinians have been killed almost daily since Israel '
          'restarted its military offensive on March 18.',
 '_id': ObjectId('67fb35370222f19f0c7951f3'),
 'followers_count': '869',
 'name': 'heraldmalaysia',
 'prediction': '0.0',
 'sentiment': 'Negative',
 'topic': 'EntertainmentNewsTopic',
 'user_id': '145550026'}


{'Location': 'Malaysia',
 'Time': 'Tue Apr 08 07:41:01 +0000 2025',
 'Tweet': '“When you start imparting your knowledge, that’s when you really  '
          'learn, and if one does not teach, he will forget what he has '
          'learnt,”  said Cardinal Goh, who taught theology at the seminary '
          'for over 20  years.',
 '_id': ObjectId('67fb35370222f19f0c7951f4'),
 'followers_count': '869',
 'name': 'heraldmalaysia',
 'prediction': '0.0',
 'sentiment': 'Negative',
 'topic': 'EntertainmentNewsTopic',
 'user_id': '1455

### Sentiment distribution over time

In [7]:
sentiment_by_date = query.sentiment_over_time()
from collections import defaultdict
import pprint

# Organize results by date
organized = defaultdict(dict)

for record in sentiment_by_date:
    date = record['_id']['date']
    sentiment = record['_id']['sentiment']
    count = record['count']
    organized[date][sentiment] = count

# Pretty print
print("\n📊 Sentiment Distribution Over Time:\n")
for date in sorted(organized):
    print(f"{date}")
    for sentiment in ["Positive", "Negative", "Neutral"]:
        count = organized[date].get(sentiment, 0)
        print(f"  {sentiment}: {count}")
    print("-" * 30)


📊 Sentiment Distribution Over Time:

Fri Apr 11
  Positive: 22
  Negative: 35
  Neutral: 25
------------------------------
Mon Apr 07
  Positive: 0
  Negative: 0
  Neutral: 2
------------------------------
Sat Apr 12
  Positive: 51
  Negative: 65
  Neutral: 66
------------------------------
Thu Apr 10
  Positive: 9
  Negative: 3
  Neutral: 3
------------------------------
Tue Apr 08
  Positive: 4
  Negative: 7
  Neutral: 3
------------------------------
Wed Apr 09
  Positive: 2
  Negative: 1
  Neutral: 2
------------------------------


### Find tweets from users with a high follower count

In [20]:
min_followers = 5000000
popular_sources = query.find_by_followers_min(min_followers)
print(f"Found {len(popular_sources)} tweets from users with ≥ {min_followers} followers")
for doc in popular_sources:
    pprint.pprint(doc)
    print("\n")

Found 150 tweets from users with ≥ 5000000 followers
{'Location': 'Malaysia',
 'Time': 'Sat Apr 12 02:03:31 +0000 2025',
 'Tweet': '#NSTnation  Pedal to the metal: Man accidentally accelerates, '
          'crashes into bank',
 '_id': ObjectId('67fb35370222f19f0c7951e0'),
 'followers_count': '813966',
 'name': 'New Straits Times',
 'prediction': '0.0',
 'sentiment': 'Negative',
 'topic': 'BusinessNewsTopic',
 'user_id': '55186601'}


{'Location': 'Malaysia',
 'Time': 'Sat Apr 12 00:30:01 +0000 2025',
 'Tweet': '#NSTcolumnists  Profiteers have flooded social media with fake news '
          'and bogus videos since a powerful earthquake devastated Myanmar '
          'last month, exploiting the chaos with clickbait that can reap tens '
          'of thousands in ad revenues.',
 '_id': ObjectId('67fb35370222f19f0c7951e1'),
 'followers_count': '813966',
 'name': 'New Straits Times',
 'prediction': '0.0',
 'sentiment': 'Negative',
 'topic': 'BusinessNewsTopic',
 'user_id': '55186601'}


{'L

### Count of tweets by topic

In [8]:
topic_distribution = query.count_by_topic()
print("Tweet count by topic:")
for topic in topic_distribution:
    pprint.pprint(topic)


Tweet count by topic:
{'_id': 'MalaysiaNewsTopic', 'count': 199}
{'_id': 'PoliticsNewsTopic', 'count': 45}
{'_id': 'SportsNewsTopic', 'count': 30}
{'_id': 'BusinessNewsTopic', 'count': 15}
{'_id': 'EntertainmentNewsTopic', 'count': 11}
