author: Edwina Hon Kai Xin

### Loading Data

In [1]:
from mongodb.data_loader import data_loader
import json

file = "data/full_sentiments.json"

try:
    data = data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records")
except AttributeError:
    print("Import failed. Trying alternative approach...")
    
    # Import the specific file directly
    import sys
    import os
    
    # Add the parent directory to the path if needed
    current_dir = os.path.dirname(os.getcwd())
    if current_dir not in sys.path:
        sys.path.append(current_dir)
    
    # Re-import with the full path
    import mongodb.data_loader
    # Force reload in case it was previously imported incorrectly
    import importlib
    importlib.reload(mongodb.data_loader)
    
    # Try again
    data = mongodb.data_loader.data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records with alternative method")


Successfully loaded 300 records


### Validating Data 

In [2]:
#insert only valid data
from mongodb.tweet_validation import TweetValidator

validator = TweetValidator()
valid_data = validator.filter_valid(data)
invalid_data = validator.report_invalid(data)

print(f"✅ Valid records: {len(valid_data)}")
print(f"❌ Invalid records: {len(invalid_data)}")

✅ Valid records: 300
❌ Invalid records: 0


## Declaring the connection to MongoDB Atlas

In [2]:
from mongodb.pymongo_utils import PyMongoUtils

mongo_obj = PyMongoUtils()

# Define database and collection names
mongo_db = "news_sentiment"  
tweets_collection = "labeled_tweets"
users_collection = "users"

users_collection = mongo_obj.get_collection(mongo_db, users_collection)
tweets_collection = mongo_obj.get_collection(mongo_db, tweets_collection)

In [5]:
from mongodb.sentiment_insertion import sentiment_insertion

try:
    
    # Preprocess the data types
    cleaned_data = sentiment_insertion.preprocess_data(valid_data)
    normalized_data = sentiment_insertion.normalize_field_names(cleaned_data)
    
    # Extract unique users and insert them
    users_inserter = sentiment_insertion(users_collection)
    unique_users = sentiment_insertion.extract_users(normalized_data)
    users_inserter.insert_many(unique_users)
    print(f"Inserted {len(unique_users)} unique users to MongoDB Atlas!")
    
    # Remove user fields from tweets and insert them
    tweets_inserter = sentiment_insertion(tweets_collection)
    modified_tweets = sentiment_insertion.remove_user_fields(normalized_data)
    tweets_inserter.insert_many(modified_tweets)
    print(f"Inserted {len(modified_tweets)} tweets to MongoDB Atlas!")
    
except Exception as e:
    print(f"Error connecting to MongoDB Atlas: {e}")
    print("Please check your internet connection and verify the connection string.")

Inserted 6 unique users to MongoDB Atlas!
Inserted 300 tweets to MongoDB Atlas!


# Querying Section

In [3]:
from mongodb.sentiment_query import sentiment_query
import pprint

tweets_query = sentiment_query(tweets_collection, users_collection)

### Find by prediction

In [5]:
predic = tweets_query.find_by_prediction("0.0")
formatted = tweets_query.format_time_fields(predic)
print(f"Found {len(predic)} tweets with prediction 0.0")

for doc in formatted:
    pprint.pprint(doc)
    print("\n")

Found 212 tweets with prediction 0.0
{'_id': ObjectId('67fdce026a25177fe7a5e77e'),
 'prediction': '0.0',
 'sentiment': 'Negative',
 'time': '2025-04-12 00:57:23',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'The accident occurred at Jalan Bestari, Ulu Tiram, at around 3pm on '
          'Thursday.',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e78d'),
 'prediction': '0.0',
 'sentiment': 'Positive',
 'time': '2025-04-12 02:35:14',
 'topic': 'MalaysiaNewsTopic',
 'tweet': 'Their discussions focused on how Asean can collectively strengthen '
          'and position the region as an effective and credible dispute '
          'resolution hub.',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e7a4'),
 'prediction': '0.0',
 'sentiment': 'Negative',
 'time': '2025-04-12 01:00:29',
 'topic': 'MalaysiaNewsTopic',
 'tweet': 'The High Court has upheld the acquittal of an insurance consultant '
          'trainer from a charge of cheating and 25 counts of money 

### Find by sentiment

In [4]:
results = tweets_query.find_by_sentiment("Positive")
formatted = tweets_query.format_time_fields(results)
print(f"Found {len(formatted)} Positive tweets")

for doc in formatted:
    pprint.pprint(doc)
    print("\n")

Found 88 Positive tweets
{'_id': ObjectId('67fdce026a25177fe7a5e78d'),
 'prediction': '0.0',
 'sentiment': 'Positive',
 'time': '2025-04-12 02:35:14',
 'topic': 'MalaysiaNewsTopic',
 'tweet': 'Their discussions focused on how Asean can collectively strengthen '
          'and position the region as an effective and credible dispute '
          'resolution hub.',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e7b3'),
 'prediction': '0.0',
 'sentiment': 'Positive',
 'time': '2025-04-12 03:03:40',
 'topic': 'MalaysiaNewsTopic',
 'tweet': '#NSTnation  Selangor floods: 1,411 from 380 families in relief '
          'shelters',
 'user_id': '55186601'}


{'_id': ObjectId('67fdce026a25177fe7a5e7cb'),
 'prediction': '0.0',
 'sentiment': 'Positive',
 'time': '2025-04-11 10:47:06',
 'topic': 'MalaysiaNewsTopic',
 'tweet': 'The Selangor Islamic Religious Department (Jais) said it will '
          'inspect all lands under its control within the next month to ensure '
          'ther

### Find by topic

In [6]:
business = tweets_query.find_by_topic("BusinessNewsTopic")
business = tweets_query.format_time_fields(business)
print(f"Found {len(business)} tweets in topic 'BusinessNewsTopic'")

for doc in business:
    pprint.pprint(doc)
    print("\n")

Found 15 tweets in topic 'BusinessNewsTopic'
{'_id': ObjectId('67fdce026a25177fe7a5e76e'),
 'prediction': '0.0',
 'sentiment': 'Positive',
 'time': '2025-04-12 03:40:15',
 'topic': 'BusinessNewsTopic',
 'tweet': '"Can we have the best of both worlds? Big, sporty, exciting, '
          'luxurious cars that are also economical to buy and run?\n'
          '\n'
          '"Simply put, yes." - Ravindran Kurusamy',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e77b'),
 'prediction': '0.0',
 'sentiment': 'Neutral',
 'time': '2025-04-12 04:27:52',
 'topic': 'BusinessNewsTopic',
 'tweet': 'Taiwan holds first tariff talks with US #FMTNews #FMTBusiness',
 'user_id': '102098902'}


{'_id': ObjectId('67fdce026a25177fe7a5e771'),
 'prediction': '0.0',
 'sentiment': 'Positive',
 'time': '2025-04-12 02:15:11',
 'topic': 'BusinessNewsTopic',
 'tweet': 'Fahmi expressed optimism that the visit would not only strengthen '
          'economic ties but also deepen cultural and people-to-pe

### find by user id

In [7]:
user_tweets = tweets_query.find_by_user_id("22594051")
formatted = tweets_query.format_time_fields(user_tweets)
print(f"Found {len(formatted)} tweets by user 22594051")

for doc in formatted:
    pprint.pprint(doc)
    print("\n")

Found 50 tweets by user 22594051
{'_id': ObjectId('67fdce026a25177fe7a5e77e'),
 'prediction': '0.0',
 'sentiment': 'Negative',
 'time': '2025-04-12 00:57:23',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'The accident occurred at Jalan Bestari, Ulu Tiram, at around 3pm on '
          'Thursday.',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e77f'),
 'prediction': '1.0',
 'sentiment': 'Neutral',
 'time': '2025-04-11 18:02:13',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'Four Datuks nabbed\n'
          'From flames to floods\n'
          'Tech and sex among the young\n'
          '\n'
          'Pick up a copy of The Star newspaper to read these reports and '
          'more. Alternatively, visit',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e78d'),
 'prediction': '0.0',
 'sentiment': 'Positive',
 'time': '2025-04-12 02:35:14',
 'topic': 'MalaysiaNewsTopic',
 'tweet': 'Their discussions focused on how Asean can collectively strengthen '
        

### Find users by location

In [18]:
kl_users = tweets_query.find_by_location("Kuala Lumpur, Malaysia")
print(f"Found {len(kl_users)} users in Kuala Lumpur")

for doc in kl_users:
    pprint.pprint(doc)
    print("\n")

Found 1 users in Kuala Lumpur
{'_id': ObjectId('67fdce026a25177fe7a5e768'),
 'followers_count': 1903879,
 'location': 'Kuala Lumpur, Malaysia',
 'name': 'The Star',
 'user_id': '22594051'}




### Find users by name

In [19]:
star = tweets_query.find_by_name("The Star")
print(f"Found {len(star)} users named 'The Star'")

for doc in star:
    pprint.pprint(doc)
    print("\n")

Found 1 users named 'The Star'
{'_id': ObjectId('67fdce026a25177fe7a5e768'),
 'followers_count': 1903879,
 'location': 'Kuala Lumpur, Malaysia',
 'name': 'The Star',
 'user_id': '22594051'}




### Find users by follower count

In [20]:
influencers = tweets_query.find_by_followers_min(700000)
print(f"Found {len(influencers)} users with 700k+ followers")

for doc in influencers:
    pprint.pprint(doc)
    print("\n")

Found 3 users with 1M+ followers
{'_id': ObjectId('67fdce026a25177fe7a5e769'),
 'followers_count': 813966,
 'location': 'Malaysia',
 'name': 'New Straits Times',
 'user_id': '55186601'}


{'_id': ObjectId('67fdce026a25177fe7a5e76a'),
 'followers_count': 1679201,
 'location': 'Malaysia',
 'name': 'malaysiakini.com',
 'user_id': '18040230'}


{'_id': ObjectId('67fdce026a25177fe7a5e768'),
 'followers_count': 1903879,
 'location': 'Kuala Lumpur, Malaysia',
 'name': 'The Star',
 'user_id': '22594051'}




### Find tweets by date range

In [8]:
date_range = tweets_query.find_by_date_range("2025-04-10", "2025-04-13")
formatted = tweets_query.format_time_fields(date_range)
print(f"Found {len(formatted)} tweets in date range")

for doc in formatted:
    pprint.pprint(doc)
    print("\n")

Found 279 tweets in date range
{'_id': ObjectId('67fdce026a25177fe7a5e77e'),
 'prediction': '0.0',
 'sentiment': 'Negative',
 'time': '2025-04-12 00:57:23',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'The accident occurred at Jalan Bestari, Ulu Tiram, at around 3pm on '
          'Thursday.',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e77f'),
 'prediction': '1.0',
 'sentiment': 'Neutral',
 'time': '2025-04-11 18:02:13',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'Four Datuks nabbed\n'
          'From flames to floods\n'
          'Tech and sex among the young\n'
          '\n'
          'Pick up a copy of The Star newspaper to read these reports and '
          'more. Alternatively, visit',
 'user_id': '22594051'}


{'_id': ObjectId('67fdce026a25177fe7a5e780'),
 'prediction': '1.0',
 'sentiment': 'Neutral',
 'time': '2025-04-12 04:01:55',
 'topic': 'EntertainmentNewsTopic',
 'tweet': '#NSTTV Making a memorable entrance, Parti Sosialis Malaysia (#PSM) '
      

### Joining tweets with user info

In [9]:
joined = tweets_query.tweets_with_user_info()
joined = tweets_query.format_time_fields(joined)
print(f"Found {len(joined)} tweets with user info")

for doc in joined:
    pprint.pprint(doc)
    print("\n")

Found 300 tweets with user info
{'_id': ObjectId('67fdce026a25177fe7a5e77e'),
 'prediction': '0.0',
 'sentiment': 'Negative',
 'time': '2025-04-12 00:57:23',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'The accident occurred at Jalan Bestari, Ulu Tiram, at around 3pm on '
          'Thursday.',
 'user_id': '22594051',
 'user_info': {'_id': ObjectId('67fdce026a25177fe7a5e768'),
               'followers_count': 1903879,
               'location': 'Kuala Lumpur, Malaysia',
               'name': 'The Star',
               'user_id': '22594051'}}


{'_id': ObjectId('67fdce026a25177fe7a5e77f'),
 'prediction': '1.0',
 'sentiment': 'Neutral',
 'time': '2025-04-11 18:02:13',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'Four Datuks nabbed\n'
          'From flames to floods\n'
          'Tech and sex among the young\n'
          '\n'
          'Pick up a copy of The Star newspaper to read these reports and '
          'more. Alternatively, visit',
 'user_id': '22594051',
 'user_info': {'_i

### Tweets from user in location

In [10]:
malaysia = tweets_query.find_tweets_by_user_location("Malaysia")
malaysia = tweets_query.format_time_fields(joined)
print(f"Found {len(malaysia)} tweets from users in Malaysia")

for doc in malaysia:
    pprint.pprint(doc)
    print("\n")

Found 300 tweets from users in Malaysia
{'_id': ObjectId('67fdce026a25177fe7a5e77e'),
 'prediction': '0.0',
 'sentiment': 'Negative',
 'time': '2025-04-12 00:57:23',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'The accident occurred at Jalan Bestari, Ulu Tiram, at around 3pm on '
          'Thursday.',
 'user_id': '22594051',
 'user_info': {'_id': ObjectId('67fdce026a25177fe7a5e768'),
               'followers_count': 1903879,
               'location': 'Kuala Lumpur, Malaysia',
               'name': 'The Star',
               'user_id': '22594051'}}


{'_id': ObjectId('67fdce026a25177fe7a5e77f'),
 'prediction': '1.0',
 'sentiment': 'Neutral',
 'time': '2025-04-11 18:02:13',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'Four Datuks nabbed\n'
          'From flames to floods\n'
          'Tech and sex among the young\n'
          '\n'
          'Pick up a copy of The Star newspaper to read these reports and '
          'more. Alternatively, visit',
 'user_id': '22594051',
 'user_inf

### Sentiment breakdown over time

In [12]:
timeline = tweets_query.sentiment_over_time()
print(f"Sentiment breakdown over time:")

for doc in timeline:
    pprint.pprint(doc)

Sentiment breakdown over time:
{'_id': {'date': '2025-04-07', 'sentiment': 'Neutral'}, 'count': 2}
{'_id': {'date': '2025-04-08', 'sentiment': 'Neutral'}, 'count': 3}
{'_id': {'date': '2025-04-08', 'sentiment': 'Positive'}, 'count': 4}
{'_id': {'date': '2025-04-08', 'sentiment': 'Negative'}, 'count': 7}
{'_id': {'date': '2025-04-09', 'sentiment': 'Negative'}, 'count': 1}
{'_id': {'date': '2025-04-09', 'sentiment': 'Neutral'}, 'count': 2}
{'_id': {'date': '2025-04-09', 'sentiment': 'Positive'}, 'count': 2}
{'_id': {'date': '2025-04-10', 'sentiment': 'Positive'}, 'count': 9}
{'_id': {'date': '2025-04-10', 'sentiment': 'Negative'}, 'count': 3}
{'_id': {'date': '2025-04-10', 'sentiment': 'Neutral'}, 'count': 3}
{'_id': {'date': '2025-04-11', 'sentiment': 'Neutral'}, 'count': 25}
{'_id': {'date': '2025-04-11', 'sentiment': 'Negative'}, 'count': 35}
{'_id': {'date': '2025-04-11', 'sentiment': 'Positive'}, 'count': 22}
{'_id': {'date': '2025-04-12', 'sentiment': 'Neutral'}, 'count': 66}
{'_id

### Count by sentiment

In [13]:
sentiment_count = tweets_query.count_by_sentiment()
print("Tweet count by sentiment:")

for doc in sentiment_count:
    pprint.pprint(doc)
    print("\n")

Tweet count by sentiment:
{'_id': 'Negative', 'count': 111}


{'_id': 'Neutral', 'count': 101}


{'_id': 'Positive', 'count': 88}




### Count by topic

In [14]:
topic_count = tweets_query.count_by_topic()
print("Tweet count by topic:")

for doc in topic_count:
    pprint.pprint(doc)
    print("\n")

Tweet count by topic:
{'_id': 'MalaysiaNewsTopic', 'count': 199}


{'_id': 'PoliticsNewsTopic', 'count': 45}


{'_id': 'SportsNewsTopic', 'count': 30}


{'_id': 'BusinessNewsTopic', 'count': 15}


{'_id': 'EntertainmentNewsTopic', 'count': 11}




### Get tweets that mention "keyword"

In [15]:
mention_results = tweets_query.find_tweets_mentioning("Asean")
mention_results = tweets_query.format_time_fields(joined)
print(f"Found {len(mention_results)} tweets mentioning 'Asean'")

for doc in mention_results:
    pprint.pprint(doc)
    print("\n")

Found 300 tweets mentioning 'Asean'
{'_id': ObjectId('67fdce026a25177fe7a5e77e'),
 'prediction': '0.0',
 'sentiment': 'Negative',
 'time': '2025-04-12 00:57:23',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'The accident occurred at Jalan Bestari, Ulu Tiram, at around 3pm on '
          'Thursday.',
 'user_id': '22594051',
 'user_info': {'_id': ObjectId('67fdce026a25177fe7a5e768'),
               'followers_count': 1903879,
               'location': 'Kuala Lumpur, Malaysia',
               'name': 'The Star',
               'user_id': '22594051'}}


{'_id': ObjectId('67fdce026a25177fe7a5e77f'),
 'prediction': '1.0',
 'sentiment': 'Neutral',
 'time': '2025-04-11 18:02:13',
 'topic': 'EntertainmentNewsTopic',
 'tweet': 'Four Datuks nabbed\n'
          'From flames to floods\n'
          'Tech and sex among the young\n'
          '\n'
          'Pick up a copy of The Star newspaper to read these reports and '
          'more. Alternatively, visit',
 'user_id': '22594051',
 'user_info': 