In [1]:
from dataingestion.tweet_fetcher import fetch_tweets
from dataingestion.twitter_api import get_user_data
from dataingestion.label_data import analyze_tweet_sentiment
from dataingestion.kafka_producer import send_tweets_to_kafka
from dataingestion.kafka_consumer import run_kafka_consumer
from dataingestion.store_to_HDFS import consume_tweets_to_hdfs


# Get user ID (rest ID) from a username

In [2]:

# Ask which account to look up
username = input("Enter the username: ")

# Resolve the account through the project's API helper
user_data = get_user_data(username)

# Both leading fields (username, rest ID) must be truthy for the lookup to count
if not (user_data[0] and user_data[1]):
    print("Failed to retrieve user data.")
else:
    print(f"Username: {user_data[0]}, Rest ID: {user_data[1]}")


Enter the username:  123


Querystring set to: {'username': '123'}
Username: 123, Rest ID: 221445859


# Tweet fetcher

In [3]:
def main(user_ids=None, count_per_user=50, output_file='tweets_output.csv'):
    """
    Fetch tweets for a set of accounts and print a per-user summary.

    Args:
        user_ids: Collection of user (rest) IDs as strings. When omitted, the
            six accounts this notebook was originally run against are used.
        count_per_user: Passed to ``fetch_tweets`` as ``count_per_user``.
        output_file: Passed to ``fetch_tweets`` as ``output_file``.

    Returns:
        The mapping returned by ``fetch_tweets`` (user ID -> that user's
        tweets), so later cells can reuse it without fetching again.
    """
    # None sentinel instead of a list default, so the default is never shared
    # (and possibly mutated) between calls.
    if user_ids is None:
        user_ids = ["22594051", "55186601", "18040230", "61083422", "102098902", "145550026"]

    tweets_data = fetch_tweets(
        user_ids=user_ids,
        count_per_user=count_per_user,
        output_file=output_file
    )

    # One summary line per account
    for user_id, tweets in tweets_data.items():
        print(f"User {user_id} has {len(tweets)} tweets")

    return tweets_data

# Entry point: run main() when this code is executed directly (as it is when
# the notebook cell runs), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Fetched 50 tweets for user 22594051
Fetched 50 tweets for user 55186601
Fetched 50 tweets for user 18040230
Fetched 50 tweets for user 61083422
Fetched 50 tweets for user 102098902
Fetched 50 tweets for user 145550026

Successfully fetched data for all users!
Total tweets fetched: 300
User 22594051 has 50 tweets
User 55186601 has 50 tweets
User 18040230 has 50 tweets
User 61083422 has 50 tweets
User 102098902 has 50 tweets
User 145550026 has 50 tweets


# label data


In [4]:
# Run sentiment labelling on 'tweets_output.csv', the file name used as
# output_file in the tweet-fetcher cell.
# NOTE(review): the Kafka producer cell reads 'tweets_output_with_sentiment.csv';
# presumably this call writes that file -- confirm in dataingestion.label_data.
analyzed_data = analyze_tweet_sentiment('tweets_output.csv')

Sentiment distribution:
Sentiment
Positive    118
Neutral      98
Negative     84
Name: count, dtype: int64

Sample of tweets with sentiment:
    User ID      Name  Followers Count  \
0  22594051  The Star          1903943   
1  22594051  The Star          1903943   
2  22594051  The Star          1903943   
3  22594051  The Star          1903943   
4  22594051  The Star          1903943   

                                               Tweet                Location  \
0  It's a new day, yes it is! 🎺🦄\n\nHere's what y...  Kuala Lumpur, Malaysia   
1  The victim, in her 50s, first became aware of ...  Kuala Lumpur, Malaysia   
2  Myanmar pardoned over 4,900 prisoners on April...  Kuala Lumpur, Malaysia   
3  The 88-year-old pontiff made a short foray out...  Kuala Lumpur, Malaysia   
4  Police discovered 468.3kg of crystal meth wort...  Kuala Lumpur, Malaysia   

                             Time  Friends Count Sentiment  
0  Thu Apr 17 19:01:24 +0000 2025            274  Positive  
1 

# kafka producer

In [5]:
from pathlib import Path
from dataingestion.kafka_producer import send_tweets_to_kafka
def main(csv_file_path='tweets_output_with_sentiment.csv', kafka_servers=None, verbose=True):
    """
    Send the tweets in a CSV file to Kafka via ``send_tweets_to_kafka``.

    Args:
        csv_file_path: Path of the CSV to send; forwarded as ``csv_file``.
        kafka_servers: List of bootstrap servers; forwarded as
            ``bootstrap_servers``. Defaults to ``['localhost:9092']``.
        verbose: Forwarded unchanged as ``verbose``.

    Returns:
        int: 0 when ``send_tweets_to_kafka`` completed, 1 when the CSV file
        does not exist or the call raised.
    """
    # None sentinel keeps the default list from being shared between calls
    if kafka_servers is None:
        kafka_servers = ['localhost:9092']

    # Fail fast with a clear message before handing the path to the producer
    if not Path(csv_file_path).exists():
        print(f"Error: CSV file '{csv_file_path}' does not exist")
        return 1

    try:
        tweet_count = send_tweets_to_kafka(
            csv_file=csv_file_path,
            bootstrap_servers=kafka_servers,
            verbose=verbose
        )
    except Exception as e:
        # Deliberately broad: any producer failure is reported as status 1
        # rather than a traceback in the notebook.
        print(f"Error processing tweets: {e}")
        return 1
    else:
        print(f"Successfully processed {tweet_count} tweets")
        return 0

# Entry point for this cell. main() returns a 0/1 status code, which is
# discarded here; the outcome is only visible through its printed messages.
if __name__ == "__main__":
    main()

Available columns in CSV: ['User ID', 'Name', 'Followers Count', 'Tweet', 'Location', 'Time', 'Friends Count', 'Sentiment']
Mapped fields to actual columns: {'user_id': 'User ID', 'name': 'Name', 'followers_count': 'Followers Count', 'tweet_text': 'Tweet', 'location': 'Location', 'created_at': 'Time', 'friends_count': 'Friends Count', 'sentiment': 'Sentiment'}
Tweet 1 data: {'user_id': 22594051, 'name': 'The Star', 'followers_count': 1903943, 'tweet_text': "It's a new day, yes it is! 🎺🦄\n\nHere's what you can expect in your copy of The Star newspaper today. Read them online at", 'location': 'Kuala Lumpur, Malaysia', 'created_at': 'Thu Apr 17 19:01:24 +0000 2025', 'friends_count': 274, 'sentiment': 'Positive'}
Sent to topic: EntertainmentNewsTopic
Tweet 2 data: {'user_id': 22594051, 'name': 'The Star', 'followers_count': 1903943, 'tweet_text': 'The victim, in her 50s, first became aware of the alleged scheme through an individual on Facebook in late December, 2024.', 'location': 'Kuala 

# kafka consumer

In [6]:
from dataingestion.kafka_consumer import run_kafka_consumer

def main():
    """Cell entry point: delegate to ``run_kafka_consumer`` and return nothing."""
    run_kafka_consumer()

# Entry point: start the consumer when this cell is executed directly,
# but not when the code is imported as a module.
if __name__ == "__main__":
    main()

#----------------------------------------#
#   Kafka Tweet Consumer Tool            #
#----------------------------------------#
Available Topics:
1. PoliticsNewsTopic
2. BusinessNewsTopic
3. SportsNewsTopic
4. EntertainmentNewsTopic
5. MalaysiaNewsTopic



Enter the number of the topic you want to consume from:  5



Output Format Options:
1. Detailed - Show each field on a separate line
2. Table - Show messages in a table format
3. Compact - Show messages in a single line each



Select output format (1-3):  1

Enter maximum number of messages to display (default: 5):  5


Consuming messages from MalaysiaNewsTopic...

Message 1:
  user_id: 22594051
  name: The Star
  followers_count: 1903943
  tweet_text: The victim, in her 50s, first became aware of the alleged scheme through an individual on Facebook in late December, 2024.
  location: Kuala Lumpur, Malaysia
  created_at: Thu Apr 17 15:33:11 +0000 2025
  friends_count: 274
  sentiment: Negative

Message 2:
  user_id: 22594051
  name: The Star
  followers_count: 1903943
  tweet_text: Myanmar pardoned over 4,900 prisoners on April 17, the first day of the New Year on the Myanmar calendar.
  location: Kuala Lumpur, Malaysia
  created_at: Thu Apr 17 15:30:33 +0000 2025
  friends_count: 274
  sentiment: Negative

Message 3:
  user_id: 22594051
  name: The Star
  followers_count: 1903943
  tweet_text: The 88-year-old pontiff made a short foray outside of the Vatican, as the prison is only about a five-minute drive away.
  location: Kuala Lumpur, Malaysia
  created_at: Thu Apr 17 15:15:20 +0000 2025
  friends


Do you want to export these messages to CSV? (y/n):  n



Kafka consumer operation completed.


# store to HDFS

In [7]:
from dataingestion.store_to_HDFS import consume_tweets_to_hdfs

# Entry point: call consume_tweets_to_hdfs() with its default arguments when
# this cell is executed directly (not on import).
if __name__ == "__main__":
    consume_tweets_to_hdfs()



Starting to consume tweets and save to /user/hduser/raw_tweets/tweets_20250418_053129.json
Saved 10 tweets to HDFS
Saved 20 tweets to HDFS
Saved 30 tweets to HDFS
Saved 40 tweets to HDFS
Saved 50 tweets to HDFS
Saved 60 tweets to HDFS
Saved 70 tweets to HDFS
Saved 80 tweets to HDFS
Saved 90 tweets to HDFS
Saved 100 tweets to HDFS
Saved 110 tweets to HDFS
Saved 120 tweets to HDFS
Saved 130 tweets to HDFS
Saved 140 tweets to HDFS
Saved 150 tweets to HDFS
Saved 160 tweets to HDFS
Saved 170 tweets to HDFS
Saved 180 tweets to HDFS
Saved 190 tweets to HDFS
Saved 200 tweets to HDFS
Saved 210 tweets to HDFS
Saved 220 tweets to HDFS
Saved 230 tweets to HDFS
Saved 240 tweets to HDFS
Saved 250 tweets to HDFS
Saved 260 tweets to HDFS
Saved 270 tweets to HDFS
Saved 280 tweets to HDFS
Saved 290 tweets to HDFS
Saved 300 tweets to HDFS
Reached limit of 300 tweets
Consumer closed. Saved 300 tweets to HDFS.
Successfully verified file exists: /user/hduser/raw_tweets/tweets_20250418_053129.json
Run 'hadoo

In [8]:
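# Check that the data is available in HDFS (shell commands, not Python):
#   hadoop fs -ls /user/hduser/raw_tweets/
#   hadoop fs -cat /user/hduser/raw_tweets/tweets_batch_20250418_032815_e7ff3920_10.json
# (replace the file name in the second command with one listed by the first)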
