<a href="https://colab.research.google.com/github/karri-ten/VibeScope/blob/main/VibeScope.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VibeScope — Analysing Online Vibe Through NLP

VibeScope analyzes Twitter conversations in real time, using NLP and machine learning to extract and interpret sentiment trends. It provides insights on topics, brands, and events while leveraging automated data pipelines and interactive dashboards to enhance understanding of public opinion.

In [1]:
# Install Tweepy (v4.10+ for API v2 Support)
!pip install tweepy matplotlib pandas numpy seaborn plotly



# Scraping Tweets from Twitter (Real-Time)

In [None]:
# Import the libraries used throughout the notebook
import tweepy
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display
from google.colab import userdata



In [None]:
# Retrieve all necessary credentials from Google Colab secrets.
# Each entry is the name of a Colab secret; the dict maps that name to its value.
secret_names = [
    "BEARER_TOKEN",
    "API_key",
    "API_key_secret",
    "Access_Token",
    "Access_Token_Secret",
]
secrets = {name: userdata.get(name) for name in secret_names}

# Treat empty strings like missing values, and report *which* secrets are
# absent (names only, never values) so the user knows what to fix.
missing_secrets = [name for name, value in secrets.items() if not value]
if missing_secrets:
    raise ValueError(
        "One or more Twitter API secrets are missing: " + ", ".join(missing_secrets)
    )

bearer_token = secrets["BEARER_TOKEN"]
consumer_key = secrets["API_key"]
consumer_secret = secrets["API_key_secret"]
access_token = secrets["Access_Token"]
access_token_secret = secrets["Access_Token_Secret"]

# Initialize tweepy client; wait_on_rate_limit=True makes Tweepy wait when the
# rate limit is reached instead of raising.
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
    wait_on_rate_limit=True
)

In [None]:
# fetching of tweets
def fetch_tweets(query, num_tweets=100, client=client):
    """Search recent tweets matching ``query`` and return them as a DataFrame.

    Args:
        query: Search query string passed to ``client.search_recent_tweets``.
        num_tweets: Maximum number of tweets to collect.
        client: ``tweepy.Client`` used for the search; defaults to the
            notebook-level ``client``.

    Returns:
        pandas.DataFrame with the columns ``text``, ``created_at``, ``likes``,
        ``retweets`` and ``author_id``, one row per tweet and at most
        ``num_tweets`` rows. The columns are present even when no tweet
        matched (or ``num_tweets`` is not positive).
    """
    columns = ["text", "created_at", "likes", "retweets", "author_id"]
    if num_tweets <= 0:
        # Nothing requested: skip the API entirely.
        return pd.DataFrame(columns=columns)

    # The recent-search endpoint accepts 10-100 results per request; do not
    # ask for a full page of 100 when fewer tweets are wanted.
    page_size = min(100, max(10, num_tweets))
    # Ceiling division. Floor division dropped the final partial page, so a
    # request for e.g. 150 tweets only ever fetched one page (100 tweets).
    num_pages = -(-num_tweets // page_size)

    paginator = tweepy.Paginator(
        client.search_recent_tweets,
        query=query,
        max_results=page_size,
        tweet_fields=["created_at", "public_metrics", "author_id"],
        user_fields=["username"],
        expansions=["author_id"],
        limit=num_pages
    )
    # flatten() iterates tweets across pages and stops after num_tweets of them.
    tweets_data = [
        {
            "text": tweet.text,
            "created_at": tweet.created_at,
            "likes": tweet.public_metrics["like_count"],
            "retweets": tweet.public_metrics["retweet_count"],
            "author_id": tweet.author_id
        }
        for tweet in paginator.flatten(limit=num_tweets)
    ]
    return pd.DataFrame(tweets_data, columns=columns)

In [None]:
# Interactive inputs: the search keyword and the number of tweets to request.
keyword_widget = widgets.Text(
    description="Keyword:",
    placeholder="Type a keyword or hashtag",
    value="ChatGPT",
)
tweet_count_widget = widgets.IntSlider(
    description="Num Tweets:",
    min=10,
    max=1000,
    step=10,
    value=100,
)
# Render both controls below this cell.
display(keyword_widget, tweet_count_widget)

In [None]:
# Fetch and display tweets using the function and the widget values.
keyword = keyword_widget.value.strip()
if not keyword:
    # Without a keyword the query would consist of the filters alone.
    raise ValueError("Keyword is empty - enter a keyword or hashtag in the widget above.")

# Restrict the search to English-language tweets and exclude retweets.
query = f"{keyword} lang:en -is:retweet"
num_tweets = tweet_count_widget.value

df = fetch_tweets(query, num_tweets)
print(f"Fetched {len(df)} tweets.")
df.head()

In [None]:
# Save the results to CSV if needed.
# A keyword may contain characters that are unsafe in a file name (for example
# the "/" in "AI/ML" would be read as a directory separator), so replace those
# with "_" before building the path. Ordinary keywords are left unchanged.
unsafe_chars = '<>:"/\\|?*'
safe_keyword = "".join("_" if ch in unsafe_chars else ch for ch in keyword_widget.value)
df.to_csv(f"tweets_{safe_keyword}.csv", index=False)

# Using a Downloaded Twitter Dataset (US Election 2020 Tweets) from Kaggle

## Setting Up the Data Pipeline (Kaggle + Colab)

In [3]:
# Install the kaggle package
!pip install kaggle
import os

# Upload your kaggle.json (from Kaggle API) using Colab left sidebar or:
from google.colab import files
files.upload()
# Ensure kaggle.json is uploaded before running this cell


# Configure Kaggle CLI
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download Election 2020 Tweets dataset using Kaggle CLI
dataset_slug = "manchunhui/us-election-2020-tweets"
!kaggle datasets download {dataset_slug}

# Unzip the downloaded file (it will be a zip file with the dataset slug name)
unzip_command = f"unzip {dataset_slug.split('/')[-1]}.zip"
!{unzip_command}


path = "." # Or the specific subdirectory if unzip created one

print(f"Files should now be in the current directory: {os.getcwd()}")
print("Listing contents of current directory:")
!ls



Saving kaggle.json to kaggle (1).json
Dataset URL: https://www.kaggle.com/datasets/manchunhui/us-election-2020-tweets
License(s): CC0-1.0
us-election-2020-tweets.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  us-election-2020-tweets.zip
replace hashtag_donaldtrump.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: hashtag_donaldtrump.csv  
replace hashtag_joebiden.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: hashtag_joebiden.csv    
Files should now be in the current directory: /content
Listing contents of current directory:
 hashtag_donaldtrump.csv  'kaggle (1).json'   sample_data
 hashtag_joebiden.csv	   kaggle.json	      us-election-2020-tweets.zip


## Loading the Data for “Real-Time” Analysis

In [4]:
import pandas as pd
import os

biden_file_path = '/content/hashtag_joebiden.csv'
trump_file_path = '/content/hashtag_donaldtrump.csv'


def _load_tweet_csv(file_path):
    """Read one tweet CSV into a DataFrame, reporting success or failure.

    The file is parsed with pandas' 'python' engine and malformed rows are
    skipped (``on_bad_lines='skip'``), so the frame may hold fewer rows than
    the file has lines.

    Raises:
        Exception: any error from ``pd.read_csv`` is printed and re-raised.
    """
    try:
        frame = pd.read_csv(file_path, engine='python', on_bad_lines='skip')
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        # Potentially try different parsing options or investigate the file
        raise
    print(f"Successfully loaded {file_path}")
    return frame


def _take_sample(frame, sample_size):
    """Return ``sample_size`` random rows of ``frame`` (fixed seed 1).

    When ``sample_size`` is not positive, an empty column-less DataFrame is
    returned instead.
    """
    if sample_size > 0:
        return frame.sample(sample_size, random_state=1)
    return pd.DataFrame()


# Check that both files exist before starting either (slow) load
for required_path in (biden_file_path, trump_file_path):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"Expected file not found: {required_path}")

# Load tweets for both major candidates using the filenames
biden = _load_tweet_csv(biden_file_path)
trump = _load_tweet_csv(trump_file_path)


# For real-time simulation, sample in chunks
chunk_size = 500  # You can adjust this for "real-time" batch simulation

# Ensure that sampling does not exceed the number of rows actually loaded
biden_sample_size = min(chunk_size, len(biden))
trump_sample_size = min(chunk_size, len(trump))

biden_sample = _take_sample(biden, biden_sample_size)
trump_sample = _take_sample(trump, trump_sample_size)


# Combine for joint analysis
if biden_sample.empty and trump_sample.empty:
    tweets = pd.DataFrame() # Create empty DataFrame if both samples are empty
    print("Both sample DataFrames are empty.")
else:
    # At least one sample has rows here, so the concatenation cannot be empty
    # and can be shuffled directly.
    tweets = pd.concat([biden_sample, trump_sample], ignore_index=True)
    tweets = tweets.sample(frac=1, random_state=2).reset_index(drop=True)  # shuffle

# Flag an empty result so it is noticed before any further analysis.
if tweets.empty:
    print("No tweets were loaded or sampled successfully.")

Successfully loaded /content/hashtag_joebiden.csv
Successfully loaded /content/hashtag_donaldtrump.csv
