In [5]:
# %% [markdown]
# # #EndSARS Data Collection (Fixed Version)
# 
# **Resolved snscrape compatibility issue**

# %% [code] -- FIXED INSTALLATION --
# Uninstall problematic packages and install compatible versions
!pip uninstall -y snscrape
!pip install snscrape==0.4.3.20220106
!pip install pandas requests beautifulsoup4 fake-useragent tqdm

# %% [code] -- FIXED TWITTER IMPORT --
# ======================
# TWITTER DATA SCRAPING (FIXED)
# ======================
import pandas as pd
from tqdm.notebook import tqdm
import snscrape.base  # Import base module first

# Use COMPATIBLE import method
from snscrape.modules import twitter as sntwitter

# Configure search
search_terms = [
    "#EndSARS education since:2020-10-08 until:2020-10-31",
    "#EndSARS school since:2020-10-08 until:2020-12-31",
    "#EndSARS university since:2021-10-01 until:2024-10-31"
]

def scrape_twitter(terms, max_tweets=1000):
    tweets = []
    for term in terms:
        print(f"\nScraping: {term}")
        try:
            scraper = sntwitter.TwitterSearchScraper(term)
            for i, tweet in tqdm(enumerate(scraper.get_items()), total=max_tweets):
                if i >= max_tweets:
                    break
                tweets.append({
                    "date": tweet.date,
                    "id": tweet.id,
                    "content": tweet.content,
                    "username": tweet.user.username,
                    "likes": tweet.likeCount,
                    "retweets": tweet.retweetCount,
                    "hashtags": tweet.hashtags
                })
        except Exception as e:
            print(f"Error: {str(e)}")
    return pd.DataFrame(tweets)

# Run Twitter scraper (reduced count for testing)
twitter_df = scrape_twitter(search_terms, max_tweets=500)

# Save Twitter data
twitter_df.to_csv("endsars_twitter_data.csv", index=False)
print(f"\n✅ Twitter data saved ({len(twitter_df)} tweets)")

# %% [code]
# ===================
# NEWS DATA SCRAPING
# ===================
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from fake_useragent import UserAgent

# Initialize random user agents
ua = UserAgent()

sources = {
    "PremiumTimes": "https://www.premiumtimesng.com/tag/endsars/",
    "Vanguard": "https://www.vanguardngr.com/tag/endsars/",
    "Guardian": "https://guardian.ng/tag/endsars/"
}

# Site-specific configuration
site_config = {
    "PremiumTimes": {
        "article_selector": "article",
        "title_selector": "h2.headline",
        "excerpt_selector": ".excerpt",
        "link_selector": "h2 a"
    },
    "Vanguard": {
        "article_selector": ".mvp-blog-story-wrap",
        "title_selector": "h2",
        "excerpt_selector": ".mvp-blog-excerpt",
        "link_selector": "h2 a"
    },
    "Guardian": {
        "article_selector": ".js-article-list-item",
        "title_selector": "h3",
        "excerpt_selector": ".js-article-list-item__excerpt",
        "link_selector": "h3 a"
    }
}

def scrape_news_sites():
    articles = []
    for name, url in sources.items():
        try:
            print(f"\nScraping: {name}")
            headers = {'User-Agent': ua.random}
            response = requests.get(url, headers=headers, timeout=15)
            soup = BeautifulSoup(response.content, 'html.parser')
            config = site_config[name]
            
            for article in soup.select(config["article_selector"]):
                # Extract title
                title_elem = article.select_one(config["title_selector"])
                title = title_elem.text.strip() if title_elem else ''
                
                # Extract excerpt
                excerpt_elem = article.select_one(config["excerpt_selector"])
                excerpt = excerpt_elem.text.strip() if excerpt_elem else ''
                
                # Extract article URL
                link_elem = article.select_one(config["link_selector"])
                if link_elem and link_elem.has_attr('href'):
                    article_url = link_elem['href']
                    # Fix relative URLs
                    if not article_url.startswith('http'):
                        article_url = url + article_url.lstrip('/')
                else:
                    article_url = url
                
                articles.append({
                    "source": name,
                    "title": title,
                    "excerpt": excerpt,
                    "url": article_url,
                    "source_page": url
                })
            
            # Respectful delay between sites
            time.sleep(2)
            
        except Exception as e:
            print(f"Error scraping {name}: {str(e)}")
    
    return pd.DataFrame(articles)

# Run news scraper
news_df = scrape_news_sites()

# Save news data
if not news_df.empty:
    news_df.to_csv("endsars_news_data.csv", index=False)
    print(f"\n✅ News data saved ({len(news_df)} articles)")
else:
    print("\n❌ No articles scraped - check selectors")

# %% [code]
# =====================
# DATA VERIFICATION
# =====================
def verify_data():
    try:
        # Load Twitter data
        twitter_df = pd.read_csv("endsars_twitter_data.csv")
        print(f"Twitter data: {len(twitter_df)} tweets")
        print(twitter_df.head(2))
        
        # Load News data
        news_df = pd.read_csv("endsars_news_data.csv")
        print(f"\nNews data: {len(news_df)} articles")
        print(news_df.head(2))
        
        return True
    except Exception as e:
        print(f"Verification failed: {str(e)}")
        return False

if verify_data():
    print("\n🎉 All data collected successfully!")
else:
    print("\n⚠️ Some data collection failed - check logs")

Found existing installation: snscrape 0.7.0.20230622
Uninstalling snscrape-0.7.0.20230622:
  Successfully uninstalled snscrape-0.7.0.20230622
Collecting snscrape==0.4.3.20220106
  Downloading snscrape-0.4.3.20220106-py3-none-any.whl.metadata (4.4 kB)
Downloading snscrape-0.4.3.20220106-py3-none-any.whl (59 kB)
Installing collected packages: snscrape
Successfully installed snscrape-0.4.3.20220106

Scraping: #EndSARS education since:2020-10-08 until:2020-10-31


  0%|          | 0/500 [00:00<?, ?it/s]

Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%23EndSARS+education+since%3A2020-10-08+until%3A2020-10-31&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel: non-200 status code
4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&incl

Error: 4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%23EndSARS+education+since%3A2020-10-08+until%3A2020-10-31&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel failed, giving up.

Scraping: #EndSARS school since:2020-10-08 until:2020-12-31


  0%|          | 0/500 [00:00<?, ?it/s]

Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%23EndSARS+school+since%3A2020-10-08+until%3A2020-12-31&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel: non-200 status code
4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include

Error: 4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%23EndSARS+school+since%3A2020-10-08+until%3A2020-12-31&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel failed, giving up.

Scraping: #EndSARS university since:2021-10-01 until:2024-10-31


  0%|          | 0/500 [00:00<?, ?it/s]

Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%23EndSARS+university+since%3A2021-10-01+until%3A2024-10-31&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel: non-200 status code
4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&inc

Error: 4 requests to https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=%23EndSARS+university+since%3A2021-10-01+until%3A2024-10-31&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel failed, giving up.

✅ Twitter data saved (0 tweets)

Scraping: PremiumTimes

Scraping: Vanguard

Scraping: Guardian

✅ News data saved (25 articles)
Verification failed: No columns to parse from file

⚠️ Some data collection failed - check 