**Using ntscraper (nitter)**

  IP Rotation

In [1]:
!pip install ntscraper
!pip install requests

Collecting ntscraper
  Downloading ntscraper-0.3.13-py3-none-any.whl.metadata (6.7 kB)
Downloading ntscraper-0.3.13-py3-none-any.whl (11 kB)
Installing collected packages: ntscraper
Successfully installed ntscraper-0.3.13


In [2]:
from ntscraper import Nitter
import pandas as pd

from tqdm import tqdm

# Load valid proxies
with open('/content/drive/MyDrive/Work/WebScrapingTask/proxy_list.txt', 'r') as f:
    proxies = f.read().split("\n")

# Initialize scraper
scraper = Nitter(log_level=1, skip_instance_check=False)

# Function to scrape tweets using a proxy
def scrape_tweets_with_proxy(proxy, query, count):
    scraper.proxies = {"http": proxy, "https": proxy}
    tweets = scraper.get_tweets(query, mode="term", number=count)
    return tweets

# user input
query = input("Enter the search query (e.g., Data Science): ")
tweet_count = int(input("Enter the number of tweets to scrape: "))

data = {
    'link': [],
    'text': [],
    'user': [],
    'likes': [],
    'quotes': [],
    'retweets': [],
    'comments': []
}

# Rotate through proxies to scrape tweets
# Progress tracking
pbar = tqdm(total=tweet_count)
counter = 0
while len(data['text']) < tweet_count:
    proxy = proxies[counter % len(proxies)]
    try:
        tweets = scrape_tweets_with_proxy(proxy, query, min(100, tweet_count - len(data['text'])))
        for tweet in tweets['tweets']:
            data['link'].append(tweet['link'])
            data['text'].append(tweet['text'])
            data['user'].append(tweet['user']['name'])
            data['likes'].append(tweet['stats']['likes'])
            data['quotes'].append(tweet['stats']['quotes'])
            data['retweets'].append(tweet['stats']['retweets'])
            data['comments'].append(tweet['stats']['comments'])
        pbar.update(len(tweets['tweets']))
    except Exception as e:
        print(f"Failed with proxy {proxy}: {e}")
    finally:
        counter += 1

pbar.close()

# Save scraped data to a DataFrame
df = pd.DataFrame(data)

df.to_csv('/content/drive/MyDrive/Work/WebScrapingTask/dynamic_scraped_tweets.csv', index=False)
print("Scraping completed and saved to CSV.")


Testing instances: 100%|██████████| 77/77 [01:16<00:00,  1.01it/s]


Enter the search query (e.g., Data Science): Pakistan
Enter the number of tweets to scrape: 1500


  0%|          | 0/1500 [00:00<?, ?it/s]INFO:root:No instance specified, using random instance https://nitter.privacydev.net
INFO:root:Current stats for Pakistan: 17 tweets, 0 threads...
INFO:root:Current stats for Pakistan: 34 tweets, 0 threads...
INFO:root:Current stats for Pakistan: 50 tweets, 0 threads...
  3%|▎         | 50/1500 [00:21<10:31,  2.30it/s]INFO:root:No instance specified, using random instance https://nitter.privacydev.net
INFO:root:Current stats for Pakistan: 17 tweets, 0 threads...
INFO:root:Current stats for Pakistan: 34 tweets, 0 threads...
INFO:root:Current stats for Pakistan: 50 tweets, 0 threads...
  7%|▋         | 100/1500 [00:39<09:01,  2.59it/s]INFO:root:No instance specified, using random instance https://nitter.privacydev.net
INFO:root:Current stats for Pakistan: 17 tweets, 0 threads...
INFO:root:Current stats for Pakistan: 34 tweets, 0 threads...
INFO:root:Current stats for Pakistan: 50 tweets, 0 threads...
 10%|█         | 150/1500 [01:00<09:04,  2.48it

Scraping completed and saved to CSV.





Pre-processing and storing in FireBase DB

In [3]:
# pre-processing and cleaning the scraped data

df = pd.read_csv('/content/drive/MyDrive/Work/WebScrapingTask/dynamic_scraped_tweets.csv')

df.head()

Unnamed: 0,link,text,user,likes,quotes,retweets,comments
0,https://twitter.com/ImranKhanPTI/status/403945...,"Dharti Hamari, Marzi Hamari. Our Land, Our Way...",Imran Khan,21162,245,9782,383
1,https://twitter.com/pakistan_untold/status/181...,"""...Hatred against Hindus in Pakistan is runni...",Pakistan Untold,481,1,234,4
2,https://twitter.com/LoverCrick79813/status/181...,But Pakistan Ke Pass Sirf Zimbabar Hai 😂. 5 Wi...,Cricket Lover,0,0,0,0
3,https://twitter.com/MrSinha_/status/1816679057...,"Whenever I remember #KargilWar, it reminds me ...",Mr Sinha,37823,319,11919,1367
4,https://twitter.com/SRKNation2023/status/18171...,ICC Champions Trophy 2025 in Pakistan will out...,SRK Nation,0,0,0,0


In [5]:
df.drop_duplicates(subset='link', inplace=True)

In [6]:
df.shape

(50, 7)

In [None]:
import re

# 1. Remove duplicates based on 'link'
df.drop_duplicates(subset='link', inplace=True)

# 2. Handling missing values (if any column should not have nulls)
df.fillna({'text': '', 'user': 'Unknown'}, inplace=True)

# 3. Normalize text data
df['text'] = df['text'].str.lower()
df['user'] = df['user'].str.lower()

# 4. Text cleaning (removing URLs and special characters from text)
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)  # Remove special characters
    return text.strip()

df['text'] = df['text'].apply(clean_text)

# 5. Convert numerical fields to appropriate types (if necessary)
df['likes'] = df['likes'].astype(int)
df['quotes'] = df['quotes'].astype(int)
df['retweets'] = df['retweets'].astype(int)
df['comments'] = df['comments'].astype(int)

# Output the cleaned data
df.to_csv('/content/drive/MyDrive/Work/WebScrapingTask/cleaned_tweets.csv', index=False)
print("Data has been preprocessed and saved to cleaned_tweets.csv")

Data has been preprocessed and saved to cleaned_tweets.csv


In [None]:
import pandas as pd

data = pd.read_csv("/content/cleaned_tweets.csv")

data.head()

Unnamed: 0,link,text,user,likes,quotes,retweets,comments
0,https://twitter.com/agentjay2009/status/181668...,another protest day today may we see the relea...,jibran ilyas,1772,8,953,33
1,https://twitter.com/saniaazizr/status/18167299...,except that both india and pakistan have been ...,sania aziz 🇵🇸,0,0,0,0
2,https://twitter.com/PPP_Org/status/18167284556...,the 69th birthday of president asif ali zardar...,pakistan peoples party - ppp,2,0,4,0
3,https://twitter.com/AskAnshul/status/181671359...,pakistani army refused to take the dead bodies...,anshul saxena,683,2,216,16
4,https://twitter.com/trunicle/status/1816317833...,india please grant us visas we wish to marry i...,trunicle ट्रूनिकल,2089,40,490,198


In [None]:
data.shape

(32, 7)

In [None]:
# storing in firebase database

!pip install firebase-admin pandas




In [None]:
# Step 1: Import necessary libraries
import firebase_admin
from firebase_admin import credentials, firestore
import pandas as pd

# Step 2: Initialize the Firebase Admin SDK
# Check if an app is already initialized
if not firebase_admin._apps:
    cred = credentials.Certificate('/content/twitterscrapeddata-3f305-firebase-adminsdk-q7tef-644196b17c.json')
    firebase_admin.initialize_app(cred)

# Step 3: Initialize Firestore
db = firestore.client()

# Step 4: Load your DataFrame
data = pd.read_csv("/content/cleaned_tweets.csv")

# Step 5: Convert DataFrame to dictionary format
df_dict = data.to_dict(orient='records')  # Each row will be a separate dictionary

# Step 6: Upload each row to Firestore
collection_name = 'twitter_data'  # Replace with your Firestore collection name

for index, record in enumerate(df_dict):
    doc_ref = db.collection(collection_name).document(f'record_{index}')
    doc_ref.set(record)

print("Data uploaded successfully.")

Data uploaded successfully.
