In [3]:
from os.path import isdir, join, normpath

# Number of tweets to receive
NUM_TWEETS = 5

# WattsDown
    

Scraping Twitter links from @meralco

- --jsonl Outputs the data in a JSON format allowing you to access tweet information. Otherwise, you’ll only receive direct links to the tweets.
- --progress Allows us to get updates from the CLI letting us know the progress of the scraping. It updates every 100 tweets. Does not appear to work when using Python with CLI.
- --max-results # Puts a cap on the number of tweets scraped.
- --since Sets lower bound date limit on query
- “until:” Sets upper bound date limit on query


In [4]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

# Creating list to append tweet data to
meralco = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('ADVISORY: from:meralco -filter:replies').get_items()):
    if i > NUM_TWEETS - 1:
        break

    # Check if the tweet has an attached image
    tweet_image = None
    if len(tweet.media) > 0:
        tweet_image = tweet.media[0].fullUrl

    meralco.append([tweet.date, tweet.id, tweet.user.username, tweet.content, tweet_image])
    
# Creating a dataframe from the tweets list above
meralco_df = pd.DataFrame(meralco, columns=['Timestamp', 'Tweet ID', 'Username', 'Text', 'Image URL'])

Processing scraped information as dataframe and getting the link.

In [5]:
from IPython.display import display

display(meralco_df)


Unnamed: 0,Timestamp,Tweet ID,Username,Text,Image URL
0,2022-06-03 13:55:06+00:00,1532722545081962497,meralco,ADVISORY: May scheduled power interruptions sa...,https://pbs.twimg.com/media/FUVTB7eXwAIHD1H?fo...
1,2022-06-03 13:50:04+00:00,1532721279924686848,meralco,ADVISORY: May scheduled power interruptions sa...,https://pbs.twimg.com/media/FUVR4S1WYAA0JdH?fo...
2,2022-06-03 13:45:05+00:00,1532720026574258178,meralco,ADVISORY: May scheduled power interruptions sa...,https://pbs.twimg.com/media/FUVQvU5WUAEmXWQ?fo...
3,2022-06-03 13:40:04+00:00,1532718765749698567,meralco,ADVISORY: May scheduled power interruptions sa...,https://pbs.twimg.com/media/FUVPl74WAAIWwpQ?fo...
4,2022-06-03 13:35:05+00:00,1532717507861176321,meralco,ADVISORY: May scheduled power interruptions sa...,https://pbs.twimg.com/media/FUVOcuVXEAAZwsJ?fo...


# OCR
Images to content (as dataframe)

In [8]:
from PIL import Image
from requests import get
from typing import Tuple
from urllib.parse import urlsplit, parse_qsl
import pytesseract

# OCR logic
def ocr_cropped(image_obj: Image, bounding_box: Tuple[int, int, int, int]) -> str:
    # Crop image to show affected areas
    cropped_image = image_obj.crop(bounding_box)
    # display(cropped_image)
    
    # Convert image to text
    text = pytesseract.image_to_string(cropped_image, config='--psm 6')
    return text.replace('\n', ' ')

def read_meralco_bulletin(image_obj: Image) -> Tuple[str, str, str, str]:
    # Get image dimensions
    width, _ = image_obj.size

    # Define crop for image regions
    box_outage_date = (85, 220, 480, 350)
    box_outage_time = (85, 385, 480, 480)
    box_outage_area = (490, 110, width, 325)
    box_affected_areas = (520, 390, width, 910)

    # Return recognized text from each area
    return (
        ocr_cropped(image_obj, box_outage_date),
        ocr_cropped(image_obj, box_outage_time),
        ocr_cropped(image_obj, box_outage_area),
        ocr_cropped(image_obj, box_affected_areas)
    )

meralco_info = pd.DataFrame(columns=['Tweet ID', 'Username', 'Outage Area', 'Outage Date', 'Outage Time', 'Affected Areas'])
# Iterate through every row in the dataframe
for index, row in meralco_df.iterrows():
    # Check if the tweet has an attached image
    if row['Image URL'] is not None:
        # Download the image
        response = get(row['Image URL'], stream=True)
        
        # Parse image URL to figure out the format from the query string
        image_url_qs = dict(parse_qsl(urlsplit(row['Image URL']).query))
        image_format = image_url_qs['format']

        # Create a PIL image from the response
        image = Image.open(response.raw)

        # OCR stuff
        print(f'\nTweet ID: {row["Tweet ID"]}')
        outage_date, outage_time, outage_area, outage_areas = read_meralco_bulletin(image)
        new = pd.DataFrame({'Tweet ID':[row["Tweet ID"]],
                            'Username':[row["Username"]],
                            'Outage Area':[outage_area],
                            'Outage Date':[outage_date],
                            'Outage Time':[outage_time],
                            'Affected Areas':[outage_areas]})
        # display(new)
        print(meralco_info.size)
        meralco_info = pd.concat([meralco_info, new])
        # display(meralco_info)

        # Save the image
        # image_path = normpath(join('images', f'{row["Tweet ID"]}.{image_format}'))
        # image.save(image_path)

        # Delete the response
        del response


Tweet ID: 1532722545081962497
0

Tweet ID: 1532721279924686848
6

Tweet ID: 1532720026574258178
12

Tweet ID: 1532718765749698567
18

Tweet ID: 1532717507861176321
24


In [9]:
display(meralco_info)


Unnamed: 0,Tweet ID,Username,Outage Date,Outage Time,Affected Areas,Outage Area
0,1532722545081962497,meralco,"JUNE 11, 2022 SATURDAY",BETWEEN 9:00AM AND 9:30AM AND THEN BETWEEN 3:3...,Bgy. Pinagtulayan in Norzagaray; Bgys. Banaban...,BULALAN (ANGAI; DUNA REMEDIOS TRINIDAD; NORZAG...
0,1532721279924686848,meralco,"JUNE 11, 2022 SATURDAY",BETWEEN 9:00AM AND 9:30AM AND THEN BETWEEN 3:3...,Portion of M. Valte Road from M. A. Fernando A...,BULALAN (ANGAI; DUNA REMEDIOS TRINIDAD; NORZAG...
0,1532720026574258178,meralco,"JUNE 8 - 9, 2022 WEDNESDAY TO THURSDAY","BETWEEN 11:00PM (WED., 06/08/22) AND 5:00AM (T...","Litex Subd., Metro Manila Hills Subd. Phases 2...",RIZAL PROVINCE (RODRIGUEZ)
0,1532718765749698567,meralco,"JUNE 8 - 9, 2022 WEDNESDAY TO THURSDAY","BETWEEN 11:00PM (WED., 06/08/22) AND 5:00AM (T...","Portion of Banaba, Lawaan and Tanguile Sts. fr...",RIZAL PROVINCE (RODRIGUEZ)
0,1532717507861176321,meralco,"JUNE 11, 2022 SATURDAY",BETWEEN 10:00AM AND 2:00PM },Portion of Evertex Road from Maharlika Highway...,LAGUNA (CALAMBA CITY)
