Preparando imports

In [4]:
import csv
import datetime
import json
import re
import sys
import urllib.request
import urllib.error
import urllib.parse
import http.cookiejar

from dateutil.relativedelta import relativedelta
from pyquery import PyQuery

import pandas as pd

Preparando classes e métodos de suporte à mineração no Twitter

In [7]:

class Tweet:
    """
    Plain attribute container for the data of a single tweet.

    The class declares no fields of its own: attributes are attached to each
    instance by the code that creates it (TweetAdvancedQuery.query sets
    tweet_id, permalink, username, text, date, formatted_date, retweets,
    favorites, mentions, hashtags, geo, urls and user_id).
    """


class QueryArgs:
    """
    Value holder for the arguments of a Twitter advanced search.

    Every constructor parameter is stored unchanged on the instance under the
    same name.

    Args:
        results: number of tweets wanted (a value <= 0 disables the limit
            check done by TweetAdvancedQuery.query).
        query: free-text search expression.
        username: restricts the search to tweets from this account.
        location: place name for the ``near:`` filter.
        location_radius: radius for the ``within:`` filter, in miles; only
            used when ``location`` is given.
        since: lower date bound of the search.
        until: upper date bound of the search.
        lang: language code sent as the ``lang`` URL parameter.
        top_tweets: stored as-is; not read by the code in this notebook.
    """

    def __init__(
        self,
        results=1,
        query=None,
        username=None,
        location=None,
        location_radius=None,
        since=None,
        until=None,
        lang=None,
        top_tweets=None,
    ):
        # What to look for.
        self.query = query
        self.username = username
        self.lang = lang

        # Where.
        self.location = location
        self.location_radius = location_radius

        # When.
        self.since = since
        self.until = until

        # How much / which ranking.
        self.results = results
        self.top_tweets = top_tweets


class TweetAdvancedQuery:
    """
    Runs the search using http and json to recover the tweet information.

    The search is done against Twitter's legacy ``/i/search/timeline``
    endpoint, which answers with JSON holding an HTML fragment
    (``items_html``) and a paging cursor (``min_position``).
    """

    # Compiled once for the whole class. The patterns use a single backslash
    # inside a raw string so that ``\w`` is the regex word-character class
    # (a doubled backslash would match a literal '\' followed by 'w's).
    _MENTION_RE = re.compile(r'(@\w*)')
    _HASHTAG_RE = re.compile(r'(#\w*)')

    @staticmethod
    def query(args, proxy=None, verbose=False):
        """
        Runs the query with the received args and returns the collected tweets.

        Pages are requested until the endpoint returns no more tweets or
        until ``args.results`` tweets were gathered (when ``args.results`` is
        greater than zero).

        Args:
            args: object with the QueryArgs attributes.
            proxy: optional proxy address used for both http and https.
            verbose: when true, prints progress after each page.

        Returns:
            list of Tweet objects in the order they were received.

        Raises:
            ConnectionError: when a page request fails.
        """
        if verbose:
            print('Start collect')

        refresh_cursor = ''
        results = []
        cookie_jar = http.cookiejar.CookieJar()
        active = True

        while active:
            # Named 'page' (not 'json') so the json module is not shadowed.
            page = TweetAdvancedQuery._get_json_reponse(args, refresh_cursor, cookie_jar, proxy)
            if len(page['items_html'].strip()) == 0:
                break

            refresh_cursor = page['min_position']
            tweets = PyQuery(page['items_html'])('div.js-stream-tweet')
            if len(tweets) == 0:
                break

            for tweet_html in tweets:
                results.append(TweetAdvancedQuery._parse_tweet(PyQuery(tweet_html)))

                if args.results > 0 and len(results) >= args.results:
                    active = False
                    break

            if verbose:
                print(f'collected {len(results)} tweets')

        return results

    @staticmethod
    def _parse_tweet(tweet_pq):
        """
        Builds a Tweet from the PyQuery wrapper of one ``div.js-stream-tweet``.

        The CSS selectors and ``data-*`` attribute names mirror Twitter's
        legacy search markup; a tweet missing the counters, timestamp or
        user link makes the corresponding conversion raise.
        """
        tweet = Tweet()

        tweet_text = re.sub(
            r'\s+', ' ',
            tweet_pq('p.js-tweet-text').text().replace('# ', '#').replace('@ ', '@'))
        date_info = int(tweet_pq('small.time span.js-short-timestamp').attr('data-time'))
        geo_span = tweet_pq('span.Tweet-geo')

        # Only anchors carrying an expanded URL are kept.
        urls = [link.attrib['data-expanded-url']
                for link in tweet_pq('a')
                if 'data-expanded-url' in link.attrib]

        # 'data-tweet-id' is the attribute holding the tweet identifier; the
        # previous name 'data-tweet-tweet_id' looks like a rename artefact and
        # always produced None. NOTE(review): confirm against a live response.
        tweet.tweet_id = tweet_pq.attr('data-tweet-id')
        tweet.permalink = f"https://twitter.com{tweet_pq.attr('data-permalink-path')}"
        tweet.username = tweet_pq('span.username.js-action-profile-name b').text()
        tweet.text = tweet_text
        # Kept as a naive local-time datetime, as before.
        tweet.date = datetime.datetime.fromtimestamp(date_info)
        # The format string hard-codes '+0000', so the value must be in UTC.
        tweet.formatted_date = datetime.datetime.fromtimestamp(
            date_info, datetime.timezone.utc).strftime('%a %b %d %X +0000 %Y')
        tweet.retweets = int(
            tweet_pq('span.ProfileTweet-action--retweet span.ProfileTweet-actionCount')
            .attr('data-tweet-stat-count').replace(',', ''))
        tweet.favorites = int(
            tweet_pq('span.ProfileTweet-action--favorite span.ProfileTweet-actionCount')
            .attr('data-tweet-stat-count').replace(',', ''))
        tweet.mentions = ' '.join(TweetAdvancedQuery._MENTION_RE.findall(tweet.text))
        tweet.hashtags = ' '.join(TweetAdvancedQuery._HASHTAG_RE.findall(tweet.text))
        tweet.geo = geo_span.attr('title') if len(geo_span) > 0 else ''
        tweet.urls = ','.join(urls)
        tweet.user_id = int(tweet_pq('a.js-user-profile-link').attr('data-user-id'))
        return tweet

    @staticmethod
    def _get_json_reponse(args, refresh_cursor, cookie_jar, proxy):
        """
        Collects the twitter query response.

        Args:
            args: object with the QueryArgs attributes used to build the
                search expression.
            refresh_cursor: paging cursor sent as ``max_position``.
            cookie_jar: cookie jar shared between the page requests.
            proxy: optional proxy address used for both http and https.

        Returns:
            the decoded JSON document of the response.

        Raises:
            ConnectionError: when the request or the read of its body fails.
        """
        url_get_data = ''
        if args.username is not None:
            url_get_data += f' from:{args.username}'
        if args.since is not None:
            url_get_data += f' since:{args.since}'
        if args.until is not None:
            url_get_data += f' until:{args.until}'
        if args.location is not None:
            url_get_data += f' near:{args.location}'
            if args.location_radius is not None:
                url_get_data += f' within:{args.location_radius}mi'
        if args.query is not None:
            url_get_data += f' {args.query}'
        url_lang = f'lang={args.lang}&' if args.lang is not None else ''

        url = 'https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&%smax_position=%s'
        url = url % (urllib.parse.quote(url_get_data), url_lang, refresh_cursor)

        headers = [
            ('Host', 'twitter.com'),
            ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'),
            ('Accept', 'application/json, text/javascript, */*; q=0.01'),
            ('Accept-Language', 'de,en-US;q=0.7,en;q=0.3'),
            ('X-Requested-With', 'XMLHttpRequest'),
            ('Referer', url),
            ('Connection', 'keep-alive')
        ]

        handlers = [urllib.request.HTTPCookieProcessor(cookie_jar)]
        if proxy:
            handlers.insert(0, urllib.request.ProxyHandler({'http': proxy, 'https': proxy}))
        opener = urllib.request.build_opener(*handlers)
        opener.addheaders = headers

        try:
            # The context manager closes the connection even if read() fails.
            with opener.open(url) as response:
                raw_body = response.read()
        except Exception as error:
            # 'Exception' (not a bare except) lets KeyboardInterrupt and
            # SystemExit through, so a running collection can be stopped.
            raise ConnectionError(f'Twitter weird response. Try to see on browser: https://twitter.com/search?q={urllib.parse.quote(url_get_data)}&src=typd') from error

        return json.loads(raw_body.decode())


def collect_event_tweets(query, since, until, results_per_day=1000, location=None, location_radius=None,
                         on_some_collected=None, verbose=False, max_retries=3):
    """
    Helps the tweet collection by fetching tweets one day at a time over a date interval.

    Each day in ``[since, until)`` is searched separately (English tweets
    only, ``lang="en"``), so ``results_per_day`` caps every single day rather
    than the whole interval.

    Args:
        query: search expression passed to the advanced search.
        since: first day to collect, as ``'YYYY-MM-DD'``.
        until: day at which collection stops (exclusive), as ``'YYYY-MM-DD'``.
            When it is not after ``since`` nothing is collected.
        results_per_day: maximum number of tweets collected per day.
        location: optional place name for the search.
        location_radius: optional radius around ``location``.
        on_some_collected: optional callable invoked with the list of tweets
            of each day right after that day was collected.
        verbose: when true, prints progress information.
        max_retries: how many times a day is retried after a ConnectionError
            before it is skipped; ``None`` retries without limit.

    Returns:
        list with the tweets of all collected days.
    """
    if verbose:
        print('collecting tweets of event per day')

    tweets = []
    current_date = datetime.date(*(int(part) for part in since.split('-')))
    until_date = datetime.date(*(int(part) for part in until.split('-')))
    one_day = datetime.timedelta(days=1)
    failures = 0  # consecutive failed attempts for the current day

    # '<' instead of '!=': an inverted interval ends immediately instead of
    # walking forward in time forever.
    while current_date < until_date:
        next_date = current_date + one_day
        if verbose:
            print(f'current day: {str(current_date)}')

        args = QueryArgs(
            query=query,
            lang="en",
            results=results_per_day,
            location=location,
            location_radius=location_radius,
            since=str(current_date),
            until=str(next_date))

        try:
            current_day_tweets = TweetAdvancedQuery.query(args, verbose=verbose)
        except ConnectionError as e:
            print(e)
            print(f'error while collecting tweets of day {current_date}')
            failures += 1
            if max_retries is None or failures <= max_retries:
                # Same day again on the next iteration.
                continue
            # Retries exhausted: skip the day so a persistent failure cannot
            # stall the whole collection.
            print(f'giving up on day {current_date} after {max_retries} retries')
            failures = 0
            current_date = next_date
            continue

        failures = 0
        tweets.extend(current_day_tweets)
        if on_some_collected is not None:
            on_some_collected(current_day_tweets)

        current_date = next_date

    return tweets

Coletando tweets

In [8]:
# The list is filled through the callback, which collect_event_tweets calls
# once per collected day, so whatever was gathered so far stays available in
# `tweets` if the run is stopped midway.
tweets = []
collect_event_tweets(
    query="Worlds2017",
    results_per_day=2000,
    since='2017-10-16',
    until='2017-11-04',
    on_some_collected=tweets.extend,
    verbose=True,
)
# No trailing comma: it would turn this line into a one-element tuple that
# the notebook echoes as `(None,)`.
print('finished')

collecting tweets of event per day
current day: 2017-10-16
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
collected 120 tweets
collected 140 tweets
collected 160 tweets
collected 180 tweets
collected 200 tweets
collected 220 tweets
collected 240 tweets
collected 260 tweets
collected 280 tweets
collected 300 tweets
collected 320 tweets
collected 340 tweets
collected 360 tweets
collected 380 tweets
collected 400 tweets
collected 420 tweets
collected 440 tweets
collected 460 tweets
collected 480 tweets
collected 500 tweets
collected 520 tweets
collected 540 tweets
collected 560 tweets
collected 578 tweets
collected 598 tweets
collected 618 tweets
collected 638 tweets
collected 658 tweets
collected 678 tweets
collected 698 tweets
collected 718 tweets
collected 738 tweets
collected 758 tweets
collected 778 tweets
collected 798 tweets
collected 801 tweets
current day: 2017-10-17
Start collect
collected 20 tweets
collected 40

collected 1970 tweets
collected 1988 tweets
collected 2000 tweets
current day: 2017-10-21
Start collect
collected 17 tweets
collected 37 tweets
collected 57 tweets
collected 76 tweets
collected 96 tweets
collected 114 tweets
collected 133 tweets
collected 153 tweets
collected 172 tweets
collected 191 tweets
collected 211 tweets
collected 230 tweets
collected 250 tweets
collected 270 tweets
collected 290 tweets
collected 310 tweets
collected 330 tweets
collected 350 tweets
collected 370 tweets
collected 390 tweets
collected 409 tweets
collected 427 tweets
collected 447 tweets
collected 467 tweets
collected 486 tweets
collected 506 tweets
collected 526 tweets
collected 541 tweets
collected 561 tweets
collected 581 tweets
collected 600 tweets
collected 620 tweets
collected 640 tweets
collected 660 tweets
collected 680 tweets
collected 700 tweets
collected 718 tweets
collected 738 tweets
collected 758 tweets
collected 778 tweets
collected 797 tweets
collected 815 tweets
collected 832 tweet

collected 324 tweets
collected 335 tweets
collected 341 tweets
collected 347 tweets
collected 353 tweets
collected 364 tweets
collected 370 tweets
collected 377 tweets
collected 389 tweets
collected 397 tweets
collected 401 tweets
collected 407 tweets
collected 418 tweets
collected 425 tweets
collected 435 tweets
collected 445 tweets
collected 452 tweets
collected 455 tweets
collected 460 tweets
collected 469 tweets
collected 473 tweets
collected 481 tweets
collected 490 tweets
collected 502 tweets
collected 507 tweets
collected 517 tweets
collected 527 tweets
collected 535 tweets
collected 542 tweets
collected 547 tweets
collected 549 tweets
collected 554 tweets
collected 563 tweets
collected 571 tweets
collected 580 tweets
collected 591 tweets
collected 598 tweets
collected 605 tweets
collected 616 tweets
collected 625 tweets
collected 638 tweets
collected 645 tweets
collected 655 tweets
collected 666 tweets
collected 676 tweets
collected 686 tweets
collected 696 tweets
collected 706

collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
collected 120 tweets
collected 140 tweets
collected 160 tweets
collected 180 tweets
collected 200 tweets
collected 220 tweets
collected 240 tweets
collected 260 tweets
collected 280 tweets
collected 300 tweets
collected 320 tweets
collected 340 tweets
collected 360 tweets
collected 380 tweets
collected 400 tweets
collected 420 tweets
collected 440 tweets
collected 460 tweets
collected 480 tweets
collected 500 tweets
collected 519 tweets
collected 539 tweets
collected 559 tweets
collected 579 tweets
collected 599 tweets
collected 619 tweets
collected 639 tweets
collected 659 tweets
collected 679 tweets
collected 699 tweets
collected 719 tweets
collected 739 tweets
collected 746 tweets
current day: 2017-11-02
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
collected 120 tweets
collected 140 tweets
collected 160 tweets
coll

Criando data frame com os tweets coletados e criando um CSV para armazenar os tweets

In [13]:

# Only the timestamp and the text of each tweet go into the frame, in this order.
tweet_columns = ['date', 'tweet']

tweets_data = [
    dict(zip(tweet_columns, (item.date, item.text)))
    for item in tweets
]

tweets_dataFrame = pd.DataFrame(tweets_data, columns=tweet_columns)

# Every ';' in the text becomes ','. Presumably this keeps the text free of
# the ';' separator used when the frame is exported to CSV.
tweet_text_without_semicolons = tweets_dataFrame['tweet'].str.replace(';', ',')
tweets_dataFrame['tweet'] = tweet_text_without_semicolons

tweets_dataFrame.head()

Unnamed: 0,date,tweet
0,2017-10-16 20:53:27,I liked a @YouTube video http:// youtu.be/yCx3...
1,2017-10-16 20:47:20,Fnatic's Miracle Run & America's Last Hope - T...
2,2017-10-16 20:46:40,Shanghai #worlds2017 semifinal tickets go on s...
3,2017-10-16 20:46:35,Me gustó un video de @YouTube http:// youtu.be...
4,2017-10-16 20:46:29,Me ha gustado un vídeo de @YouTube ( http:// y...


In [14]:
# Writes the frame (index column included) to a file named "tweets" in the
# current working directory, using ';' as the field separator.
tweets_dataFrame.to_csv(
    "tweets",
    sep=';',
    encoding='utf-8',
)