Preparando imports

In [1]:
import csv
import datetime
import json
import re
import sys
import urllib.request
import urllib.error
import urllib.parse
import http.cookiejar

from dateutil.relativedelta import relativedelta
from pyquery import PyQuery

import pandas as pd

Preparando classes e métodos de suporte à mineração no Twitter

In [2]:
class Tweet:
    """
    Plain attribute bag describing a single tweet.

    The class declares no fields of its own: whoever creates an instance
    attaches the attributes it needs (text, date, username, ...) afterwards.
    """


class QueryArgs:
    """
    Value holder for the parameters of one Twitter advanced search.

    Every constructor argument is stored unchanged on the instance under
    the same name; no validation or conversion is performed here.
    """

    def __init__(
        self,
        results=1,
        query=None,
        username=None,
        location=None,
        location_radius=None,
        since=None,
        until=None,
        lang=None,
        top_tweets=None,
    ):
        # What to search for.
        self.query = query
        self.username = username
        self.lang = lang
        self.top_tweets = top_tweets

        # Where: a place name plus an optional radius around it.
        self.location = location
        self.location_radius = location_radius

        # When: lower and upper bounds of the search window.
        self.since = since
        self.until = until

        # How many tweets to collect.
        self.results = results


class TweetAdvancedQuery:
    """
    Runs the search using http and json to recover the tweet information.

    The search endpoint answers with a JSON document whose ``items_html``
    entry is an HTML fragment holding the tweets and whose ``min_position``
    entry is the cursor to request the next page with.
    """

    # Patterns are compiled once instead of once per tweet.
    # A single backslash is required inside the raw strings: ``\\w`` would
    # match a literal backslash and never find a mention or hashtag.
    _MENTION_RE = re.compile(r'(@\w*)')
    _HASHTAG_RE = re.compile(r'(#\w*)')
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def query(args, proxy=None, verbose=False):
        """
        Runs the query with the received args and returns the collected tweets.

        :param args: object exposing the ``QueryArgs`` attributes.
        :param proxy: optional proxy address used for both http and https.
        :param verbose: when true, prints progress after every fetched page.
        :return: list of ``Tweet`` objects; at most ``args.results`` of them
                 when ``args.results`` is greater than zero.
        :raises ConnectionError: when a page cannot be downloaded.
        """
        if verbose:
            print('Start collect')

        refresh_cursor = ''
        results = []
        cookie_jar = http.cookiejar.CookieJar()
        active = True

        while active:
            # Named ``payload`` so the ``json`` module is not shadowed.
            payload = TweetAdvancedQuery._get_json_reponse(args, refresh_cursor, cookie_jar, proxy)
            if not payload['items_html'].strip():
                break

            refresh_cursor = payload['min_position']
            tweets = PyQuery(payload['items_html'])('div.js-stream-tweet')
            if len(tweets) == 0:
                break

            for tweet_html in tweets:
                results.append(TweetAdvancedQuery._parse_tweet(tweet_html))

                # A non-positive ``results`` means "no limit".
                if args.results > 0 and len(results) >= args.results:
                    active = False
                    break

            if verbose:
                print(f'collected {len(results)} tweets')

        return results

    @staticmethod
    def _parse_tweet(tweet_html):
        """
        Builds a ``Tweet`` from the HTML element of one search result.
        """
        tweet_pq = PyQuery(tweet_html)

        raw_text = tweet_pq('p.js-tweet-text').text().replace('# ', '#').replace('@ ', '@')
        retweets = tweet_pq('span.ProfileTweet-action--retweet span.ProfileTweet-actionCount')
        favorites = tweet_pq('span.ProfileTweet-action--favorite span.ProfileTweet-actionCount')
        date_info = int(tweet_pq('small.time span.js-short-timestamp').attr('data-time'))
        geo_span = tweet_pq('span.Tweet-geo')

        tweet = Tweet()
        # NOTE(review): the attribute used to be read as 'data-tweet-tweet_id',
        # which looks like a mangled rename of 'data-tweet-id' - confirm
        # against the markup actually returned.
        tweet.tweet_id = tweet_pq.attr('data-tweet-id')
        tweet.permalink = f"https://twitter.com{tweet_pq.attr('data-permalink-path')}"
        tweet.username = tweet_pq('span.username.js-action-profile-name b').text()
        tweet.text = TweetAdvancedQuery._WHITESPACE_RE.sub(' ', raw_text)
        # ``date`` stays a naive local datetime, as callers already rely on it.
        tweet.date = datetime.datetime.fromtimestamp(date_info)
        # The format string hard-codes a '+0000' offset, so the value being
        # formatted has to be UTC rather than local time.
        tweet.formatted_date = datetime.datetime.fromtimestamp(
            date_info, tz=datetime.timezone.utc).strftime('%a %b %d %X +0000 %Y')
        tweet.retweets = int(retweets.attr('data-tweet-stat-count').replace(',', ''))
        tweet.favorites = int(favorites.attr('data-tweet-stat-count').replace(',', ''))
        tweet.mentions = ' '.join(TweetAdvancedQuery._MENTION_RE.findall(tweet.text))
        tweet.hashtags = ' '.join(TweetAdvancedQuery._HASHTAG_RE.findall(tweet.text))
        tweet.geo = geo_span.attr('title') if len(geo_span) > 0 else ''
        # Only links carrying an expanded URL are kept.
        tweet.urls = ','.join(
            link.attrib['data-expanded-url']
            for link in tweet_pq('a')
            if 'data-expanded-url' in link.attrib)
        tweet.user_id = int(tweet_pq('a.js-user-profile-link').attr('data-user-id'))
        return tweet

    @staticmethod
    def _build_search_terms(args):
        """
        Assembles the (unquoted) search expression from the query arguments.

        Every term is prefixed with a space, so a non-empty result starts
        with one.
        """
        terms = ''
        if args.username is not None:
            terms += f' from:{args.username}'
        if args.since is not None:
            terms += f' since:{args.since}'
        if args.until is not None:
            terms += f' until:{args.until}'
        if args.location is not None:
            terms += f' near:{args.location}'
            # A radius only makes sense together with a location.
            if args.location_radius is not None:
                terms += f' within:{args.location_radius}mi'
        if args.query is not None:
            terms += f' {args.query}'
        return terms

    @staticmethod
    def _get_json_reponse(args, refresh_cursor, cookie_jar, proxy):
        """
        Collects the twitter query response.

        :param args: object exposing the ``QueryArgs`` attributes.
        :param refresh_cursor: ``min_position`` of the previous page, or ''
                               for the first page.
        :param cookie_jar: cookie jar shared by all requests of one query.
        :param proxy: optional proxy address used for both http and https.
        :return: the decoded JSON document.
        :raises ConnectionError: when the request or the download fails.
        """
        url_get_data = TweetAdvancedQuery._build_search_terms(args)
        url_lang = f'lang={args.lang}&' if args.lang is not None else ''

        url = 'https://twitter.com/i/search/timeline?l=en&f=tweets&q=%s&src=typd&%smax_position=%s'
        url = url % (urllib.parse.quote(url_get_data), url_lang, refresh_cursor)

        headers = [
            ('Host', 'twitter.com'),
            ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'),
            ('Accept', 'application/json, text/javascript, */*; q=0.01'),
            ('Accept-Language', 'de,en-US;q=0.7,en;q=0.3'),
            ('X-Requested-With', 'XMLHttpRequest'),
            ('Referer', url),
            ('Connection', 'keep-alive')
        ]

        handlers = [urllib.request.HTTPCookieProcessor(cookie_jar)]
        if proxy:
            handlers.insert(0, urllib.request.ProxyHandler({'http': proxy, 'https': proxy}))
        opener = urllib.request.build_opener(*handlers)
        opener.addheaders = headers

        try:
            # The context manager closes the connection once the body is read.
            with opener.open(url) as response:
                raw_response = response.read()
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C is
            # not converted into a ConnectionError that callers retry on.
            raise ConnectionError(f'Twitter weird response. Try to see on browser: https://twitter.com/search?q={urllib.parse.quote(url_get_data)}&src=typd') from err

        return json.loads(raw_response.decode())


def collect_event_tweets(query, since, until, results_per_day=1000, location=None, location_radius=None,
                         on_some_collected=None, verbose=False, max_retries=3):
    """
    Collects tweets day by day over the half-open interval [since, until).

    :param query: search text handed to the Twitter query.
    :param since: first day to collect, as 'YYYY-MM-DD'.
    :param until: day to stop at (not collected itself), as 'YYYY-MM-DD'.
                  Nothing is collected when it is not later than ``since``.
    :param results_per_day: maximum number of tweets requested per day.
    :param location: optional place name to search near.
    :param location_radius: optional radius around ``location``.
    :param on_some_collected: optional callable invoked with the list of
                              tweets of every successfully collected day.
    :param verbose: when true, prints progress information.
    :param max_retries: how many times a day is retried after a
                        ConnectionError before it is skipped; ``None``
                        retries without limit.
    :return: list with the tweets of all collected days.
    """
    if verbose:
        print('collecting tweets of event per day')

    tweets = []
    current_date = datetime.date(*(int(part) for part in since.split('-')))
    until_date = datetime.date(*(int(part) for part in until.split('-')))
    one_day = datetime.timedelta(days=1)
    failed_attempts = 0

    # '<' instead of '!=': a start date after the end date must yield an
    # empty result, not a loop that never reaches its stop condition.
    while current_date < until_date:
        next_date = current_date + one_day
        if verbose:
            print(f'current day: {str(current_date)}')

        args = QueryArgs(
            query=query,
            lang="en",
            results=results_per_day,
            location=location,
            location_radius=location_radius,
            since=str(current_date),
            until=str(next_date))

        try:
            current_day_tweets = TweetAdvancedQuery.query(args, verbose=verbose)
        except ConnectionError as e:
            print(e)
            print(f'error while collecting tweets of day {current_date}')
            failed_attempts += 1
            if max_retries is None or failed_attempts <= max_retries:
                # Try the same day again.
                continue
            # Retries exhausted: move on instead of looping on this day forever.
            print(f'giving up on day {current_date} after {failed_attempts} failed attempts')
        else:
            tweets.extend(current_day_tweets)
            if on_some_collected is not None:
                on_some_collected(current_day_tweets)

        failed_attempts = 0
        current_date = next_date

    return tweets

Coletando tweets

In [3]:
# Each collected day's batch is appended to `tweets` through the callback;
# the list returned by collect_event_tweets itself is not used.
tweets = []
collect_event_tweets(
    query="Worlds2017",
    since='2017-10-16',
    until='2017-11-04',
    results_per_day=100,
    on_some_collected=tweets.extend,
    verbose=True,
)
print('finished')

collecting tweets of event per day
current day: 2017-10-16
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-10-17
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-10-18
Start collect
collected 20 tweets
collected 40 tweets
collected 59 tweets
collected 79 tweets
collected 99 tweets
collected 100 tweets
current day: 2017-10-19
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-10-20
Start collect
collected 20 tweets
collected 40 tweets
collected 59 tweets
collected 78 tweets
collected 96 tweets
collected 100 tweets
current day: 2017-10-21
Start collect
collected 18 tweets
collected 36 tweets
collected 56 tweets
collected 76 tweets
Twitter weird response. Try to see on browser: https://twitter.com/search?q=%20since%3A2017-10-21%20until%3A2017

(None,)

Criando data frame com os tweets coletados e criando um CSV para armazenar os tweets

In [6]:
tweet_columns = ['date', 'tweet']

# One record per collected tweet: its date and its text.
tweets_data = [dict(date=tweet.date, tweet=tweet.text) for tweet in tweets]

tweets_dataFrame = pd.DataFrame(tweets_data, columns=tweet_columns)

# The frame is saved with ';' as the field separator, so any ';' inside the
# tweet text is swapped for ','.
tweet_text = tweets_dataFrame['tweet']
tweets_dataFrame['tweet'] = tweet_text.str.replace(';', ',')

# Last expression of the cell: shows the first rows.
tweets_dataFrame.head()

Unnamed: 0,date,tweet
0,2017-10-16 20:53:27,I liked a @YouTube video http:// youtu.be/yCx3...
1,2017-10-16 20:47:20,Fnatic's Miracle Run & America's Last Hope - T...
2,2017-10-16 20:46:40,Shanghai #worlds2017 semifinal tickets go on s...
3,2017-10-16 20:46:29,Me ha gustado un vídeo de @YouTube ( http:// y...
4,2017-10-16 20:44:34,Discover Worlds with @TravisGafford and @Mobal...


In [7]:
# Write the frame to the file named "tweets" as ';'-separated UTF-8 text.
tweets_dataFrame.to_csv(path_or_buf="tweets", sep=';', encoding='utf-8')

Uma vez minerados os tweets gerais sobre o campeonato, seguiremos minerando as opiniões acerca de cada time.

In [3]:
teams = ['FNC', 'MSF', 'C9', 'LZ', 'WE', 'RNG', 'SKT', 'SSG']
# since='2017-10-16'
# until='2017-11-04'
# final dates https://www.maisesports.com.br/mundial-2017-cobertura-tabelas-datas/
key_dates = ['2017-10-19', '2017-10-28', '2017-11-04']
tweet_columns = ['date', 'tweet']
teamns_tweets = []
print('finished')

for index, team in enumerate(teams):
    print(team)
    print(index)

    # The first four teams are collected up to key_dates[0], the next two up
    # to key_dates[1] and the remaining ones up to key_dates[2].
    key_index = 0 if index < 4 else 1 if index < 6 else 2

    collect_event_tweets(
        query=team,
        since='2017-10-16',
        until=key_dates[key_index],
        results_per_day=100,
        on_some_collected=teamns_tweets.extend,
        verbose=True,
    )

    tweets_data = [dict(date=tweet.date, tweet=tweet.text, team=team) for tweet in teamns_tweets]

    # NOTE(review): 'team' is not listed in tweet_columns, so only 'date' and
    # 'tweet' end up in the frame and in the file - confirm this is intended.
    tweets_dataFrame = pd.DataFrame(tweets_data, columns=tweet_columns)
    tweet_text = tweets_dataFrame['tweet']
    # ';' is the separator of the file written below, so replace it in the text.
    tweets_dataFrame['tweet'] = tweet_text.str.replace(';', ',')
    tweets_dataFrame.head()

    # One output file per team, named after the team.
    tweets_dataFrame.to_csv(path_or_buf=team, sep=';', encoding='utf-8')

    # Fresh list for the next team.
    teamns_tweets = []
    

finished
FNC
0
collecting tweets of event per day
current day: 2017-10-16
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-10-17
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-10-18
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 77 tweets
collected 94 tweets
collected 100 tweets
MSF
1
collecting tweets of event per day
current day: 2017-10-16
Start collect
collected 16 tweets
collected 36 tweets
collected 52 tweets
collected 71 tweets
collected 91 tweets
collected 100 tweets
current day: 2017-10-17
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-10-18
Start collect
collected 17 tweets
collected 35 tweets
collected 55 tweets
collected 75 tweets
collected 95 tweets
collected 100 tweets
C9
2
collecting

collected 100 tweets
current day: 2017-11-01
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-11-02
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
current day: 2017-11-03
Start collect
collected 20 tweets
collected 40 tweets
collected 60 tweets
collected 80 tweets
collected 100 tweets
SSG
7
collecting tweets of event per day
current day: 2017-10-16
Start collect
collected 20 tweets
collected 39 tweets
collected 59 tweets
collected 78 tweets
collected 97 tweets
collected 100 tweets
current day: 2017-10-17
Start collect
collected 16 tweets
collected 31 tweets
collected 51 tweets
collected 71 tweets
collected 91 tweets
collected 100 tweets
current day: 2017-10-18
Start collect
collected 20 tweets
collected 39 tweets
collected 59 tweets
collected 79 tweets
collected 99 tweets
collected 100 tweets
current day: 2017-10-19
Start collect
collec