In [1]:
#Conectarnos a la API de Twitter para recopilar datos.
#utilizar las relaciones tipo 'quién retuitea a quién' del topic Arduino

In [2]:
import tweepy
import pickle
from twitter_secrets import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = tweepy.API(auth)

try:
    api.verify_credentials()
    print("Autentificación OK")
except tweepy.TweepError as e:
    print("Falló la autentificación")
    print(e)

Autentificación OK


In [3]:
#recopilación de tweets
#La API nos proporciona hasta 18000 tweets cada 15 minutos (100 tweets para cada una de las 180 peticiones máximo permitidas),

In [4]:
#paginación con objeto cursor: http://docs.tweepy.org/en/v3.5.0/cursor_tutorial.html
for page_id, page in enumerate(tweepy.Cursor(
    api.search,q='arduino',count=100,
    tweet_mode='extended',
    languages = ['en', 'es']).pages()): # process status here
    
    fn = f'page_{page_id:06d}.pkl'
    with open("./pickle_arduino/"+fn, 'wb') as f:
        pickle.dump(page, f)

In [5]:
#paginación con objeto cursor: http://docs.tweepy.org/en/v3.5.0/cursor_tutorial.html
import json
for page_id, page in enumerate(tweepy.Cursor(
    api.search,q='Trump',count=100,
    tweet_mode='extended',
    languages = ['en', 'es']).pages()): # process status here
    
    tweets = [i._json for i in page]
    fn = f'page_{page_id:06d}.json'
    with open("./json_trump/"+fn, 'w') as f:
        json.dump(tweets, f)
    

TweepError: Twitter error response: status code = 429

In [6]:
#necesitaremos unos 100000 tweets

In [7]:
#obtener relaciones

In [48]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np

def read_pickled_pages(fn):
    pickle_files = Path('./pickle_arduino/').glob(fn)
    statuses = []
    for page_fn in pickle_files:
        #print(page_fn)
        with open(page_fn, 'rb') as f:
            statuses.extend(pickle.load(f))
    return statuses

def read_json_pages(fn):
    json_files = Path('./json_trump/').glob(fn)
    statuses = []
    for page_fn in json_files:
        #print(page_fn)
        with open(page_fn, 'rb') as f:
            statuses.extend(json.load(f))
    return statuses

def flatten_dict(nested_json, exclude=['']):
    """Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
            exclude: Keys to exclude from output.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude: flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

def statuses_to_pandas(statuses):
    data = []
    for status in statuses:
        data.append(flatten_dict(status._json))
    return pd.DataFrame(data)

def statuses_to_pandas_json(statuses):
    data = []
    for status in statuses:
        data.append(flatten_dict(status))
    return pd.DataFrame(data)

def keep_only_user_id_fields(df):
    def is_user_id_column(c):
        return (('user_id' in c.lower() 
                 or ('user_mentions' in c.lower() and 'id' in c.lower()))
                 and not 'str' in c.lower())
    user_columns = sorted([c for c in df.columns if is_user_id_column(c)], key=len)
    df_nx = df[user_columns] 
    return df_nx


def user_df_to_graph(df_nx):

    graph = set()
    for index, row in df_nx.iterrows():
        user_id = row[0]
        assert(not np.isnan(user_id))
        for other_id in row[1:]:
            try:
                if other_id is not None and not np.isnan(other_id):
                    graph.add((int(user_id), int(other_id)))
            except TypeError:
                pass


    df_graph = pd.DataFrame(graph, columns=['src', 'dest'])
    return df_graph

statuses = read_pickled_pages('*.pkl') #lista con los statuses

In [25]:
len(statuses)

16983

In [26]:
tweets_pd = statuses_to_pandas(statuses)

In [31]:
tweets_pd.describe()

Unnamed: 0,id,display_text_range_0,display_text_range_1,entities_user_mentions_0_id,entities_user_mentions_0_indices_0,entities_user_mentions_0_indices_1,entities_media_0_id,entities_media_0_indices_0,entities_media_0_indices_1,entities_media_0_sizes_thumb_w,...,quoted_status_user_entities_description_urls_1_indices_0,quoted_status_user_entities_description_urls_1_indices_1,quoted_status_user_entities_description_urls_2_indices_0,quoted_status_user_entities_description_urls_2_indices_1,extended_entities_media_0_additional_media_info_source_user_entities_description_urls_1_indices_0,extended_entities_media_0_additional_media_info_source_user_entities_description_urls_1_indices_1,quoted_status_entities_urls_2_indices_0,quoted_status_entities_urls_2_indices_1,retweeted_status_user_entities_description_urls_3_indices_0,retweeted_status_user_entities_description_urls_3_indices_1
count,16983.0,16983.0,16983.0,11566.0,11566.0,11566.0,3005.0,3005.0,3005.0,3005.0,...,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,1.266635e+18,2.988871,128.902196,2.327292e+17,7.545046,17.669635,1.265537e+18,127.996007,150.996007,149.852246,...,99.4,121.8,125.0,148.0,119.0,142.0,78.0,101.0,168.0,191.0
std,845263600000000.0,27.076888,57.234558,4.292202e+17,21.950922,22.126279,1.339641e+16,69.632538,69.632538,2.494761,...,32.020306,32.337285,,,,,,,,
min,1.265262e+18,0.0,1.0,6136.0,0.0,4.0,8.373249e+17,9.0,32.0,91.0,...,55.0,78.0,125.0,148.0,119.0,142.0,78.0,101.0,168.0,191.0
25%,1.265941e+18,0.0,99.0,33884540.0,3.0,9.0,1.265499e+18,87.0,110.0,150.0,...,87.0,107.0,125.0,148.0,119.0,142.0,78.0,101.0,168.0,191.0
50%,1.266509e+18,0.0,139.0,266400800.0,3.0,13.0,1.266327e+18,105.0,128.0,150.0,...,94.0,117.0,125.0,148.0,119.0,142.0,78.0,101.0,168.0,191.0
75%,1.267456e+18,0.0,140.0,4760788000.0,3.0,17.0,1.267187e+18,162.0,185.0,150.0,...,128.0,151.0,125.0,148.0,119.0,142.0,78.0,101.0,168.0,191.0
max,1.268122e+18,658.0,932.0,1.266856e+18,270.0,278.0,1.268122e+18,735.0,758.0,150.0,...,133.0,156.0,125.0,148.0,119.0,142.0,78.0,101.0,168.0,191.0


In [32]:
#lista de relaciones id_user (user.id) y en retweeted_status.user.id

In [33]:
relaciones_pd=keep_only_user_id_fields(tweets_pd)

In [34]:
relaciones_pd.describe()

Unnamed: 0,user_id,in_reply_to_user_id,quoted_status_user_id,retweeted_status_user_id,entities_user_mentions_0_id,entities_user_mentions_1_id,entities_user_mentions_2_id,entities_user_mentions_3_id,entities_user_mentions_4_id,entities_user_mentions_5_id,...,retweeted_status_quoted_status_entities_user_mentions_1_id,retweeted_status_quoted_status_entities_user_mentions_2_id,retweeted_status_quoted_status_entities_user_mentions_3_id,retweeted_status_quoted_status_entities_user_mentions_4_id,retweeted_status_quoted_status_entities_user_mentions_5_id,extended_entities_media_0_additional_media_info_source_user_id,retweeted_status_quoted_status_entities_media_0_source_user_id,retweeted_status_quoted_status_extended_entities_media_0_source_user_id,quoted_status_extended_entities_media_0_additional_media_info_source_user_id,retweeted_status_extended_entities_media_0_additional_media_info_source_user_id
count,16983.0,1774.0,245.0,9192.0,11566.0,2095.0,906.0,454.0,222.0,156.0,...,30.0,9.0,2.0,1.0,1.0,72.0,25.0,25.0,2.0,20.0
mean,4.398314e+17,3.464544e+17,2.267731e+17,2.259187e+17,2.327292e+17,2.235437e+17,2.527618e+17,1.263024e+17,2.037318e+17,1.033356e+17,...,5.27951e+17,614349600.0,486411700.0,17877351.0,14642896.0,5.446408e+17,1.252922e+17,1.252922e+17,266400754.0,152378300.0
std,5.194239e+17,4.995458e+17,4.390404e+17,4.219745e+17,4.292202e+17,4.303091e+17,4.549073e+17,3.373457e+17,4.334782e+17,2.903765e+17,...,5.46975e+17,1325307000.0,658571500.0,,,5.460976e+17,3.497962e+17,3.497962e+17,0.0,116984500.0
min,11481.0,6136.0,767285.0,11481.0,6136.0,5279.0,741543.0,13348.0,62963.0,62963.0,...,9003112.0,13613290.0,20731300.0,17877351.0,14642896.0,33315780.0,21689900.0,21689900.0,266400754.0,38355920.0
25%,284068600.0,149547300.0,266400800.0,33884540.0,33884540.0,92819100.0,91781130.0,118614000.0,52078220.0,115251800.0,...,46393090.0,28542420.0,253571500.0,17877351.0,14642896.0,221823900.0,38355920.0,38355920.0,266400754.0,38355920.0
50%,3178720000.0,1669807000.0,266400800.0,266400800.0,266400800.0,266400800.0,266400800.0,319424500.0,338335300.0,266400800.0,...,4.081682e+17,266400800.0,486411700.0,17877351.0,14642896.0,7.710075e+17,38355920.0,38355920.0,266400754.0,152378300.0
75%,1.000303e+18,9.192716e+17,2922268000.0,4317986000.0,4760788000.0,4193540000.0,4338453000.0,1859962000.0,2758682000.0,905996500.0,...,1.139636e+18,266400800.0,719251900.0,17877351.0,14642896.0,1.155088e+18,241261300.0,241261300.0,266400754.0,266400800.0
max,1.267868e+18,1.267796e+18,1.258285e+18,1.266856e+18,1.266856e+18,1.266584e+18,1.25635e+18,1.247259e+18,1.206979e+18,1.232407e+18,...,1.17682e+18,4134255000.0,952092000.0,17877351.0,14642896.0,1.254139e+18,1.147923e+18,1.147923e+18,266400754.0,266400800.0


In [35]:
grafo = user_df_to_graph(relaciones_pd)

In [36]:
grafo

Unnamed: 0,src,dest
0,2933209888,33884545
1,1119982272206315520,1139636315031404544
2,3640559832,266400754
3,24969935,101584084
4,1142058348,1607992999
...,...,...
14372,20531864,46096376
14373,4710601,13475312
14374,2616208171,70266297
14375,2444040222,2372055768


In [37]:
grafo.describe()

Unnamed: 0,src,dest
count,14377.0,14377.0
mean,3.675073e+17,1.95944e+17
std,5.028204e+17,4.058739e+17
min,11481.0,338.0
25%,184211400.0,33884540.0
50%,1668358000.0,266400800.0
75%,9.318553e+17,2876785000.0
max,1.267846e+18,1.267796e+18


In [39]:
#guardo el grafo en formato csv
grafo.to_csv('grafo_arduino.csv', sep=' ', index=False, header=False)

In [42]:
statuses_json = read_json_pages('*.json') #lista con los statuses

In [46]:
len(statuses_json)

18000

In [49]:
tweets_json_pd = statuses_to_pandas_json(statuses_json)

In [50]:
tweets_json_pd.describe()

Unnamed: 0,id,display_text_range_0,display_text_range_1,entities_hashtags_0_indices_0,entities_hashtags_0_indices_1,entities_hashtags_1_indices_0,entities_hashtags_1_indices_1,entities_hashtags_2_indices_0,entities_hashtags_2_indices_1,entities_hashtags_3_indices_0,...,retweeted_status_entities_media_1_sizes_thumb_w,retweeted_status_entities_media_1_sizes_thumb_h,retweeted_status_entities_media_1_sizes_large_w,retweeted_status_entities_media_1_sizes_large_h,retweeted_status_entities_media_1_sizes_medium_w,retweeted_status_entities_media_1_sizes_medium_h,retweeted_status_entities_media_1_sizes_small_w,retweeted_status_entities_media_1_sizes_small_h,retweeted_status_entities_media_1_source_status_id,retweeted_status_entities_media_1_source_user_id
count,18000.0,18000.0,18000.0,1679.0,1679.0,569.0,569.0,253.0,253.0,133.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,1.268132e+18,3.131667,132.400444,68.645622,80.519952,95.170475,106.316344,112.162055,123.525692,127.421053,...,150.0,150.0,1700.0,1133.0,1200.0,800.0,680.0,453.0,1.268041e+18,7.476661e+17
std,568562500000.0,17.561784,47.617262,58.242456,58.821608,58.114543,58.931135,59.024489,59.840432,59.760534,...,,,,,,,,,,
min,1.268131e+18,0.0,6.0,0.0,3.0,5.0,11.0,13.0,25.0,26.0,...,150.0,150.0,1700.0,1133.0,1200.0,800.0,680.0,453.0,1.268041e+18,7.476661e+17
25%,1.268131e+18,0.0,112.0,21.0,35.0,57.0,67.0,78.0,89.0,88.0,...,150.0,150.0,1700.0,1133.0,1200.0,800.0,680.0,453.0,1.268041e+18,7.476661e+17
50%,1.268132e+18,0.0,140.0,61.0,74.0,82.0,95.0,98.0,109.0,110.0,...,150.0,150.0,1700.0,1133.0,1200.0,800.0,680.0,453.0,1.268041e+18,7.476661e+17
75%,1.268132e+18,0.0,140.0,93.0,105.0,118.0,130.0,128.0,135.0,169.0,...,150.0,150.0,1700.0,1133.0,1200.0,800.0,680.0,453.0,1.268041e+18,7.476661e+17
max,1.268133e+18,666.0,918.0,905.0,910.0,313.0,321.0,322.0,331.0,292.0,...,150.0,150.0,1700.0,1133.0,1200.0,800.0,680.0,453.0,1.268041e+18,7.476661e+17


In [51]:
relaciones_json_pd=keep_only_user_id_fields(tweets_json_pd)
relaciones_json_pd.describe()

Unnamed: 0,user_id,in_reply_to_user_id,quoted_status_user_id,retweeted_status_user_id,entities_user_mentions_0_id,entities_user_mentions_1_id,entities_user_mentions_2_id,entities_user_mentions_3_id,entities_user_mentions_4_id,entities_user_mentions_5_id,...,retweeted_status_quoted_status_entities_user_mentions_3_id,retweeted_status_quoted_status_entities_user_mentions_4_id,retweeted_status_quoted_status_entities_user_mentions_5_id,retweeted_status_quoted_status_entities_user_mentions_6_id,extended_entities_media_0_additional_media_info_source_user_id,retweeted_status_quoted_status_entities_media_0_source_user_id,retweeted_status_quoted_status_extended_entities_media_0_source_user_id,quoted_status_extended_entities_media_0_additional_media_info_source_user_id,retweeted_status_extended_entities_media_0_additional_media_info_source_user_id,retweeted_status_quoted_status_extended_entities_media_0_additional_media_info_source_user_id
count,18000.0,2057.0,323.0,14208.0,16430.0,2414.0,981.0,421.0,207.0,125.0,...,2.0,1.0,1.0,1.0,296.0,124.0,124.0,5.0,46.0,5.0
mean,4.674965e+17,2.983749e+17,2.508349e+17,2.888775e+17,2.858426e+17,2.361715e+17,2.093107e+17,2.463386e+17,2.837787e+17,3.667609e+17,...,53107880.0,264739059.0,200476117.0,284597363.0,5.467286e+17,1.507859e+17,1.507859e+17,4.471187e+17,2.866723e+17,2.174761e+17
std,5.267312e+17,4.786269e+17,4.423722e+17,4.687897e+17,4.678406e+17,4.35924e+17,4.082627e+17,4.417055e+17,4.569685e+17,5.043784e+17,...,4016619.0,,,,5.464505e+17,3.400483e+17,3.400483e+17,6.126201e+17,4.714427e+17,4.862913e+17
min,38623.0,12.0,807095.0,1688.0,12.0,1688.0,12.0,759251.0,759251.0,939091.0,...,50267700.0,264739059.0,200476117.0,284597363.0,1917731.0,16106580.0,16106580.0,17376890.0,759251.0,25073880.0
25%,406500500.0,25073880.0,25073880.0,73949000.0,57009690.0,25073880.0,25073880.0,25073880.0,25073880.0,47691930.0,...,51687790.0,264739059.0,200476117.0,284597363.0,1947164000.0,195288500.0,195288500.0,28241600.0,747154700.0,26059160.0
50%,3234065000.0,419849900.0,256881600.0,970207300.0,857001700.0,107183300.0,217543200.0,217543200.0,432895300.0,499074000.0,...,53107880.0,264739059.0,200476117.0,284597363.0,8.063763e+17,1915549000.0,1915549000.0,1334569000.0,2800581000.0,26059160.0
75%,1.021363e+18,8.061771e+17,3.653394e+17,8.000867e+17,7.863099e+17,3386462000.0,2858648000.0,4807056000.0,8.025209e+17,8.222157e+17,...,54527960.0,264739059.0,200476117.0,284597363.0,1.08738e+18,2923527000.0,2923527000.0,1.08738e+18,7.138393e+17,1334569000.0
max,1.26813e+18,1.268129e+18,1.267976e+18,1.268029e+18,1.268129e+18,1.268093e+18,1.267623e+18,1.26658e+18,1.267573e+18,1.267573e+18,...,55948050.0,264739059.0,200476117.0,284597363.0,1.265762e+18,1.249247e+18,1.249247e+18,1.148213e+18,1.24356e+18,1.08738e+18


In [52]:
grafo_json = user_df_to_graph(relaciones_json_pd)
grafo_json.describe()
grafo_json.to_csv('grafo_json_trump.csv', sep=' ', index=False, header=False)

In [53]:
len(grafo_json)

23706

In [54]:
len(grafo)

14377

In [55]:
grafo

Unnamed: 0,src,dest
0,2933209888,33884545
1,1119982272206315520,1139636315031404544
2,3640559832,266400754
3,24969935,101584084
4,1142058348,1607992999
...,...,...
14372,20531864,46096376
14373,4710601,13475312
14374,2616208171,70266297
14375,2444040222,2372055768


In [56]:
grafo_json

Unnamed: 0,src,dest
0,2970569441,326255267
1,793965581385011200,15429807
2,1221366000752324608,37034483
3,1020987296,591505507
4,1734300049,25073877
...,...,...
23701,17391764,254117355
23702,1958521082,17980523
23703,1237091453534429184,1167415560
23704,2266797787,496120047


In [57]:
#Parte 2 Analizar la red

In [58]:
#Limpieza de datos

In [59]:
#Análisis de la red

In [60]:
#Análisis de los nodos

In [61]:
#Detección de comunidades