## 1 Importando Paquetes

In [1]:
import os
import tweepy as tw
import pandas as pd
from tqdm import tqdm, notebook

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

## 2 Autenticacion Twitter API

In [3]:
consumer_api_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
consumer_api_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

In [4]:
auth = tw.OAuthHandler(consumer_api_key, consumer_api_secret)

In [5]:
api = tw.API(auth, wait_on_rate_limit=True)

## 3 Consulta en los Tweets

### 3.1 Definiendo la consulta

In [6]:
search_words = "#covid19 -filter:retweets"
date_since = "2020-03-01"

In [11]:
# Colectando tweets
tweets = tw.Cursor(api.search,
                  q = search_words,
                  lang = "en",
                  since = date_since).items(1000)

### 3.2 Recuperando los tweets

In [12]:
tweets_copy = []
for tweet in tqdm(tweets):
    tweets_copy.append(tweet)

1000it [01:30, 11.07it/s]


In [13]:
print(f"nuevos tweets recuperados: {len(tweets_copy)}")

nuevos tweets recuperados: 1000


## 4. Complete el conjunto de datos

In [14]:
tweets_df = pd.DataFrame()
for tweet in tqdm(tweets_copy):
    hashtags = []
    try:
        for hashtag in tweet.entities["hashtags"]:
            hashtags.append(hashtag["text"])
        text = api.get_status(id=tweet.id, tweet_mode='extended').full_text
    except:
        pass
    tweets_df = tweets_df.append(pd.DataFrame({'user_name': tweet.user.name,
                                               'user_location': tweet.user.location,\
                                               'user_description': tweet.user.description,
                                               'user_created': tweet.user.created_at,
                                               'user_followers': tweet.user.followers_count,
                                               'user_friends': tweet.user.friends_count,
                                               'user_favourites': tweet.user.favourites_count,
                                               'user_verified': tweet.user.verified,
                                               'date': tweet.created_at,
                                               'text': text, 
                                               'hashtags': [hashtags if hashtags else None],
                                               'source': tweet.source,
                                               'is_retweet': tweet.retweeted}, index=[0]))

100%|██████████| 1000/1000 [16:38<00:00,  1.00it/s] 


In [15]:
tweets_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,Iranian Community Romania,,,2020-06-10 17:13:38,573,1138,18205,False,2021-03-29 10:27:17,"#Iran-#Covid19 ""#vaccination""a weapon to eradicate rebellion generation.https://t.co/gE6T414G6z\n.@AlHadath.@handelsblatt.@I24News.@nytimes.@Telegraph.@BBCWorld.@ntvde.@FoxNews.@thehill.@el_pais.@Corriere.@TheGardianNews.@voz_populi.@TheSun.@Politico.@expressen.@SZ.@IrishTime","[Iran, Covid19, vaccination]",Twitter Web App,False
0,Annalisa Klebers,"New York, USA","Anchor @News12NJ, @News12CT, @AlticeUSA Email: Annalisa.Klebers@News12.com",2018-10-15 17:29:13,948,453,3626,False,2021-03-29 10:27:05,COVID ALERT: Draft of WHO-China report obtained by the Associated Press says that COVID-19 likely spread from animals to humans and that a lab leak “extremely unlikely.” \n#COVID19 #Coronavirus #WHO \n@News12CT https://t.co/iTmD22GnSo,,Twitter for iPhone,False
0,Radio Misfits,United States,"Great Talk Radio Isn't Dead, It Just Moved To A Better Place...\n\nShows availalbe almost everywhere you find podcasts, just search for 'Radio Misfits'",2014-05-24 15:30:03,23773,19707,16037,False,2021-03-29 10:27:02,#TheGameShowShow EP102: PATIENCE IS A VIRTUE w/ @peoplecallmeJAZ @andersonlawfer @JohnnyMoDigital \n\n#NSFWAF #Chicago #Comedy #TechnicalDifficulties #JohnnyMoDoesTheNewsForYou #Podcast #ChakaKhan #COVID19 #AdultsOnly #MoreProfanity #AndStuff https://t.co/6F4gPjeld0,"[TheGameShowShow, NSFWAF]",SimplyCast,False
0,phoebesaid,"Detroit, MI",Detroit Free Press autos reporter covering people + products + supply chain. Tips: phoward@freepress.com. She/Her. Alum: @mizzou @dmregister @mcclatchy @TimeInc,2012-08-11 06:11:35,3381,772,11754,True,2021-03-29 10:27:01,Exclusive: @UAW holds strong in 2020 among dues-paying members despite #COVID19 and scandals involving 2 past presidents. Strike fund up.\n\nhttps://t.co/FtttB6FtTZ via @freep @freepautos @UAW @Ford @GM @Stellantis @UniforTheUnion @Columbia @Harvard @DefUnionPhila,[COVID19],Twitter Web App,False
0,Shahid Anwar,,,2013-08-19 17:53:54,15,159,394,False,2021-03-29 10:26:53,@GHMCOnline I noticed that in many localities of old city still waiting for authority to remove garbage. During this pandemic it's not your responsibility to Abide by #COVID19 protocols.\n@asadowaisi \n@NR_abuaimal \n@kcrtrs,,Twitter for Android,False


## 5. Guardando los Datos

### 5.1 Leyendo .csv pasado

In [21]:
#Guardar la primera vez
tweets_df.to_csv(r'C:\Users\covid19_tweets.csv',index = False, header=True)

In [24]:
tweets_old_df = pd.read_csv("covid19_tweets.csv")
print(f"past tweets: {tweets_old_df.shape}")

### 5.2 Juntando .csv pasado y presente

In [25]:
tweets_all_df = pd.concat([tweets_old_df, tweets_df], axis=0)
print(f"new tweets: {tweets_df.shape[0]} past tweets: {tweets_old_df.shape[0]} all tweets: {tweets_all_df.shape[0]}")

### 5.3 Eliminando Duplicados

In [None]:
tweets_all_df.drop_duplicates(subset = ["user_name", "date", "text"], inplace=True)
print(f"all tweets: {tweets_all_df.shape}")

### 5.4 Exportando .csv 

In [None]:
tweets_all_df.to_csv("covid19_tweets.csv", index=False)