In [None]:
# Code is partly borrowed from https://betterprogramming.pub/how-to-scrape-tweets-with-snscrape-90124ed006af

# Imports

In [None]:
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from tqdm.notebook import tqdm

In [None]:
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``<name>.pkl``.

    Args:
        obj: Any picklable object.
        name: Target path without the ``.pkl`` extension (it is appended here).
    """
    # The context manager flushes and closes the file even if pickling fails;
    # previously the handle returned by open() was never closed explicitly.
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, protocol=4)
    
def load_obj(name):
    """Return the object stored in the pickle file ``<name>.pkl``."""
    pkl_path = name + '.pkl'
    with open(pkl_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# Get the tweets

In [None]:
# Search terms; every term is scraped on its own
queries = [
    'vegan',
    'plant based',
    'vegetarian',
    'veggie',
    'veganism',
    'cruelty-free',
    'plant milk',
    'tofu',
]

In [None]:
# Scrape a capped sample of tweets for every (year, month, query) combination.
# Each scraped tweet becomes one row (a list of fields) in tweets_list2.
tweets_list2 = []

years = list(range(2010, 2023))
months = list(range(1, 13))
LAST_PERIOD = (2022, 5)       # last (year, month) that is scraped
MAX_TWEETS_PER_QUERY = 100    # cap per query and month

for y in years:
    for m in months:
        # Stop before any progress bar or request is created for months past the cut-off
        if (y, m) > LAST_PERIOD:
            break
        # The `until:` search operator is exclusive, so the window ends on the first
        # day of the following month. This includes the last day of the month
        # (and 29 February in leap years), which a fixed days-per-month table misses.
        next_y, next_m = (y + 1, 1) if m == 12 else (y, m + 1)
        period = f' since:{y}-{m:02d}-01 until:{next_y}-{next_m:02d}-01'
        for q in tqdm(queries, desc=f'{y}-{m:02d}'):
            # Using TwitterSearchScraper to scrape data and append tweets to list
            scraper = sntwitter.TwitterSearchScraper(q + period)
            for i, tweet in enumerate(scraper.get_items()):
                if i >= MAX_TWEETS_PER_QUERY:
                    break
                tweets_list2.append([tweet.date,
                                     tweet.id,
                                     tweet.content,
                                     tweet.user.username,
                                     q,
                                     tweet.replyCount,
                                     tweet.retweetCount,
                                     tweet.likeCount,
                                     tweet.quoteCount,
                                     tweet.lang,
                                     tweet.source,
                                     tweet.retweetedTweet,
                                     tweet.quotedTweet,
                                     tweet.mentionedUsers])

len(tweets_list2)

In [None]:
# Number of tweet rows collected by the scraping loop
len(tweets_list2)

In [None]:
# Column labels, in the same order as the values appended for every scraped tweet
tweet_columns = [
    'Datetime',
    'Tweet Id',
    'Text',
    'Username',
    'Query',
    'replyCount',
    'retweetCount',
    'likeCount',
    'quoteCount',
    'lang',
    'source',
    'retweetedTweet',
    'quotedTweet',
    'mentionedUsers',
]

# Build the tweet table from the collected rows
tweets_df2 = pd.DataFrame(tweets_list2, columns=tweet_columns)



In [None]:
# Export the tweet table as a semicolon-separated, UTF-8 encoded CSV without the index.
# `sep` is passed by keyword: pandas 2.x deprecates positional arguments after the path.
tweets_df2.to_csv('vegan_tweets2010-2022_monthly.csv', sep=';', index=False, encoding='utf8')

In [None]:
# (rows, columns) of the tweet table
tweets_df2.shape

In [None]:
# Preview the first rows of the tweet table
tweets_df2.head()

In [None]:
# Persist the DataFrame as a pickle; save_obj appends the '.pkl' extension to the name
save_obj(tweets_df2, 'vegan_tweets2010-2022_monthly')

In [None]:
# Reload the pickled DataFrame from 'vegan_tweets2010-2022_monthly.pkl'.
# NOTE(review): unpickling executes code from the file - only load pickles you created yourself.
tweets_df2 = load_obj('vegan_tweets2010-2022_monthly')

# Explore dataset

## Language

In [None]:
# Group the tweets by language code; count() gives the number of non-null values per column
tweet_lang_ds = tweets_df2.groupby('lang').count()
tweet_lang_ds

In [None]:
# Languages with at most 1000 tweets are merged into one 'other' bucket
new_lang_lbls = [
    lang if n_tweets > 1000 else 'other'
    for lang, n_tweets in tweet_lang_ds.Text.items()
]
tweet_lang_ds = (
    tweet_lang_ds
    .assign(new_lang_lbls=new_lang_lbls)
    .groupby('new_lang_lbls')
    .sum()
)

In [None]:
# Show the per-language counts as they currently stand
tweet_lang_ds

In [None]:
# Pie chart of the tweet counts per language group, with percentages and a legend
fig, ax = plt.subplots(figsize=(10, 10), dpi=80)
ax.pie(
    tweet_lang_ds.Text,
    labels=tweet_lang_ds.index,
    rotatelabels=True,
    autopct='%.0f%%',
    pctdistance=1.1,
    labeldistance=None,
)
ax.legend()

## Interaction with tweets

In [None]:
# Histograms of the four interaction counters, one subplot per column
tweets_df2[['replyCount', 'retweetCount', 'likeCount', 'quoteCount']].hist()

In [None]:
# Summary statistics with extra upper percentiles (70% to 99%), rounded to 2 decimals
tweets_df2.describe(percentiles=[0.7, 0.8, 0.9, 0.95, 0.97, 0.99]).round(2)

## Tweets by query

In [None]:
# (rows, columns) of the subset of tweets that received at least one like
tweets_df2[tweets_df2.likeCount > 0].shape

In [None]:
# Tweets with at least one like, counted per query and sorted ascending by count
tw_by_query = (
    tweets_df2[tweets_df2.likeCount > 0]
    .groupby('Query')
    .count()
    .sort_values(by=['Text'])
)
tw_by_query

In [None]:

# Horizontal bar chart of the per-query tweet counts
fig, ax = plt.subplots(figsize=(10, 10))
plt.rcParams['font.size'] = 22

bars = ax.barh(y=tw_by_query.index, width=tw_by_query.Text)

# Write each bar's value, with thousands separators, in white at the bar centre
count_labels = [f'{value:,.0f}' for value in bars.datavalues]
ax.bar_label(bars, labels=count_labels, label_type='center', color='white')

## Tweets by date

In [None]:
# Split every timestamp into its calendar year and month
tweets_df2['Year'] = tweets_df2.Datetime.map(lambda stamp: stamp.year)
tweets_df2['Month'] = tweets_df2.Datetime.map(lambda stamp: stamp.month)

In [None]:
# Preview the first rows of the table again
tweets_df2.head()

In [None]:
# Tweets with at least one like, counted per year
veg_post_yearly = (
    tweets_df2.loc[tweets_df2.likeCount > 0]
    .groupby('Year')[['Text']]
    .count()
)
veg_post_yearly

In [None]:
# Line plot of the yearly tweet counts
plt.rcParams['font.size'] = 12
plt.plot(veg_post_yearly.index, veg_post_yearly['Text'].to_numpy())
plt.ylabel('Number of tweets')
plt.xlabel('Years')

In [None]:
# Tweets with at least one like, counted per calendar month (all years pooled)
veg_post_monthly = (
    tweets_df2.loc[tweets_df2.likeCount > 0]
    .groupby('Month')[['Text']]
    .count()
)
veg_post_monthly

In [None]:
# Line plot of the tweet counts per calendar month
plt.plot(veg_post_monthly.index, veg_post_monthly['Text'].to_numpy())
plt.ylabel('Number of tweets')
plt.xlabel('Month of a year')

In [None]:
# Tweets with at least one like, counted per (year, month) pair
veg_post_yearly_monthly = (
    tweets_df2.loc[tweets_df2.likeCount > 0]
    .groupby(['Year', 'Month'])[['Text']]
    .count()
)
veg_post_yearly_monthly

In [None]:
# Inspect the index produced by grouping on ['Year', 'Month']
veg_post_yearly_monthly.index

In [None]:


# Tweet counts per (year, month) drawn as one continuous series.
figure(figsize=(20, 10), dpi=80)

idx = [str(i) for i in veg_post_yearly_monthly.index]
# Year of every (Year, Month) point, read from the first level of the index
years = veg_post_yearly_monthly.index.get_level_values(0).to_numpy()

# Plot against integer positions so that tick locations can be chosen explicitly
positions = list(range(len(idx)))
plt.plot(positions, veg_post_yearly_monthly.Text.values)

# plt.plot() does not accept a `labels` keyword (it raises on unknown Line2D
# properties), so the years are used as x tick labels instead: one tick at the
# first data point of every year.
year_starts = [p for p in positions if p == 0 or years[p] != years[p - 1]]
plt.xticks(year_starts, [str(years[p]) for p in year_starts], rotation=90)
plt.ylabel('Number of tweets')
plt.xlabel('Year')

In [None]:
# Year component (first column) of every (Year, Month) pair in the index; needs numpy as `np`
np.array([[y,m] for y, m in veg_post_yearly_monthly.index])[:,0]