In [1]:
import numpy as np
import pandas as pd

import tweepy
from tweepy import Cursor
from keys import api_key, api_secret_key

from transformers import pipeline

import json

In [2]:
# Pin the checkpoint explicitly instead of relying on the task's implicit
# default, so labels and scores stay reproducible across transformers releases.
# NOTE(review): this is the checkpoint the 'sentiment-analysis' task is
# documented to default to — confirm it matches the model used for the
# outputs recorded in this notebook.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

In [3]:
# Smoke-test the pipeline on a single sentence. The cells below read the first
# element of the result and its 'label' and 'score' entries.
clf = classifier('We are very happy to include pipeline into the transformers repository.')

In [4]:
clf[0]['label']

'POSITIVE'

In [5]:
clf[0]['score']

0.9978193640708923

In [6]:
# Build the OAuth handler from the consumer key/secret imported from the local
# `keys` module, so the credentials are not written into this notebook.
auth = tweepy.OAuthHandler(api_key, api_secret_key)

In [7]:
# wait_on_rate_limit makes tweepy sleep until the rate-limit window resets
# instead of raising part-way through the paginated search below, which would
# otherwise discard everything collected so far.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [9]:
# Search parameters, pulled out so the query and sample size are easy to tune.
SEARCH_QUERY = "innocent smoothies"
MAX_TWEETS = 1000
TWEET_COLUMNS = ['id', 'created_at', 'full_text', 'retweet_count']

# tweet_mode="extended" exposes the tweet text as `full_text`. count=100 asks
# for the largest page the standard search endpoint allows, so the same tweets
# are fetched with fewer requests.
# NOTE(review): tweepy's extended-tweet docs say a retweet's `full_text` can
# still be truncated (the complete text lives on `retweeted_status`) — confirm
# whether that matters for the sentiment scores.
tweet_cursor = Cursor(api.search, q=SEARCH_QUERY, tweet_mode="extended", count=100)

# One record per tweet, keeping only the attributes used later on.
df_list = [
    {column: getattr(tweet, column) for column in TWEET_COLUMNS}
    for tweet in tweet_cursor.items(MAX_TWEETS)
]

# Passing the column list keeps the expected schema even when the search
# returns nothing, so later cells do not fail with a KeyError on 'full_text'.
df = pd.DataFrame(df_list, columns=TWEET_COLUMNS)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_api', '_json', 'author', 'contributors', 'coordinates', 'created_at', 'destroy', 'display_text_range', 'entities', 'favorite', 'favorite_count', 'favorited', 'full_text', 'geo', 'id', 'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'metadata', 'parse', 'parse_list', 'place', 'retweet', 'retweet_count', 'retweeted', 'retweets', 'source', 'source_url', 'truncated', 'user']


In [125]:
df

Unnamed: 0,id,created_at,full_text,retweet_count
0,1347527991790854145,2021-01-08 12:58:03,@buymytatpls We've never promoted anti-vaxxer ...,0
1,1347514202936791042,2021-01-08 12:03:16,@buymytatpls We've never promoted anti-vaxxer ...,0
2,1347493892254535681,2021-01-08 10:42:33,@innocent You have enhanced my Friday morning ...,0
3,1347465899197001728,2021-01-08 08:51:19,not to start a smoothie war but recently I've ...,0
4,1347463979430522881,2021-01-08 08:43:42,my life would be a lot easier if I had some wa...,0
5,1347143661839130625,2021-01-07 11:30:52,omg remember those innocent smoothies when the...,0
6,1346966017373777920,2021-01-06 23:44:58,@MineCartMayhem @innocent Do you expect them t...,0
7,1346928706418339841,2021-01-06 21:16:42,RT @MattChorley: EXCLUSIVE: Boris Johnson does...,19
8,1346881943107989509,2021-01-06 18:10:53,"RT @NHSuk: @innocent We stand with Louisa, and...",1
9,1346881686689230854,2021-01-06 18:09:52,@innocent Tell Louisa I'm on it. Both smoothie...,0


In [126]:
def classify(text):
    """Run the sentiment pipeline on ``text`` and return ``(label, score)``.

    Only the first entry of the pipeline's result is used.
    """
    top_prediction = classifier(text)[0]
    return (top_prediction['label'], top_prediction['score'])

In [127]:
# Classify every tweet once and split the (label, score) pairs into columns.
# Going through an intermediate frame (rather than unpacking zip(*...)) also
# works when `df` is empty, where the two-target unpacking raises ValueError.
sentiment = pd.DataFrame(
    df['full_text'].map(classify).tolist(),
    columns=['label', 'score'],
    index=df.index,  # keep rows aligned with df regardless of its index
)
df['label'] = sentiment['label']
df['score'] = sentiment['score']

In [128]:
# Calendar date of each tweet, used as the grouping key for the daily charts.
# The vectorised .dt accessor replaces a per-row Python lambda and yields the
# same datetime.date objects.
df['date'] = df['created_at'].dt.date

In [129]:
df

Unnamed: 0,id,created_at,full_text,retweet_count,label,score,date
0,1347527991790854145,2021-01-08 12:58:03,@buymytatpls We've never promoted anti-vaxxer ...,0,POSITIVE,0.986515,2021-01-08
1,1347514202936791042,2021-01-08 12:03:16,@buymytatpls We've never promoted anti-vaxxer ...,0,POSITIVE,0.984648,2021-01-08
2,1347493892254535681,2021-01-08 10:42:33,@innocent You have enhanced my Friday morning ...,0,NEGATIVE,0.996681,2021-01-08
3,1347465899197001728,2021-01-08 08:51:19,not to start a smoothie war but recently I've ...,0,POSITIVE,0.983925,2021-01-08
4,1347463979430522881,2021-01-08 08:43:42,my life would be a lot easier if I had some wa...,0,NEGATIVE,0.998334,2021-01-08
5,1347143661839130625,2021-01-07 11:30:52,omg remember those innocent smoothies when the...,0,POSITIVE,0.964045,2021-01-07
6,1346966017373777920,2021-01-06 23:44:58,@MineCartMayhem @innocent Do you expect them t...,0,NEGATIVE,0.99843,2021-01-06
7,1346928706418339841,2021-01-06 21:16:42,RT @MattChorley: EXCLUSIVE: Boris Johnson does...,19,NEGATIVE,0.988612,2021-01-06
8,1346881943107989509,2021-01-06 18:10:53,"RT @NHSuk: @innocent We stand with Louisa, and...",1,POSITIVE,0.989658,2021-01-06
9,1346881686689230854,2021-01-06 18:09:52,@innocent Tell Louisa I'm on it. Both smoothie...,0,POSITIVE,0.71272,2021-01-06


In [250]:
total = len(df)

# value_counts() orders its result by frequency, so indexing it with 0/1 picks
# "most common"/"second most common" rather than a specific label, and fails
# when only one label is present. Look the counts up by label instead, and
# default to 0 when a label never occurs.
label_counts = df['label'].value_counts()
total_pos = int(label_counts.get('POSITIVE', 0))
total_neg = int(label_counts.get('NEGATIVE', 0))

# Only POSITIVE/NEGATIVE labels appear in this notebook's results; the neutral
# bucket is kept at zero so the payload shape stays stable.
total_neu = 0

In [131]:
# Index labels of the most confident positive/negative tweets and of the most
# retweeted tweet. idxmax() raises ValueError if a label has no rows at all.
idx_max_pos = df.loc[df['label'] == 'POSITIVE', 'score'].idxmax()
idx_max_neg = df.loc[df['label'] == 'NEGATIVE', 'score'].idxmax()
idx_max_retweets = df['retweet_count'].idxmax()

# idxmax() returns an index *label*, so the row must be fetched with
# label-based .loc. Positional .iloc only agrees while df still carries its
# default 0..n-1 index and silently picks the wrong tweet after any filtering
# or re-sorting. Selecting the single cell also avoids converting the whole
# row to a common dtype just to read one value.
id_max_pos = df.loc[idx_max_pos, 'id']
id_max_neg = df.loc[idx_max_neg, 'id']
id_max_retweets = df.loc[idx_max_retweets, 'id']

In [244]:
# Group once and derive both chart series from the same result, so the date
# labels and the counts cannot drift apart.
daily_counts = df.groupby('date')['id'].count()

# Day/month/year labels for the chart axis, in ascending date order.
dates = [day.strftime("%d/%m/%Y") for day in daily_counts.index]

# tolist() yields plain Python ints, which serialise to JSON without help.
tweets_per_day = daily_counts.tolist()

In [245]:
dates

['01/01/2021',
 '02/01/2021',
 '03/01/2021',
 '04/01/2021',
 '05/01/2021',
 '06/01/2021',
 '07/01/2021',
 '08/01/2021']

In [269]:
tweets_per_day

[1, 8, 2, 16, 7, 16, 1, 5]

In [227]:
# Tally rows per (date, label) pair. count() reports the number of non-null
# values in every remaining column, and reset_index() turns the two grouping
# keys back into ordinary columns.
line_df = df.groupby(['date', 'label']).count().reset_index()

In [228]:
line_df

Unnamed: 0,date,label,id,created_at,full_text,retweet_count,score
0,2021-01-01,POSITIVE,1,1,1,1,1
1,2021-01-02,NEGATIVE,2,2,2,2,2
2,2021-01-02,POSITIVE,6,6,6,6,6
3,2021-01-03,NEGATIVE,1,1,1,1,1
4,2021-01-03,POSITIVE,1,1,1,1,1
5,2021-01-04,NEGATIVE,11,11,11,11,11
6,2021-01-04,POSITIVE,5,5,5,5,5
7,2021-01-05,NEGATIVE,5,5,5,5,5
8,2021-01-05,POSITIVE,2,2,2,2,2
9,2021-01-06,NEGATIVE,11,11,11,11,11


In [229]:
# Reshape to one row per date with one column per label, using the `id` count
# as the cell value. A date with no tweets for a label ends up as NaN, not 0.
line_df = line_df.pivot(index='date', columns='label', values='id').reset_index()

In [230]:
line_df

label,date,NEGATIVE,POSITIVE
0,2021-01-01,,1.0
1,2021-01-02,2.0,6.0
2,2021-01-03,1.0,1.0
3,2021-01-04,11.0,5.0
4,2021-01-05,5.0,2.0
5,2021-01-06,11.0,5.0
6,2021-01-07,,1.0
7,2021-01-08,2.0,3.0


In [236]:
def f(row):
    """Return the share of positive tweets in ``row`` as a whole-number percent.

    Parameters
    ----------
    row : mapping
        Must provide ``'POSITIVE'`` and ``'NEGATIVE'`` counts. A NaN count
        means no tweets carried that label and is treated as zero.

    Returns
    -------
    int
        ``100 * positive / (positive + negative)`` rounded to the nearest
        integer, or ``0`` when there are no tweets at all for the row.
    """
    positive = row['POSITIVE']
    negative = row['NEGATIVE']

    # A missing count is "zero tweets with that label". Plain conditionals are
    # used instead of the bitwise ``~`` operator, which only behaves like a
    # logical "not" on numpy booleans.
    positive = 0 if np.isnan(positive) else positive
    negative = 0 if np.isnan(negative) else negative

    total = positive + negative
    if total == 0:
        # Nothing to divide by; previously this reached round(nan) and raised.
        return 0
    return round(100 * (positive / total))

In [237]:
line_df

label,date,NEGATIVE,POSITIVE,percent_pos
0,2021-01-01,,1.0,100.0
1,2021-01-02,2.0,6.0,75.0
2,2021-01-03,1.0,1.0,50.0
3,2021-01-04,11.0,5.0,31.25
4,2021-01-05,5.0,2.0,28.571429
5,2021-01-06,11.0,5.0,31.25
6,2021-01-07,,1.0,100.0
7,2021-01-08,2.0,3.0,60.0


In [238]:
# Apply f to every row (axis=1) and store the rounded positive percentage as
# a new column alongside the per-label counts.
line_df['percent_pos'] = line_df.apply(f, axis=1)

In [239]:
line_df

label,date,NEGATIVE,POSITIVE,percent_pos
0,2021-01-01,,1.0,100
1,2021-01-02,2.0,6.0,75
2,2021-01-03,1.0,1.0,50
3,2021-01-04,11.0,5.0,31
4,2021-01-05,5.0,2.0,29
5,2021-01-06,11.0,5.0,31
6,2021-01-07,,1.0,100
7,2021-01-08,2.0,3.0,60


In [252]:
# Reuse the percentages already stored on line_df instead of running f over
# every row a second time; tolist() also yields plain Python numbers.
pos_tweets_per_day = line_df['percent_pos'].tolist()

In [278]:
# Summary payload that is serialised to JSON further down.
# NOTE(review): `total` is sent as a string while the other totals stay
# numeric — confirm the consumer expects that.
# NOTE(review): the tweet ids are larger than 2**53; if a JavaScript client
# parses this JSON they will lose precision — consider sending them as strings.
dict_data = {
    # Headline counters.
    'totals': {
        'total': str(total),
        'total_pos': total_pos,
        'total_neg': total_neg,
        'total_neu': total_neu,
    },
    # Percentage of positive tweets per day.
    'line': {
        'dates': dates,
        'pos_tweets_per_day': pos_tweets_per_day,
    },
    # Number of tweets per day.
    'bar': {
        'dates': dates,
        'tweets_per_day': tweets_per_day,
    },
    # Ids of the highlighted tweets.
    'tweets': {
        'id_max_pos': id_max_pos,
        'id_max_neg': id_max_neg,
        'id_max_retweets': id_max_retweets,
    },
}

In [279]:
dict_data

{'totals': {'total': '56', 'total_pos': 24, 'total_neg': 32, 'total_neu': 0},
 'line': {'dates': ['01/01/2021',
   '02/01/2021',
   '03/01/2021',
   '04/01/2021',
   '05/01/2021',
   '06/01/2021',
   '07/01/2021',
   '08/01/2021'],
  'pos_tweets_per_day': [100, 75, 50, 31, 29, 31, 100, 60]},
 'bar': {'dates': ['01/01/2021',
   '02/01/2021',
   '03/01/2021',
   '04/01/2021',
   '05/01/2021',
   '06/01/2021',
   '07/01/2021',
   '08/01/2021'],
  'tweets_per_day': [1, 8, 2, 16, 7, 16, 1, 5]},
 'tweets': {'id_max_pos': 1345033363732705282,
  'id_max_neg': 1346200179536506880,
  'id_max_retweets': 1346928706418339841}}

In [280]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to native types.

    Pass it as ``cls=NpEncoder`` to ``json.dumps``. Objects it does not
    recognise are delegated to ``json.JSONEncoder.default``, which raises
    ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``."""
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            # numpy booleans are not np.integer instances and would otherwise
            # fall through to the TypeError below.
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [281]:
# Serialise the payload to a JSON string; NpEncoder converts any numpy scalars
# or arrays inside dict_data that the default encoder cannot handle.
json_data = json.dumps(dict_data, cls=NpEncoder)

In [282]:
json_data

'{"totals": {"total": "56", "total_pos": 24, "total_neg": 32, "total_neu": 0}, "line": {"dates": ["01/01/2021", "02/01/2021", "03/01/2021", "04/01/2021", "05/01/2021", "06/01/2021", "07/01/2021", "08/01/2021"], "pos_tweets_per_day": [100, 75, 50, 31, 29, 31, 100, 60]}, "bar": {"dates": ["01/01/2021", "02/01/2021", "03/01/2021", "04/01/2021", "05/01/2021", "06/01/2021", "07/01/2021", "08/01/2021"], "tweets_per_day": [1, 8, 2, 16, 7, 16, 1, 5]}, "tweets": {"id_max_pos": 1345033363732705282, "id_max_neg": 1346200179536506880, "id_max_retweets": 1346928706418339841}}'