In [1]:
from utils.utilities import *
from transformers import pipeline

In [2]:
### Parameters:

data_path = './Data/data.csv'

In [49]:

def load_data(path):
    
    '''
    Function to load data into a Pandas DataFrame and clean date 
    '''
    data = pd.read_csv(path)
    data.published_date = pd.to_datetime(data.published_date)
    
    return data

def transformer_sentiment(sentiment_classifier, text):
    '''
    Predict sentiment of any text.
    '''
    return sentiment_classifier(text)[0]['label']

def assign_transformer_sentiment(data, sentiment_classifier):
    '''
    Assign new sentiment label to each tweet in the dataset.
    '''
    ## Use pre-trained transformer 'pipeline' from Huggingface to assign new sentiment to each tweet.
    ## Ref: https://github.com/huggingface/transformers
    
    
    data['transformer_sentiment'] = [transformer_sentiment(sentiment_classifier, x) for x in data.body]
    return data

def keyword_extractor(kw_extractor, text, top = 10):
    
    '''
    Extract the top n key words of a text with Yake library.
    https://github.com/LIAAD/yake
    '''
    keywords = kw_extractor.extract_keywords(text)
    keywords_list = [x[0] for x in keywords[:top]]
    
    return keywords_list

sentiment_classifier = pipeline('sentiment-analysis')
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [4]:
data = load_data(data_path)

In [21]:
data = assign_transformer_sentiment(data, sentiment_classifier)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [50]:
## Extract Keywords:

kw_extractor = yake.KeywordExtractor()
data['keywords'] = [keyword_extractor(kw_extractor, x) for x in data.body]

In [51]:
data

Unnamed: 0,body,clusters_0,sentiment_positive_score,sentiment_negative_score,sentiment_combined_score,sentiment_summary,published_date,post_type,total_engagement,comments,shares,likes,author_gender,followers,transformer_sentiment,keywords
0,During the pandemic government workers have de...,Poor Pay,7.170060,1.118036,1.511767,positive,2020-11-22,Reposts,0,0,0,0,Unknown,45747,NEGATIVE,"[vital public services, pandemic government wo..."
1,I didn’t realize how poor I was until I got a ...,Poor Pay,3.428319,4.349932,-0.584664,negative,2021-05-07,Reposts,0,0,0,0,Female,82152,NEGATIVE,"[good job, n’t realize, realize how poor, midd..."
2,British people have this really dumb complex w...,Cost of Living,1.277756,3.769611,-1.428440,negative,2021-01-02,Reposts,0,0,0,0,Unknown,13581,NEGATIVE,"[Dominic Cummings, British people, pay rise, d..."
3,Tory MPs have voted against a ban on residents...,Poor Pay,3.365988,5.851462,-0.899544,negative,2021-04-27,Reposts,0,0,0,0,Male,2976,NEGATIVE,"[fire safety problems, fix flammable cladding,..."
4,The trouble with Rishi Sunak's upbeat message ...,Wage Growth,5.453477,2.944297,0.269810,neutral,2021-10-28,Reposts,0,0,0,0,Male,32501,NEGATIVE,"[Rishi Sunak upbeat, Sunak upbeat message, bac..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Calderdale's head of public health wants the G...,Low Income Families,6.601709,1.788858,0.959178,positive,2020-11-11,Reposts,0,0,0,0,Female,2594,NEGATIVE,"[Gov to change, change the NHS, Calderdale hea..."
996,Sir Peter Lampl is correct too many kids are ...,Low Income Families,3.638085,1.318853,0.668121,positive,2021-08-09,Reposts,0,0,0,0,Unknown,2291,NEGATIVE,"[Sir Peter Lampl, Peter Lampl, Sir Peter, Lamp..."
997,Johnson n Symonds hoped the cost of refurbishm...,Mental Health,4.196486,2.896350,0.024222,neutral,2021-04-28,Reposts,0,0,0,0,Female,2272,NEGATIVE,"[Johnson n Symonds, Symonds hoped, charitable ..."
998,We should avoid this patter like the plague. T...,House Prices,4.083818,2.638728,0.090162,neutral,2021-03-25,Reposts,0,0,0,0,Unknown,1971,NEGATIVE,"[avoid this patter, plague, pay rise, negated ..."


In [5]:
topics = pd.DataFrame(data.clusters_0.value_counts().reset_index())
topics.columns = ['Topic', 'counts']
topics

Unnamed: 0,Topic,counts
0,Poor Pay,144
1,Low Income Families,123
2,Government Support,104
3,Mental Health,63
4,Cost of Living,60
5,State Pension,59
6,Public Sector Pay,46
7,Social Care,46
8,Poor People,38
9,Rich People,37


In [159]:


ner_model = pipeline("ner", aggregation_strategy = 'simple')

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english)


In [127]:
j = 12
ner_output = ner_model(data.body[j])
print(ner_output)

[{'entity_group': 'ORG', 'score': 0.9521137, 'word': 'Westminster', 'start': 31, 'end': 42}, {'entity_group': 'LOC', 'score': 0.9995466, 'word': 'UK', 'start': 56, 'end': 58}, {'entity_group': 'MISC', 'score': 0.854854, 'word': 'Tory', 'start': 127, 'end': 131}, {'entity_group': 'LOC', 'score': 0.9996809, 'word': 'Scotland', 'start': 220, 'end': 228}]


In [128]:
NER_extractor_from_transformer(ner_output)

([], ['Westminster', 'UK', 'Scotland'], [])

In [135]:
# ner_all = [NER_extractor_from_transformer(ner_model(x)) for x in data.body]

In [193]:
def NER_extractor_from_transformer(ner_dictionary):
    ''' Clean ner_dictioonary_output'''
    people = []
    locations = []
    organizations = []
    
    if len(ner_dictionary) ==0:
        return (people, locations, organizations)
    
    else: 
        try:
            for token in range(len(ner_dictionary)):
                if ner_dictionary[token]['entity_group'] == 'PER': 
                    people.append(ner_dictionary[token]['word'])

                elif ner_dictionary[token]['entity_group'] == 'LOC':
                    locations.append(ner_dictionary[token]['word'])

                elif ner_dictionary[token]['entity_group'] == 'ORG':
                    organizations.append(ner_dictionary[token]['word'])
                else:
                    pass
        except: 
            return (people, locations, organizations)
                
    return (people, locations, organizations)

def get_engagement_stats(data):
    
    ''' Function to extract stats of engagement data'''
    
    temp_data = data[['total_engagement', 'comments', 'shares', 'likes']].describe().loc[['mean','max']]
    return temp_data

In [None]:
@st.cache(hash_funcs={tokenizers.Tokenizer: lambda _: None, tokenizers.AddedToken: lambda _: None})
def get_ner_model():
    return pipeline("ner", aggregation_strategy = 'simple')

In [210]:
from utils.query_similar import *
query = 'Economic struggle in the UK'
similar_tweets = get_similar_tweets_to_text_string(data.body, query, rank = 10, return_as_dataframe = True)

ner_output_list = [NER_extractor_from_transformer(ner_model(x)) for x in similar_tweets['Top similar tweets']]

In [211]:
list_of_people = [people[0] for people in ner_output_list]
print('People: ', np.unique([j for i in list_of_people for j in i]))

list_of_locations = [locations[1] for locations in ner_output_list]
print('Locations: ', np.unique([j for i in list_of_locations for j in i]))

list_of_organisations = [org[2] for org in ner_output_list]
print('Organisations: ', np.unique([j for i in list_of_organisations for j in i]))

People:  ['Dominic Cummings' 'Johnson']
Locations:  ['Europe' 'UK']
Organisations:  ['DHSC' 'Lib Dem']


In [216]:
np.unique([j for i in list_of_organisations for j in i])

array(['DHSC', 'Lib Dem'], dtype='<U7')

In [10]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
classifier(data.body[0])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=48.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




[{'label': 'NEGATIVE', 'score': 0.9875652194023132}]

In [219]:
data.clusters_0.unique()[:3]

array(['Poor Pay', 'Cost of Living', 'Wage Growth'], dtype=object)

In [240]:
def filter_data_by_topic(data, topics):
    ''' filter data by topics contained in column cluster_0'''
    return data[data.clusters_0.isin(list(topics))]

In [241]:
a = filter_data_by_topic(data,data.clusters_0.unique()[:4] )


In [242]:
a

Unnamed: 0,body,clusters_0,sentiment_positive_score,sentiment_negative_score,sentiment_combined_score,sentiment_summary,published_date,post_type,total_engagement,comments,shares,likes,author_gender,followers,transformer_sentiment,keywords
0,During the pandemic government workers have de...,Poor Pay,7.170060,1.118036,1.511767,positive,2020-11-22,Reposts,0,0,0,0,Unknown,45747,NEGATIVE,"[vital public services, pandemic government wo..."
1,I didn’t realize how poor I was until I got a ...,Poor Pay,3.428319,4.349932,-0.584664,negative,2021-05-07,Reposts,0,0,0,0,Female,82152,NEGATIVE,"[good job, n’t realize, realize how poor, midd..."
2,British people have this really dumb complex w...,Cost of Living,1.277756,3.769611,-1.428440,negative,2021-01-02,Reposts,0,0,0,0,Unknown,13581,NEGATIVE,"[Dominic Cummings, British people, pay rise, d..."
3,Tory MPs have voted against a ban on residents...,Poor Pay,3.365988,5.851462,-0.899544,negative,2021-04-27,Reposts,0,0,0,0,Male,2976,NEGATIVE,"[fire safety problems, fix flammable cladding,..."
4,The trouble with Rishi Sunak's upbeat message ...,Wage Growth,5.453477,2.944297,0.269810,neutral,2021-10-28,Reposts,0,0,0,0,Male,32501,NEGATIVE,"[Rishi Sunak upbeat, Sunak upbeat message, bac..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,UK energy bills could rise 30% in 2022. Wages ...,Cost of Living,2.132700,4.835561,-1.165182,negative,2021-10-08,Reposts,0,0,0,0,Unknown,4105,NEGATIVE,"[bills could rise, energy bills, levellingdown..."
980,"Since the coup , Kyat value falls and the pric...",Poor Pay,2.891547,4.445826,-0.776748,negative,2021-09-17,Original,0,0,0,0,Unknown,143,NEGATIVE,"[Kyat value falls, price of goods, goods incre..."
986,Thomas Paine said Basic Income should be paid ...,Rich People,2.891547,4.301985,-0.743858,negative,2021-03-22,Reposts,0,0,0,0,Female,5049,NEGATIVE,"[Thomas Paine, prevent invidious distinctions,..."
987,Extraordinary: on the same day Johnson has bee...,Poor Pay,6.222808,2.307993,0.645269,positive,2021-03-11,Reposts,0,0,0,0,Unknown,4169,NEGATIVE,"[public money, day Johnson, bridge between Sco..."


In [26]:
sentiment_data = data.transformer_sentiment.value_counts().reset_index()
sentiment_data.columns = ['Sentiment', 'counts']

Unnamed: 0,index,transformer_sentiment
0,NEGATIVE,857
1,POSITIVE,143


In [27]:
## Total 
gender_data = data.author_gender.value_counts().reset_index()
gender_data.columns = ['Sentiment', 'counts']

Unnamed: 0,index,author_gender
0,Unknown,452
1,Male,356
2,Female,192


In [28]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1585.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1625270765.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [34]:
print(data.body[1])
print(summarizer(data.body[1], max_length=50, min_length=10, do_sample=False))

Your max_length is set to 50, but you input_length is only 44. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


I didn’t realize how poor I was until I got a good job. Education and a middle class salary can not undo the long term economic and emotional damage of living most of your life in poverty.
[{'summary_text': '"I didn\'t realize how poor I was until I got a good job," she says. "Education and a middle class salary can not undo the long term economic and emotional damage"'}]


In [41]:
## Clean text:
from scipy import spatial
import gensim.downloader as api

model = api.load("glove-wiki-gigaword-50") 

def preprocess(text_string):
    return [i.lower() for i in text_string.split()]

def get_vector(s):
    return np.sum(np.array([model[i] for i in preprocess(s)]), axis=0)





In [43]:
get_vector(data.body[0])

KeyError: "Key 'services,' not present"

In [44]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

model = LdaModel(common_corpus, 5, common_dictionary)
cm = CoherenceModel(model=model, corpus=common_corpus, coherence='u_mass')
coherence = cm.get_coherence()  # get coherence value