In [1]:
import pandas as pd

In [2]:
import os
import sys

# A notebook kernel defines no `__file__`, so the kernel's working directory is
# taken as the notebook's location. The project root is assumed to be its
# parent directory -- TODO confirm the kernel is always started from the
# notebook's own folder.
notebook_dir = os.getcwd()
project_dir = os.path.dirname(notebook_dir)

# Make project-local packages (e.g. `scripts.*`) importable from this notebook.
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [3]:
# Load the raw tweet export.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
# NOTE(review): the displayed output shows `tweet_id` as float (1.6e+18), which
# cannot represent 19-digit IDs exactly -- confirm whether exact IDs are needed.
tweets_df = pd.read_csv(r"C:\Users\johna\anaconda3\envs\royalmail-dash\royalmail-dash\data\01_raw\tweets_details2023-03-15_20-43-36.csv")

# Keep English tweets only; `.copy()` so the column assignments below act on an
# independent frame rather than a view of the raw data.
tweets_df = tweets_df.loc[tweets_df['language'] == 'en'].copy()

# Parse the Twitter timestamp format and derive a 'YYYY-MM' bucket.
tweets_df['date'] = pd.to_datetime(tweets_df['creation_date'], format='%a %b %d %H:%M:%S +0000 %Y')
# `.dt.strftime` is vectorised and leaves missing dates as NaN instead of raising.
tweets_df['month_year'] = tweets_df['date'].dt.strftime('%Y-%m')

# Restrict to 2023 onwards; an explicit ISO timestamp avoids day/month ambiguity.
tweets_df = tweets_df.loc[tweets_df['date'] >= pd.Timestamp('2023-01-01')]
tweets_df = tweets_df.drop_duplicates()

# A location is required later for geocoding.
tweets_df = tweets_df.dropna(subset=['user.location'])

# Drop listed official/news accounts and any account whose handle contains "news".
usernames_to_exclude = ['myroyalmailuk', 'royalmailnews', 'RoyalMailStamps', 'RoyalMail', 'Royal_Mail_PAF', 'BBCNews', 'BBCTech', 'BBCBreaking', 'FSNewsUK']
tweets_df = tweets_df.loc[~tweets_df['user.username'].isin(usernames_to_exclude)]
# `na=False` keeps the mask strictly boolean when a username is missing; such
# rows are retained, matching how the `isin` filter above treats them.
tweets_df = tweets_df.loc[~tweets_df['user.username'].str.contains('news', case=False, na=False)]

# Keep only accounts explicitly flagged as not blue-verified.
tweets_df = tweets_df.loc[tweets_df['user.is_blue_verified'] == False]

tweets_df

Unnamed: 0,tweet_id,creation_date,text,media_url,video_url,language,favorite_count,retweet_count,reply_count,quote_count,...,user.description,user.external_url,user.number_of_tweets,user.bot,user.timestamp,user.has_nft_avatar,user,detail,date,month_year
30135,1.609589e+18,Sun Jan 01 16:33:45 +0000 2023,"20 days to 41, and I still don't have some of ...",,,en,0.0,0.0,0.0,0.0,...,"Software developer. Science, Tech, LEGO. Gener...",https://wizpip.com,31750.0,False,1.199709e+09,False,,,2023-01-01 16:33:45,2023-01
30136,1.609571e+18,Sun Jan 01 15:22:04 +0000 2023,How Can I Track My Royal Mail Parcel Using My ...,,,en,0.0,0.0,0.0,0.0,...,World's Leading Business and News Magazine,https://business2news.com/,619.0,False,1.590952e+09,False,,,2023-01-01 15:22:04,2023-01
30137,1.609661e+18,Sun Jan 01 21:19:48 +0000 2023,Just ludicrous! #Capping at 2%. Now is the ti...,,,en,0.0,0.0,0.0,0.0,...,Believe everyone is Equal. \nPhD Prison Office...,,22806.0,False,1.355993e+09,False,,,2023-01-01 21:19:48,2023-01
30138,1.609626e+18,Sun Jan 01 19:03:01 +0000 2023,@RoyalMail 90% of my Xmas post which I sent fi...,,,en,0.0,1.0,0.0,0.0,...,"Mother to teenager, originally from Hannover, ...",,1458.0,False,1.578340e+09,False,,,2023-01-01 19:03:01,2023-01
30139,1.609671e+18,Sun Jan 01 22:01:17 +0000 2023,Shame on #SimonThompson and the #RoyalMail boa...,,,en,1.0,0.0,0.0,0.0,...,"I make stuff: chaos, cake, poems, stitches, lo...",http://helenmccarthy.wordpress.com,193252.0,False,1.260864e+09,False,,,2023-01-01 22:01:17,2023-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31537,1.635787e+18,Tue Mar 14 23:34:48 +0000 2023,'The oppositional sentiments expressed by post...,,,en,6.0,4.0,0.0,0.0,...,"Validation,validation,validation. My Great Gra...",http://www.wsws.org,119012.0,False,1.244396e+09,False,,,2023-03-14 23:34:48,2023-03
31538,1.635323e+18,Mon Mar 13 16:52:57 +0000 2023,Couple of old covers used for outgoing orders ...,['https://pbs.twimg.com/media/FrHVqfvXoAoj6oF....,,en,63.0,1.0,1.0,1.0,...,"Team PTS! MD @ptsandstampex, proud Fellow of @...",http://www.artstamped.com,13749.0,False,1.389218e+09,False,,,2023-03-13 16:52:57,2023-03
31540,1.635573e+18,Tue Mar 14 09:24:47 +0000 2023,Our next breakfast is Thurs 6 Apr at @HullGolf...,['https://pbs.twimg.com/media/FrK4rZ7XsAARiNR....,,en,1.0,0.0,0.0,1.0,...,Hull Businesswomen's Breakfast Club - to provi...,,1739.0,False,1.364481e+09,False,,,2023-03-14 09:24:47,2023-03
31541,1.635522e+18,Tue Mar 14 06:02:30 +0000 2023,Good morning! We’ll be open from 7am for all o...,['https://pbs.twimg.com/media/FrKKYKHX0AA16KB....,,en,9.0,3.0,1.0,0.0,...,"Convenience Store & Post Office Selling 🗞, 🍞,🍏...",http://www.facebook.com/boscombeeastpo,10678.0,False,1.349427e+09,False,,,2023-03-14 06:02:30,2023-03


In [4]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scripts.data_processing.preprocess_tweets_lite import TextCleaner


def get_topics(df, MODEL, model_path, device=None, max_length=512):
    """Label every tweet in ``df`` with a topic and its classifier score.

    Parameters
    ----------
    df : pandas.DataFrame
        Input tweets. Must contain a ``text`` column and every column listed in
        the returned frame other than the ones created here.
    MODEL : str
        Identifier passed to ``AutoTokenizer.from_pretrained``.
    model_path : str
        Location passed to ``AutoModelForSequenceClassification.from_pretrained``.
    device : int, optional
        Device index handed to the transformers ``pipeline``. When omitted, GPU 0
        is used if CUDA is available, otherwise the CPU (``-1``).
    max_length : int, optional
        Upper bound on tokens per input. Applied only when the tokenizer reports
        a larger limit, so that ``truncation=True`` actually truncates.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``tweet_id``, ``topic``, ``topic_score``,
        ``cleaned_text`` and the location / engagement / date columns.
        ``df`` itself is not modified.
    """
    # Local import: torch is only needed here to detect whether a GPU exists.
    import torch

    if device is None:
        device = 0 if torch.cuda.is_available() else -1

    _df = df.copy()

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    # Without a usable model_max_length the pipeline warns "Default to no
    # truncation" and over-long inputs reach the model unshortened.
    if tokenizer.model_max_length > max_length:
        tokenizer.model_max_length = max_length

    cleaner = TextCleaner(stop_words_remove=False)
    _df['cleaned_text'] = _df['text'].apply(cleaner.clean_text)

    # set up pipeline
    classifier = pipeline(
        "text-classification", model=model, tokenizer=tokenizer, device=device
    )

    # perform inference and extract predicted class and probability
    texts = _df["cleaned_text"].tolist()
    results = classifier(texts, truncation=True) if texts else []

    _df["topic"] = [result['label'] for result in results]
    _df["topic_score"] = [result['score'] for result in results]

    # Return an independent frame holding only the columns the dashboard uses.
    output_columns = [
        "tweet_id", "topic", "topic_score", "cleaned_text",
        "user.location", "user.is_blue_verified",
        "favorite_count", "retweet_count", "reply_count", "quote_count",
        "date", "month_year",
    ]
    return _df[output_columns].copy()


In [5]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scripts.data_processing.preprocess_tweets_lite import TextCleaner


def get_emotion(df, device=None, max_length=512):
    """Add sentiment predictions to a frame of tweets.

    Despite the name, this runs the
    ``cardiffnlp/twitter-roberta-base-sentiment-latest`` checkpoint and writes
    its label and score to ``sentiment`` / ``sentiment_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets with a ``cleaned_text`` column. If that column is absent it is
        built from ``text`` with ``TextCleaner(stop_words_remove=False)``.
    device : int, optional
        Device index handed to the transformers ``pipeline``. When omitted, GPU 0
        is used if CUDA is available, otherwise the CPU (``-1``).
    max_length : int, optional
        Upper bound on tokens per input. Applied only when the tokenizer reports
        a larger limit, so that ``truncation=True`` actually truncates.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``sentiment`` and ``sentiment_score`` added.
    """
    # Local import: torch is only needed here to detect whether a GPU exists.
    import torch

    if device is None:
        device = 0 if torch.cuda.is_available() else -1

    _df = df.copy()

    #task='sentiment'
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    # Set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    # Without a usable model_max_length the pipeline warns "Default to no
    # truncation" and over-long inputs reach the model unshortened.
    if tokenizer.model_max_length > max_length:
        tokenizer.model_max_length = max_length

    # Only clean when the caller has not already supplied cleaned text.
    if "cleaned_text" not in _df.columns:
        cleaner = TextCleaner(stop_words_remove=False)
        _df["cleaned_text"] = _df["text"].apply(cleaner.clean_text)

    # set up pipeline
    classifier = pipeline(
        "text-classification", model=model, tokenizer=tokenizer, device=device
    )

    # perform inference and extract predicted class and probability
    texts = _df["cleaned_text"].tolist()
    results = classifier(texts, truncation=True) if texts else []

    _df["sentiment"] = [result['label'] for result in results]
    _df["sentiment_score"] = [result['score'] for result in results]

    return _df


In [6]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

def get_lat_lon(location, max_retries=3):
    """Geocode ``location`` with Nominatim and return ``(latitude, longitude)``.

    Parameters
    ----------
    location : str
        Free-text place name passed to ``Nominatim.geocode``.
    max_retries : int, optional
        How many additional attempts are made after a ``GeocoderTimedOut``.
        Bounding the retries prevents endless recursion when the service keeps
        timing out.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` on success, otherwise ``(None, None)`` when
        the geocoder finds nothing or every attempt timed out.
    """
    # NOTE(review): "geoapiExercises" is a generic user agent; confirm it
    # satisfies the Nominatim usage policy for this project.
    geolocator = Nominatim(user_agent="geoapiExercises")

    attempts = max(0, max_retries) + 1
    for _ in range(attempts):
        try:
            location_data = geolocator.geocode(location, timeout=10)
        except GeocoderTimedOut:
            # Transient failure: try again until the attempt budget is spent.
            continue
        if location_data:
            return location_data.latitude, location_data.longitude
        # The service answered but found no match; retrying would not help.
        return None, None
    return None, None

def add_lat_lon_columns(df, location_column):
    """Return a copy of ``df`` with ``latitude`` and ``longitude`` columns.

    Each distinct non-missing value of ``location_column`` is geocoded once via
    ``get_lat_lon`` and the result is reused for every row sharing that value,
    which avoids repeated network requests for the same place. Rows whose
    location is missing, or that cannot be geocoded, get NaN coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    location_column : str
        Name of the column holding free-text locations.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with float ``latitude`` / ``longitude`` columns appended.
    """
    df_copy = df.copy()
    locations = df_copy[location_column]

    # One geocoder request per distinct location instead of one per row.
    lookup = {place: get_lat_lon(place) for place in locations.dropna().unique()}

    # Missing locations are never keys of `lookup`, so they fall back to no coordinates.
    coordinates = [lookup.get(place, (None, None)) for place in locations]

    # Assign by position (plain arrays) so duplicate index labels cannot break alignment.
    df_copy['latitude'] = pd.Series(
        [lat for lat, _ in coordinates], dtype="float64"
    ).to_numpy()
    df_copy['longitude'] = pd.Series(
        [lon for _, lon in coordinates], dtype="float64"
    ).to_numpy()
    return df_copy


In [7]:
# Topic inference: the tokenizer comes from the base checkpoint name, the
# classifier weights are loaded from a local directory.
tweets_dash_data = get_topics(
    df=tweets_df,
    MODEL="cardiffnlp/twitter-roberta-base",
    model_path=r"C:\Users\johna\OneDrive\Desktop\models_twitter_dash\output",
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
# Append sentiment label and score columns to the topic-labelled tweets.
tweets_dash_data_2 = get_emotion(tweets_dash_data)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
# Geocode each tweet's user location, then keep only rows where both
# coordinates were resolved.
tweets_dash_final = (
    add_lat_lon_columns(tweets_dash_data_2, 'user.location')
    .dropna(subset=['latitude', 'longitude'])
    .copy()
)
tweets_dash_final

Unnamed: 0,tweet_id,topic,topic_score,cleaned_text,user.location,user.is_blue_verified,favorite_count,retweet_count,reply_count,quote_count,date,month_year,sentiment,sentiment_score,latitude,longitude
30135,1.609589e+18,Customer Service,0.960980,"20 days to 41, and I still don't have some of ...",Bedford,False,0.0,0.0,0.0,0.0,2023-01-01 16:33:45,2023-01,neutral,0.468922,40.027145,-78.523745
30136,1.609571e+18,Customer Service,0.943773,How Can I Track My Royal Mail Parcel Using My ...,"Durgapur, India",False,0.0,0.0,0.0,0.0,2023-01-01 15:22:04,2023-01,neutral,0.911570,23.535048,87.338043
30137,1.609661e+18,Financial News,0.969004,Just ludicrous! #Capping at 2%. Now is the ti...,Scotland,False,0.0,0.0,0.0,0.0,2023-01-01 21:19:48,2023-01,negative,0.883241,56.786111,-4.114052
30138,1.609626e+18,Customer Service,0.958329,@RoyalMail 90% of my Xmas post which I sent fi...,Lincolnshire UK,False,0.0,1.0,0.0,0.0,2023-01-01 19:03:01,2023-01,negative,0.567491,53.182303,-0.203121
30139,1.609671e+18,Financial News,0.972166,Shame on #SimonThompson and the #RoyalMail boa...,London,False,1.0,0.0,0.0,0.0,2023-01-01 22:01:17,2023-01,negative,0.926638,51.507336,-0.127650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31537,1.635787e+18,Customer Service,0.526125,'The oppositional sentiments expressed by post...,Liverpool,False,6.0,4.0,0.0,0.0,2023-03-14 23:34:48,2023-03,neutral,0.631704,53.407199,-2.991680
31538,1.635323e+18,Customer Service,0.960618,Couple of old covers used for outgoing orders ...,United Kingdom,False,63.0,1.0,1.0,1.0,2023-03-13 16:52:57,2023-03,positive,0.823597,54.702354,-3.276575
31540,1.635573e+18,Royal Mail Jobs,0.977593,Our next breakfast is Thurs 6 Apr at @HullGolf...,Hull,False,1.0,0.0,0.0,1.0,2023-03-14 09:24:47,2023-03,neutral,0.829839,53.743572,-0.339476
31541,1.635522e+18,Royal Mail Jobs,0.979051,Good morning! We’ll be open from 7am for all o...,"Bournemouth, BH7 6DQ",False,9.0,3.0,1.0,0.0,2023-03-14 06:02:30,2023-03,positive,0.836590,50.744673,-1.857958


In [10]:
# Persist the enriched tweets for the dashboard; the row index is not written.
tweets_dash_final.to_csv(
    r"C:\Users\johna\anaconda3\envs\royalmail-dash\royalmail-dash\data\02_intermediate\tweets_dash.csv",
    index=False,
)

In [11]:
import folium
from folium.plugins import MarkerCluster
import branca

# Define a function to assign emojis based on emotion
def get_emoji(sentiment):
    """Map a sentiment label to the emoji drawn on the map.

    ``"positive"``, ``"neutral"`` and ``"negative"`` each have their own emoji;
    any other value yields the question-mark emoji.
    """
    emoji_by_sentiment = {
        "positive": "😁",
        "neutral": "😐",
        "negative": "😡",
    }
    return emoji_by_sentiment.get(sentiment, "❓")

# Standard library; used to escape tweet text before it is embedded in tooltip HTML.
import html

# Derive the marker emoji from the 'sentiment' column.
tweets_dash_final['emoji'] = tweets_dash_final['sentiment'].apply(get_emoji)

# Create a base map centred on London at a zoom level that shows the UK.
# NOTE(review): the name `map` shadows the Python builtin; it is kept here so
# any other cell that refers to it keeps working -- consider renaming.
map = folium.Map(location=[51.5074, -0.1278], zoom_start=6)

# Expected topic labels, extended with any other label present in the data so
# that an unexpected prediction cannot raise a KeyError when markers are added.
topics = ['Customer Service', 'Philately', 'Politics', 'Royal Reply', 'Royal Mail Jobs', 'Financial News']
for observed_topic in tweets_dash_final['topic'].dropna().unique():
    if observed_topic not in topics:
        topics.append(observed_topic)

# Create one toggleable FeatureGroup per topic and add it to the map.
topic_layer_groups = {}
for topic in topics:
    topic_layer_groups[topic] = folium.FeatureGroup(name=topic)
    map.add_child(topic_layer_groups[topic])

# Add one emoji marker per tweet to the layer of its topic.
marker_fields = zip(
    tweets_dash_final['latitude'],
    tweets_dash_final['longitude'],
    tweets_dash_final['emoji'],
    tweets_dash_final['topic'],
    tweets_dash_final['cleaned_text'],
)
for lat, lon, emoji, topic, cleaned_text in marker_fields:
    layer = topic_layer_groups.get(topic)
    if layer is None:
        # Only rows with a missing topic have no layer; there is nowhere to put them.
        continue

    marker = folium.Marker(
        location=[lat, lon],
        icon=folium.DivIcon(html=f"""<div style="font-size:24px;">{emoji}</div>"""),
        # Tweet text is untrusted input: escape it so it is shown as text
        # rather than interpreted as HTML in the tooltip.
        tooltip=html.escape(str(cleaned_text)),
    )
    layer.add_child(marker)

# Add layer control to switch between layers (must come after all layers exist).
map.add_child(folium.LayerControl())

# Display the map
map
