In [1]:
import pandas as pd

In [2]:
import os
import sys

notebook_dir = os.path.dirname(os.path.abspath("__file__"))
project_dir = os.path.dirname(notebook_dir)

if project_dir not in sys.path:
    sys.path.append(project_dir)

In [3]:
tweets_df = pd.read_csv(r"C:\Users\johna\anaconda3\envs\royalmail-dash\royalmail-dash\data\01_raw\tweets_details2023-03-15_20-43-36.csv")
tweets_df = tweets_df.loc[tweets_df['language'] == 'en'].copy()
tweets_df = tweets_df.sample(10)

In [4]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scripts.data_processing.preprocess_tweets_lite import TextCleaner


def get_topics(df, MODEL, model_path):
    
    _df = df.copy()
        

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    cleaner = TextCleaner(stop_words_remove=False)

    _df['cleaned_text'] = _df['text'].apply(cleaner.clean_text)

    # set up pipeline
    classifier = pipeline(
        "text-classification", model=model, tokenizer=tokenizer, device=0
    )

    # perform inference and extract predicted class and probability
    results = classifier(list(_df["cleaned_text"]), truncation=True)
    
    _df["topic"] = [result['label'] for result in results]
    _df["topic_score"] = [result['score'] for result in results]
    
    # Create a new DataFrame with the required columns
    output_df = _df[["topic", "topic_score", "cleaned_text", "user.location", "user.is_blue_verified", "favorite_count", "retweet_count", "reply_count", "quote_count", "creation_date"]]

    return output_df


In [5]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scripts.data_processing.preprocess_tweets_lite import TextCleaner


def get_emotion(df):
    
    _df = df.copy()
            
    task='emotion'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"    

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    # Set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    cleaner = TextCleaner(stop_words_remove=False)

    # set up pipeline
    classifier = pipeline(
        "text-classification", model=model, tokenizer=tokenizer, device=0
    )

    # perform inference and extract predicted class and probability
    results = classifier(list(_df["cleaned_text"]), truncation=True)
    
    _df["emotion"] = [result['label'] for result in results]
    _df["emotion_score"] = [result['score'] for result in results]
    
    return _df


In [6]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

def get_lat_lon(location):
    geolocator = Nominatim(user_agent="geoapiExercises")
    try:
        location_data = geolocator.geocode(location, timeout=10)
        if location_data:
            return location_data.latitude, location_data.longitude
    except GeocoderTimedOut:
        return get_lat_lon(location)
    return None, None

def add_lat_lon_columns(df, location_column):
    df_copy = df.copy()
    df_copy['coordinates'] = df_copy[location_column].apply(get_lat_lon)
    df_copy[['latitude', 'longitude']] = pd.DataFrame(df_copy['coordinates'].tolist(), index=df_copy.index)
    df_copy = df_copy.drop(columns=['coordinates'])
    return df_copy


In [7]:
tweets_dash_data = get_topics(df=tweets_df, MODEL="cardiffnlp/twitter-roberta-base", model_path=r"C:\Users\johna\OneDrive\Desktop\models_twitter_dash\output")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
tweets_dash_data_2 = get_emotion(df=tweets_dash_data)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
tweets_dash_final = add_lat_lon_columns(tweets_dash_data_2, 'user.location')
tweets_dash_final

Unnamed: 0,topic,topic_score,cleaned_text,user.location,user.is_blue_verified,favorite_count,retweet_count,reply_count,quote_count,creation_date,emotion,emotion_score,latitude,longitude
1349,Customer Service,0.95595,You Just got Lawyered by #RoyalMail! 😂😂😂,On the Corner,False,0.0,0.0,0.0,1.0,Mon Nov 07 10:57:09 +0000 2016,optimism,0.5374,52.048027,-0.023892
692,Customer Service,0.88676,#RoyalMail advising plebs there's no night shi...,Taxhaven-sur-Mer,False,0.0,0.0,0.0,0.0,Mon Sep 12 23:47:42 +0000 2016,joy,0.951047,,
8813,Customer Service,0.943916,How fantastic is this! Please get involved and...,"East Midlands, England",False,314.0,326.0,34.0,53.0,Sat Jul 14 22:56:59 +0000 2018,optimism,0.904597,52.796561,-0.671795
30674,Politics,0.965101,"So @darrenpjones How can you call yourself ""im...",Wirral,False,1.0,0.0,0.0,0.0,Thu Jan 26 18:34:09 +0000 2023,joy,0.969231,53.340971,-3.050092
29707,Customer Service,0.931512,@karren_brady maybe you should look in to the ...,"Wigan, from the Isle of Bute",False,0.0,0.0,0.0,0.0,Sun Dec 11 22:03:11 +0000 2022,joy,0.924056,,
21340,Customer Service,0.95924,"New ""depot scam"" currently doing the rounds \n...","Kendal, Cumbria",False,1.0,1.0,1.0,0.0,Mon Feb 15 20:00:01 +0000 2021,joy,0.844457,54.32898,-2.747183
2330,Customer Service,0.942809,Sometimes it's the simple things. #postbox #ro...,"West Midlands, England",False,0.0,0.0,0.0,0.0,Fri Jan 20 19:28:57 +0000 2017,sadness,0.383989,52.505003,-1.964396
8090,Politics,0.933722,Glad @CWUnews has condemned this apparently se...,"Scarborough, England",False,0.0,0.0,0.0,0.0,Thu May 17 23:14:14 +0000 2018,joy,0.97061,54.282001,-0.401187
25729,Financial News,0.974386,MARSHALL WACE LLP declares that on March 10 it...,,False,0.0,0.0,0.0,0.0,Fri Mar 11 19:34:23 +0000 2022,optimism,0.402407,46.314475,11.048029
12940,Financial News,0.9709,European Equity Movers this morning: #RoyalMai...,"London, England",False,0.0,0.0,0.0,0.0,Mon Jun 17 07:22:16 +0000 2019,optimism,0.662903,51.507336,-0.12765


In [10]:
#tweets_dash_final.to_csv(r"C:\Users\johna\anaconda3\envs\royalmail-dash\royalmail-dash\data\02_intermediate\tweets_dash.csv", index=False)