In [None]:
import pandas as pd

In [None]:
import os
import sys

# Put the parent of the notebook's directory on sys.path so project packages can be imported.
# "__file__" is a plain string here, so abspath() resolves it against the current
# working directory; its parent directory is therefore the directory the kernel runs in.
notebook_dir = os.path.split(os.path.abspath("__file__"))[0]
project_dir = os.path.split(notebook_dir)[0]

# Append only once so re-running this cell does not pile up duplicate entries.
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)

In [None]:
# NOTE(review): absolute, machine-specific path. Consider deriving it from a configurable
# data directory so the notebook runs on other machines.
RAW_TWEETS_CSV = r"C:\Users\johna\anaconda3\envs\royalmail-dash\royalmail-dash\data\01_raw\tweets_details2023-03-15_20-43-36.csv"
SAMPLE_SIZE = 10   # number of tweets pushed through the (slow) models below
SAMPLE_SEED = 42   # fixed seed so Restart & Run All picks the same tweets

tweets_raw = pd.read_csv(RAW_TWEETS_CSV)

# Keep English-language tweets only.
tweets_en = tweets_raw.loc[tweets_raw['language'] == 'en']

# Cap the sample at the number of available rows: DataFrame.sample raises ValueError
# when asked for more rows than exist (replace=False). sample() returns a new frame.
tweets_df = tweets_en.sample(n=min(SAMPLE_SIZE, len(tweets_en)), random_state=SAMPLE_SEED)

In [None]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scripts.data_processing.preprocess_tweets_lite import TextCleaner


def get_topics(df, MODEL, model_path, device=None):
    """Clean tweet text and label each row with a predicted topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column plus the pass-through columns listed in
        ``passthrough_columns`` below. The input frame is not modified.
    MODEL : str
        Name or path handed to ``AutoTokenizer.from_pretrained``.
    model_path : str
        Name or path handed to ``AutoModelForSequenceClassification.from_pretrained``.
    device : int, optional
        Device index forwarded to ``transformers.pipeline``. When omitted, GPU 0 is
        used if CUDA is available, otherwise the CPU (``-1``).

    Returns
    -------
    pandas.DataFrame
        New frame with ``topic``, ``topic_score``, ``cleaned_text`` and the
        pass-through columns, in that order.

    Raises
    ------
    KeyError
        If a required input column is missing.
    """
    passthrough_columns = [
        "user.location", "user.is_blue_verified", "favorite_count",
        "retweet_count", "reply_count", "quote_count", "creation_date",
    ]
    output_columns = ["topic", "topic_score", "cleaned_text"] + passthrough_columns

    _df = df.copy()

    # Fail fast: without this check a missing column only surfaces after the model
    # has been loaded and inference has run.
    missing = [col for col in ["text"] + passthrough_columns if col not in _df.columns]
    if missing:
        raise KeyError(f"get_topics: input frame is missing required columns: {missing}")

    # Nothing to classify: skip the model load entirely and return an empty result
    # with the expected schema.
    if len(_df) == 0:
        for col in ("cleaned_text", "topic", "topic_score"):
            _df[col] = []
        return _df[output_columns].copy()

    # Pick the device automatically so the function also runs on CPU-only machines.
    if device is None:
        import torch

        device = 0 if torch.cuda.is_available() else -1

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    cleaner = TextCleaner(stop_words_remove=False)
    _df['cleaned_text'] = _df['text'].apply(cleaner.clean_text)

    # set up pipeline
    classifier = pipeline(
        "text-classification", model=model, tokenizer=tokenizer, device=device
    )

    # perform inference and extract predicted class and probability
    results = classifier(list(_df["cleaned_text"]), truncation=True)

    _df["topic"] = [result['label'] for result in results]
    _df["topic_score"] = [result['score'] for result in results]

    # Return an independent frame so callers can modify it without
    # SettingWithCopyWarning.
    return _df[output_columns].copy()


In [None]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scripts.data_processing.preprocess_tweets_lite import TextCleaner


def get_emotion(df, device=None):
    """Label each row's ``cleaned_text`` with a predicted emotion.

    Uses the ``cardiffnlp/twitter-roberta-base-emotion`` checkpoint for both the
    model and the tokenizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cleaned_text`` column. The input frame is not modified.
    device : int, optional
        Device index forwarded to ``transformers.pipeline``. When omitted, GPU 0 is
        used if CUDA is available, otherwise the CPU (``-1``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``emotion`` and ``emotion_score`` columns added.

    Raises
    ------
    KeyError
        If ``cleaned_text`` is missing from ``df``.
    """
    _df = df.copy()

    # Fail fast instead of loading the model before hitting the missing column.
    if "cleaned_text" not in _df.columns:
        raise KeyError("get_emotion: input frame is missing required column 'cleaned_text'")

    # Nothing to classify: return the expected schema without loading a model.
    if len(_df) == 0:
        _df["emotion"] = []
        _df["emotion_score"] = []
        return _df

    # Pick the device automatically so the function also runs on CPU-only machines.
    if device is None:
        import torch

        device = 0 if torch.cuda.is_available() else -1

    task='emotion'
    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    # Set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # set up pipeline
    classifier = pipeline(
        "text-classification", model=model, tokenizer=tokenizer, device=device
    )

    # perform inference and extract predicted class and probability
    results = classifier(list(_df["cleaned_text"]), truncation=True)

    _df["emotion"] = [result['label'] for result in results]
    _df["emotion_score"] = [result['score'] for result in results]

    return _df


In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

def get_lat_lon(location, max_retries=3):
    """Geocode a free-text location with Nominatim.

    Parameters
    ----------
    location : str
        Place description to look up. ``None``, NaN and blank strings are treated
        as "no location" and are never sent to the geocoding service.
    max_retries : int, optional
        Number of additional attempts made after a ``GeocoderTimedOut``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` on success, ``(None, None)`` when the location is
        missing, cannot be resolved, or every attempt timed out.
    """
    # Missing values (None / NaN) and blank strings cannot be geocoded; passing them
    # on would waste a request and could match an unrelated place.
    if location is None:
        return None, None
    if isinstance(location, float) and location != location:  # NaN check
        return None, None
    if isinstance(location, str) and not location.strip():
        return None, None

    geolocator = Nominatim(user_agent="geoapiExercises")

    # Bounded retry loop: the previous unbounded recursion could run until
    # RecursionError if the service kept timing out.
    for _attempt in range(max(0, max_retries) + 1):
        try:
            location_data = geolocator.geocode(location, timeout=10)
        except GeocoderTimedOut:
            continue
        if location_data:
            return location_data.latitude, location_data.longitude
        return None, None

    return None, None

def add_lat_lon_columns(df, location_column, geocoder=None):
    """Return a copy of ``df`` with ``latitude`` and ``longitude`` columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    location_column : str
        Name of the column holding the location values to geocode.
    geocoder : callable, optional
        Function mapping one location value to a ``(latitude, longitude)`` pair.
        Defaults to ``get_lat_lon``; pass a stub or a caching wrapper to run
        without network access.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two coordinate columns appended.
    """
    if geocoder is None:
        geocoder = get_lat_lon

    df_copy = df.copy()
    pairs = df_copy[location_column].apply(geocoder)

    # Explicit column names keep the frame two columns wide even when `df` is empty
    # (a bare DataFrame([]) has no columns and the assignment would fail).
    coords = pd.DataFrame(
        list(pairs), index=df_copy.index, columns=['latitude', 'longitude']
    )

    # Assign positionally; no temporary 'coordinates' column is created, so an
    # existing column of that name in `df` is left alone.
    df_copy['latitude'] = coords['latitude'].to_numpy()
    df_copy['longitude'] = coords['longitude'].to_numpy()
    return df_copy


In [None]:
# Topic step: MODEL names the checkpoint used for the tokenizer, model_path points at
# the locally stored classification model.
tweets_dash_data = get_topics(
    df=tweets_df,
    MODEL="cardiffnlp/twitter-roberta-base",
    model_path=r"C:\Users\johna\OneDrive\Desktop\models_twitter_dash\output",
)

In [None]:
# Emotion step: runs on the `cleaned_text` column produced by the topic step.
tweets_dash_data_2 = get_emotion(tweets_dash_data)

In [None]:
# Geocoding step: turn the free-text `user.location` values into latitude/longitude columns.
tweets_dash_final = add_lat_lon_columns(
    df=tweets_dash_data_2,
    location_column='user.location',
)
tweets_dash_final

In [None]:
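# Optional export (disabled by default): uncomment to write the enriched tweets to the intermediate data folder.
#tweets_dash_final.to_csv(r"C:\Users\johna\anaconda3\envs\royalmail-dash\royalmail-dash\data\02_intermediate\tweets_dash.csv", index=False)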