# Dependencies

In [None]:
!pip install numpy
!pip install pandas
!pip install plotly
!pip install pydantic
!pip install pyyaml
!pip install nltk
!pip install gpt-2-simple
!pip install wordcloud
!pip install matplotlib

In [None]:
!sudo apt install -y --no-install-recommends g++ protobuf-compiler libprotobuf-dev
!pip install gcld3

In [None]:
!nvidia-smi

# Imports

In [1]:
import gcld3
import pandas as pd
import nltk
import gpt_2_simple as gpt2
import matplotlib.pyplot as plt
import string
import re
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Download the VADER lexicon; SentimentIntensityAnalyzer (used in filter_by_sentiment) relies on it.
nltk.download('vader_lexicon')

In [None]:
# Download the pretrained "124M" GPT-2 model that the fine-tuning cells reference via model_name='124M'.
gpt2.download_gpt2(model_name="124M")

# Constants

In [7]:
# Columns kept from the raw tweet CSV files.
COLUMNS = ['tweet', 'likes', 'retweet_count', 'user_screen_name', 'user_description', 'user_followers_count']
# Language code a tweet must be detected as in order to be kept.
LANG = 'en'
# "@" followed by up to 15 letters, digits or underscores.
# NOTE(review): the {0,15} quantifier also matches a bare "@" — confirm that is intended.
TW_USERNAME_REGEX = r"@[a-zA-Z0-9_]{0,15}"
# URLs that start with http://, https:// or www.
URL_REGEX = r"\b(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][" \
            r"a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2," \
            r"}|www\.[a-zA-Z0-9]+\.[^\s]{2,})\b"
# One or more consecutive whitespace characters.
SPACES_REGEX = r"\s+"
# Anchored with ^ and $: matches only a string that consists entirely of a single hashtag,
# not hashtags embedded in a longer text.
HASHTAG_REGEX = r'^#[^ !@#$%^&*(),.?":{}|<>]*$'

In [None]:
# Project folder on the mounted Google Drive; the input CSVs are read from it and the
# generated text files are written to it.
GOOGLE_DRIVE_BASE_DIR = "/content/drive/MyDrive/ITBA/Quinto Año/Segundo Cuatrimestre/NLP/TP"
# Mount Google Drive in the runtime through the gpt-2-simple helper.
gpt2.mount_gdrive()

# Helper functions

In [None]:
def get_selected_columns(df, columns):
    """Return the part of ``df`` addressed by ``columns``.

    Args:
        df: pandas DataFrame to select from.
        columns: column label(s), used directly as ``df[columns]``.

    Returns:
        The result of ``df[columns]`` (a DataFrame when ``columns`` is a list of labels).
    """
    subset = df[columns]
    return subset


def delete_hashtag_symbol(df):
    """Strip every ``#`` character from the ``tweet`` column.

    The column is overwritten on the DataFrame that was passed in.

    Args:
        df: pandas DataFrame with a ``tweet`` column.

    Returns:
        The same DataFrame object, after the column has been rewritten.
    """
    without_hash = df['tweet'].replace('#', '', regex=True)
    df['tweet'] = without_hash
    return df


def delete_twitter_username(df):
    """Remove every match of ``TW_USERNAME_REGEX`` from the ``tweet`` column.

    The column is overwritten on the DataFrame that was passed in.

    Args:
        df: pandas DataFrame with a ``tweet`` column.

    Returns:
        The same DataFrame object, after the column has been rewritten.
    """
    without_handles = df['tweet'].replace(TW_USERNAME_REGEX, '', regex=True)
    df['tweet'] = without_handles
    return df


def delete_urls(df):
    """Remove every match of ``URL_REGEX`` from the ``tweet`` column.

    The column is overwritten on the DataFrame that was passed in.

    Args:
        df: pandas DataFrame with a ``tweet`` column.

    Returns:
        The same DataFrame object, after the column has been rewritten.
    """
    without_links = df['tweet'].replace(URL_REGEX, '', regex=True)
    df['tweet'] = without_links
    return df


def delete_multiple_spaces(df):
    """Collapse every run of whitespace in the ``tweet`` column into one space.

    The column is overwritten on the DataFrame that was passed in.

    Args:
        df: pandas DataFrame with a ``tweet`` column.

    Returns:
        The same DataFrame object, like the other ``delete_*`` helpers, so the
        call can be chained or reassigned.
    """
    # Replace with a single space, not an empty string: an empty replacement
    # would remove the separators and glue neighbouring words together.
    df['tweet'] = df['tweet'].replace(SPACES_REGEX, ' ', regex=True)
    return df


def is_lang(row, detector, lang='en'):
    """Tell whether a row's tweet is reliably detected as ``lang``.

    Args:
        row: mapping-like object with a ``tweet`` entry.
        detector: object exposing ``FindLanguage(text=...)`` whose result has
            ``language`` and ``is_reliable`` attributes.
        lang: language code the prediction must equal.

    Returns:
        True only when the predicted language equals ``lang`` and the
        prediction is flagged as reliable; False otherwise.
    """
    guess = detector.FindLanguage(text=row['tweet'])
    if guess.language != lang:
        return False
    return True if guess.is_reliable else False


def filter_by_language(df, lang='en'):
    """Keep only the rows whose tweet is reliably detected as ``lang``.

    A single gcld3 identifier (min_num_bytes=50, max_num_bytes=2048) is built
    and reused for every row.

    Args:
        df: pandas DataFrame with a ``tweet`` column.
        lang: language code forwarded to ``is_lang``.

    Returns:
        ``df`` indexed by the boolean mask produced by ``is_lang``.
    """
    identifier = gcld3.NNetLanguageIdentifier(min_num_bytes=50, max_num_bytes=2048)
    keep = df.apply(lambda row: is_lang(row, detector=identifier, lang=lang), axis=1)
    return df[keep]


def is_feeling(row, sia, feeling, threshold):
    """Tell whether a row's tweet scores above ``threshold`` for ``feeling``.

    Args:
        row: mapping-like object with a ``tweet`` entry.
        sia: object exposing ``polarity_scores(text)`` that returns a mapping of scores.
        feeling: key looked up in the returned scores.
        threshold: value the score must strictly exceed.

    Returns:
        True when ``scores[feeling] > threshold``, False otherwise.
    """
    scores = sia.polarity_scores(row['tweet'])
    return True if scores[feeling] > threshold else False


def filter_by_sentiment(df, feeling, threshold=0.4):
    """Keep only the rows whose tweet scores above ``threshold`` for ``feeling``.

    One SentimentIntensityAnalyzer is created and reused for every row.

    Args:
        df: pandas DataFrame with a ``tweet`` column.
        feeling: score key forwarded to ``is_feeling``.
        threshold: value the score must strictly exceed.

    Returns:
        ``df`` indexed by the boolean mask produced by ``is_feeling``.
    """
    analyzer = SentimentIntensityAnalyzer()
    keep = df.apply(
        lambda row: is_feeling(row, sia=analyzer, feeling=feeling, threshold=threshold),
        axis=1,
    )
    return df[keep]


def is_relevant(row, min_likes, min_retweets):
    """Tell whether a tweet row has enough likes or retweets.

    Args:
        row: mapping-like object with ``likes`` and ``retweet_count`` entries;
            both are converted with ``float()``.
        min_likes: number of likes that must be strictly exceeded.
        min_retweets: number of retweets that must be strictly exceeded.

    Returns:
        True when likes > ``min_likes`` or retweets > ``min_retweets``.
        False when neither holds, or when a value is missing or cannot be
        converted or compared.
    """
    try:
        likes = float(row['likes'])
        retweets = float(row['retweet_count'])
        return bool(likes > min_likes or retweets > min_retweets)
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only data problems are treated as "not relevant". A bare `except:`
        # would also swallow KeyboardInterrupt and SystemExit, making a long
        # DataFrame.apply impossible to interrupt.
        return False


def filter_by_relevance(df, min_likes, min_retweets):
    """Keep only the rows that ``is_relevant`` accepts.

    Args:
        df: pandas DataFrame with ``likes`` and ``retweet_count`` columns.
        min_likes: likes threshold forwarded to ``is_relevant``.
        min_retweets: retweets threshold forwarded to ``is_relevant``.

    Returns:
        ``df`` indexed by the boolean mask produced by ``is_relevant``.
    """
    keep = df.apply(
        lambda row: is_relevant(row, min_likes=min_likes, min_retweets=min_retweets),
        axis=1,
    )
    return df[keep]


def preprocess_and_lemmatize(text):
    """Lower-case, tokenize, drop punctuation tokens and lemmatize ``text``.

    Reads the module-level ``punctuation`` set and ``lemmatizer`` instance,
    which must be defined before this function is called.

    Args:
        text: string to normalise.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # Tokenize the lower-cased text and skip tokens that are punctuation marks.
    words = (tok for tok in word_tokenize(text.lower()) if tok not in punctuation)
    # Lemmatize each remaining token and glue them back into one string.
    return ' '.join(lemmatizer.lemmatize(tok) for tok in words)

def clean_generated_tweets(tweets):
    """Extract the tweet bodies from generated text samples.

    Every ``tweet: <content>`` occurrence in each sample contributes one entry;
    the content runs to the end of its line and is stripped of surrounding
    whitespace.

    Args:
        tweets: iterable of generated text samples.

    Returns:
        A flat list with the extracted contents, in order of appearance.
    """
    return [
        found.strip()
        for sample in tweets
        for found in re.findall(r"tweet: (.+)", sample)
    ]

# Tweets filtering and processing

In [None]:
# Load the raw tweet datasets for each hashtag.
trump_df = pd.read_csv(f"{GOOGLE_DRIVE_BASE_DIR}/hashtag_donaldtrump.csv", sep=',', lineterminator='\n',
                       parse_dates=True, low_memory=False)
biden_df = pd.read_csv(f"{GOOGLE_DRIVE_BASE_DIR}/hashtag_joebiden.csv", sep=',', lineterminator='\n',
                       parse_dates=True, low_memory=False)

trump_tweets_count = len(trump_df)
biden_tweets_count = len(biden_df)

print(f'Total Trump Tweets: {trump_tweets_count}')
print(f'Total Biden Tweets: {biden_tweets_count}')

# Keep only the columns of interest
trump_df = get_selected_columns(trump_df, COLUMNS)
biden_df = get_selected_columns(biden_df, COLUMNS)

# Length filter: keep tweets of at least 50 characters. The .copy() makes each result an
# independent frame, so the column assignments below do not write into a slice of the
# previous frame (pandas SettingWithCopyWarning).
trump_df = trump_df[trump_df['tweet'].str.len() >= 50].copy()
biden_df = biden_df[biden_df['tweet'].str.len() >= 50].copy()

# Content cleaning
# Links
trump_df['tweet'] = trump_df['tweet'].replace(URL_REGEX, '', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace(URL_REGEX, '', regex=True)
# Reply / mention handles
trump_df['tweet'] = trump_df['tweet'].replace(TW_USERNAME_REGEX, '', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace(TW_USERNAME_REGEX, '', regex=True)
# HTML-escaped ampersands. regex=True is required here: without it Series.replace only
# substitutes cells whose whole value equals "&amp", so nothing inside a tweet is removed.
trump_df['tweet'] = trump_df['tweet'].replace(r"&amp;?", '', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace(r"&amp;?", '', regex=True)
# Hashtag symbol: drop the '#' character and keep the tag text. HASHTAG_REGEX is anchored
# with ^...$ and only matches a tweet that is nothing but one hashtag, so it cannot be
# used to strip the symbol from inside a tweet.
trump_df['tweet'] = trump_df['tweet'].replace('#', '', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace('#', '', regex=True)
# Extra whitespace: collapsed last so the gaps left by the removals above are normalised too
trump_df['tweet'] = trump_df['tweet'].replace(SPACES_REGEX, ' ', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace(SPACES_REGEX, ' ', regex=True)

# Relevance filter: show the engagement statistics first, then apply the thresholds
print('\nTrump tweets mean')
print(trump_df[['likes', 'retweet_count']].mean())
print('\nTrump tweets max')
print(trump_df[['likes', 'retweet_count']].max())

print('\nBiden tweets mean')
print(biden_df[['likes', 'retweet_count']].mean())
print('\nBiden tweets max')
print(biden_df[['likes', 'retweet_count']].max())

trump_df = filter_by_relevance(trump_df, min_likes=10, min_retweets=10)
biden_df = filter_by_relevance(biden_df, min_likes=10, min_retweets=10)

print(f"Filtered trump tweets: {len(trump_df)}")
print(f"Filtered biden tweets: {len(biden_df)}")

# Language filter: keep tweets detected as LANG
trump_df = filter_by_language(trump_df, lang=LANG)
biden_df = filter_by_language(biden_df, lang=LANG)

print(f"Filtered trump tweets: {len(trump_df)}")
print(f"Filtered biden tweets: {len(biden_df)}")

In [None]:
# Sentiment filter: keep only the tweets whose 'neg' score is above 0.3
trump_df = filter_by_sentiment(trump_df, feeling='neg', threshold=0.3)
biden_df = filter_by_sentiment(biden_df, feeling='neg', threshold=0.3)

# Report how many tweets are left in each dataset.
print(f"Filtered trump tweets: {len(trump_df)}")
print(f"Filtered biden tweets: {len(biden_df)}")

In [None]:
# Preview the first ten remaining tweets of each dataset as NumPy arrays.
print(trump_df['tweet'][:10].to_numpy())
print(biden_df['tweet'][:10].to_numpy())

# Wordcloud generation (dataset tweets)

In [None]:
# Download the NLTK resources used by the wordcloud cells: stop words, 'punkt' and WordNet.
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')

# Set of punctuation characters; read as a global by preprocess_and_lemmatize.
punctuation = set(string.punctuation)
# English stop words, passed to WordCloud in the cells below.
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance; read as a global by preprocess_and_lemmatize.
lemmatizer = WordNetLemmatizer()

## Trump Wordcloud

In [None]:
# Normalise every Trump tweet (lower-cased, punctuation tokens dropped, lemmatized).
preprocessed_trump_tweets = trump_df['tweet'].apply(preprocess_and_lemmatize)

# Concatenate all tweets into one space-separated string.
processed_trump_tweets = preprocessed_trump_tweets.str.cat(sep=' ')

# Build the word cloud with the English stop words passed in, then show it without axes.
wordcloud = WordCloud(stopwords=stop_words).generate(processed_trump_tweets)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Biden Wordcloud

In [None]:
# Normalise every Biden tweet (lower-cased, punctuation tokens dropped, lemmatized).
preprocessed_biden_tweets = biden_df['tweet'].apply(preprocess_and_lemmatize)

# Concatenate all tweets into one space-separated string.
processed_biden_tweets = preprocessed_biden_tweets.str.cat(sep=' ')

# Build the word cloud with the English stop words passed in, then show it without axes.
wordcloud = WordCloud(stopwords=stop_words).generate(processed_biden_tweets)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Post processing files generation

In [None]:
# Text file in the Drive project folder that receives the processed Trump tweets.
trump_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/trump_tweets.txt"

In [None]:
# Write the Trump tweets to the text file, one line each, prefixed with "tweet: ".
trump_tweets = trump_df["tweet"].tolist()
with open(trump_file_path, "w", encoding="utf-8") as file:
    for tweet in trump_tweets:
        file.write(f"tweet: {tweet}\n")

In [None]:
# Text file in the Drive project folder that receives the processed Biden tweets.
biden_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/biden_tweets.txt"

In [None]:
# Write the Biden tweets to the text file, one line each, prefixed with "tweet: ".
biden_tweets = biden_df["tweet"].tolist()
with open(biden_file_path, "w", encoding="utf-8") as file:
    for tweet in biden_tweets:
        file.write(f"tweet: {tweet}\n")

In [None]:
# Text file in the Drive project folder that receives the username/description pairs.
users_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/users.txt"

In [None]:
# Write one "username"/"description" pair per tweet row: Biden rows first, then Trump rows.
# NOTE(review): rows are tweets, so an author with several tweets is presumably written
# more than once — confirm that is intended.
with open(users_file_path, "w", encoding="utf-8") as file:
    for _, tweet in biden_df.iterrows():
        file.write(f"username: {tweet['user_screen_name']}\ndescription: {tweet['user_description']}\n")
    for _, tweet in trump_df.iterrows():
        file.write(f"username: {tweet['user_screen_name']}\ndescription: {tweet['user_description']}\n")

# Trump tweets generation

In [None]:
# Path of the Trump tweets text file, redefined here so this section does not depend on
# the file-generation cells having run.
trump_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/trump_tweets.txt"

In [None]:
# Start a TensorFlow session through gpt-2-simple.
sess = gpt2.start_tf_sess()

# Fine-tune the 124M model on the Trump tweets file for 100 steps under the run name
# 'trump_tweets'. restore_from='fresh' is passed, i.e. no earlier checkpoint of this run
# is resumed.
gpt2.finetune(sess, dataset=trump_file_path, model_name='124M', steps=100, restore_from='fresh',
              run_name='trump_tweets',
              print_every=10, sample_every=100, save_every=100)

In [None]:
# Copy the 'trump_tweets' run checkpoint to Google Drive.
gpt2.copy_checkpoint_to_gdrive(run_name='trump_tweets')

In [None]:
# Generate 5 samples in one batch (length=100, temperature=0.7) from the 'trump_tweets'
# run, starting from the given prefix, and get them back as a list instead of printing.
generated_trump_tweets = gpt2.generate(sess, length=100, temperature=0.7, nsamples=5, batch_size=5,
                                       return_as_list=True, run_name='trump_tweets',
                                       prefix="Generate a list of tweets.\n")

# Print every sample followed by blank lines.
for tweet in generated_trump_tweets:
    print(tweet + '\n\n')

## Run from checkpoint

In [None]:
# Bring the saved 'trump_tweets' checkpoint back from Google Drive.
gpt2.copy_checkpoint_from_gdrive(run_name='trump_tweets')

In [None]:
# Start a TensorFlow session to load the checkpoint into.
sess = gpt2.start_tf_sess()

In [None]:
# Load the 'trump_tweets' run into the session.
gpt2.load_gpt2(sess, run_name='trump_tweets')

In [None]:
# Generate 5 samples in one batch (length=100, temperature=0.7) from the loaded
# 'trump_tweets' run, starting from the given prefix, returned as a list.
generated_trump_tweets = gpt2.generate(sess, length=100, temperature=0.7, nsamples=5, batch_size=5,
                                       return_as_list=True, run_name='trump_tweets',
                                       prefix="Generate a list of tweets.\n")

# Print every sample followed by blank lines.
for tweet in generated_trump_tweets:
    print(tweet + '\n\n')

# Biden tweets generation

In [None]:
# Path of the Biden tweets text file, redefined here so this section does not depend on
# the file-generation cells having run.
biden_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/biden_tweets.txt"

In [None]:
# Start a TensorFlow session through gpt-2-simple.
# NOTE(review): the gpt-2-simple README advises restarting the Python session before
# fine-tuning on another dataset or loading another model — confirm this cell works in
# the same kernel as the Trump run.
sess = gpt2.start_tf_sess()

# Fine-tune the 124M model on the Biden tweets file for 100 steps under the run name
# 'biden_tweets'. restore_from='fresh' is passed, i.e. no earlier checkpoint of this run
# is resumed.
gpt2.finetune(sess, dataset=biden_file_path, model_name='124M', steps=100, restore_from='fresh',
              run_name='biden_tweets',
              print_every=10, sample_every=100, save_every=100)

In [None]:
# Copy the 'biden_tweets' run checkpoint to Google Drive.
gpt2.copy_checkpoint_to_gdrive(run_name='biden_tweets')

In [None]:
# Generate 5 samples in one batch (length=100, temperature=0.7) from the 'biden_tweets'
# run, starting from the given prefix, and get them back as a list instead of printing.
generated_biden_tweets = gpt2.generate(sess, length=100, temperature=0.7, nsamples=5, batch_size=5,
                                       return_as_list=True, run_name='biden_tweets',
                                       prefix="Generate a list of tweets.\n")

# Print every sample followed by blank lines.
for tweet in generated_biden_tweets:
    print(tweet + '\n\n')

## Run from checkpoint

In [None]:
# Bring the saved 'biden_tweets' checkpoint back from Google Drive.
gpt2.copy_checkpoint_from_gdrive(run_name='biden_tweets')

In [None]:
# Start a TensorFlow session to load the checkpoint into.
sess = gpt2.start_tf_sess()

In [None]:
# Load the 'biden_tweets' run into the session.
gpt2.load_gpt2(sess, run_name='biden_tweets')

In [None]:
# Generate 5 samples in one batch (length=100, temperature=0.7) from the loaded
# 'biden_tweets' run, starting from the given prefix, returned as a list.
generated_biden_tweets = gpt2.generate(sess, length=100, temperature=0.7, nsamples=5, batch_size=5,
                                       return_as_list=True, run_name='biden_tweets',
                                       prefix="Generate a list of tweets.\n")

# Print every sample followed by blank lines.
for tweet in generated_biden_tweets:
    print(tweet + '\n\n')

# Wordcloud generation (generated tweets)

## Trump Wordcloud

In [None]:
# Extract the tweet bodies from the generated samples and normalise each one.
clean_generated_trump_tweets = clean_generated_tweets(generated_trump_tweets)
preprocessed_generated_trump_tweets = [preprocess_and_lemmatize(tweet) for tweet in clean_generated_trump_tweets]

# Concatenate all generated tweets into one space-separated string.
processed_generated_trump_tweets = ' '.join(preprocessed_generated_trump_tweets)

# Build the word cloud with the English stop words passed in, then show it without axes.
wordcloud = WordCloud(stopwords=stop_words).generate(processed_generated_trump_tweets)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Biden Wordcloud

In [None]:
# Extract the tweet bodies from the generated samples and normalise each one.
clean_generated_biden_tweets = clean_generated_tweets(generated_biden_tweets)
preprocessed_generated_biden_tweets = [preprocess_and_lemmatize(tweet) for tweet in clean_generated_biden_tweets]

# Concatenate all generated tweets into one space-separated string.
processed_generated_biden_tweets = ' '.join(preprocessed_generated_biden_tweets)

# Build the word cloud with the English stop words passed in, then show it without axes.
wordcloud = WordCloud(stopwords=stop_words).generate(processed_generated_biden_tweets)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Users generation

In [None]:
# Path of the users text file, redefined here so this section does not depend on the
# file-generation cells having run.
users_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/users.txt"

In [None]:
# Start a TensorFlow session through gpt-2-simple.
# NOTE(review): the gpt-2-simple README advises restarting the Python session before
# fine-tuning on another dataset or loading another model — confirm this cell works in
# the same kernel as the earlier runs.
sess = gpt2.start_tf_sess()

# Fine-tune the 124M model on the users file for 100 steps under the run name 'users'.
# restore_from='fresh' is passed, i.e. no earlier checkpoint of this run is resumed.
gpt2.finetune(sess, dataset=users_file_path, model_name='124M', steps=100, restore_from='fresh', run_name='users',
              print_every=10, sample_every=100, save_every=100)

In [17]:
# Copy the 'users' run checkpoint to Google Drive.
gpt2.copy_checkpoint_to_gdrive(run_name='users')

In [None]:
# Generate 5 samples in one batch (length=100, temperature=0.7) from the 'users' run,
# starting from the given prefix, and get them back as a list instead of printing.
users = gpt2.generate(sess, length=100, temperature=0.7, nsamples=5, batch_size=5,
                      return_as_list=True, run_name='users',
                      prefix="Generate a list of usernames and descriptions.\n")

# Print every sample followed by blank lines.
for user in users:
    print(user + '\n\n')

## Run from checkpoint

In [3]:
# Bring the saved 'users' checkpoint back from Google Drive.
gpt2.copy_checkpoint_from_gdrive(run_name='users')

In [4]:
# Start a TensorFlow session to load the checkpoint into.
sess = gpt2.start_tf_sess()

In [None]:
# Load the 'users' run into the session. Written as a plain call: a trailing comma here
# would turn the statement into a one-element tuple that the notebook echoes as output.
gpt2.load_gpt2(sess, run_name='users')

In [None]:
# Generate 5 samples in one batch (length=100, temperature=0.7) from the loaded 'users'
# run, starting from the given prefix, returned as a list.
users = gpt2.generate(sess, length=100, temperature=0.7, nsamples=5, batch_size=5,
                      return_as_list=True, run_name='users',
                      prefix="Generate a list of usernames and descriptions.\n")

# Print every sample followed by blank lines.
for user in users:
    print(user + '\n\n')