# Dependencies

In [None]:
!pip install numpy
!pip install pandas
!pip install plotly
!pip install pydantic
!pip install pyyaml
!pip install nltk
!pip install gpt-2-simple

In [None]:
!sudo apt install -y --no-install-recommends g++ protobuf-compiler libprotobuf-dev
!pip install gcld3

In [None]:
!nvidia-smi

# Imports

In [1]:
import gcld3
import pandas as pd
import nltk
import gpt_2_simple as gpt2
from nltk.sentiment import SentimentIntensityAnalyzer

In [None]:
# Fetch the 'vader_lexicon' NLTK resource (presumably the lexicon used by the
# SentimentIntensityAnalyzer imported above; confirm against the NLTK docs).
nltk.download('vader_lexicon')


In [None]:
# Download the "124M" GPT-2 model; the same model name is passed to the
# gpt2.finetune calls further down in this notebook.
gpt2.download_gpt2(model_name="124M")

# Constants

In [7]:
# Columns selected from the raw tweet CSVs via get_selected_columns.
COLUMNS = ['tweet', 'likes', 'retweet_count', 'user_screen_name', 'user_description', 'user_followers_count']
# Language code handed to filter_by_language.
LANG = 'en'
# '@' followed by 0 to 15 characters from [a-zA-Z0-9_]. Because the quantifier
# starts at 0, a bare '@' with no handle after it matches as well.
TW_USERNAME_REGEX = r"@[a-zA-Z0-9_]{0,15}"
# URL pattern bounded by \b on both sides. Its alternatives cover hosts that
# start with 'http://' / 'https://' (with or without 'www.') and hosts that
# start with 'www.', each followed by a dot and at least two non-whitespace
# characters.
URL_REGEX = r"\b(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][" \
            r"a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2," \
            r"}|www\.[a-zA-Z0-9]+\.[^\s]{2,})\b"
# One or more consecutive whitespace characters.
SPACES_REGEX = r"\s+"

In [None]:
# Folder used for every input CSV and every generated text file in this
# notebook (a Google Drive path under /content/drive).
GOOGLE_DRIVE_BASE_DIR = "/content/drive/MyDrive/ITBA/Quinto Año/Segundo Cuatrimestre/NLP/TP"
# Mount Google Drive through gpt_2_simple so the folder above is reachable
# (presumably mounted at /content/drive; confirm against gpt-2-simple docs).
gpt2.mount_gdrive()

# Helper functions

In [9]:
def get_selected_columns(df, columns):
    """Return the projection of ``df`` onto ``columns``.

    Args:
        df: Source DataFrame.
        columns: Column label(s) to keep; passed straight to ``df[...]``.

    Returns:
        Whatever ``df[columns]`` yields (a DataFrame for a list of labels,
        a Series for a single label).
    """
    selection = df[columns]
    return selection


def delete_hashtag_symbol(df):
    """Strip every '#' character from the 'tweet' column.

    The column is overwritten on ``df`` itself.

    Args:
        df: DataFrame with a 'tweet' column.

    Returns:
        The same DataFrame object that was passed in.
    """
    without_hash = df['tweet'].replace(to_replace='#', value='', regex=True)
    df['tweet'] = without_hash
    return df


def delete_twitter_username(df):
    """Remove every ``TW_USERNAME_REGEX`` match from the 'tweet' column.

    The column is overwritten on ``df`` itself.

    Args:
        df: DataFrame with a 'tweet' column.

    Returns:
        The same DataFrame object that was passed in.
    """
    without_handles = df['tweet'].replace(to_replace=TW_USERNAME_REGEX, value='', regex=True)
    df['tweet'] = without_handles
    return df


def delete_urls(df):
    """Remove every ``URL_REGEX`` match from the 'tweet' column.

    The column is overwritten on ``df`` itself.

    Args:
        df: DataFrame with a 'tweet' column.

    Returns:
        The same DataFrame object that was passed in.
    """
    without_urls = df['tweet'].replace(to_replace=URL_REGEX, value='', regex=True)
    df['tweet'] = without_urls
    return df


def delete_multiple_spaces(df):
    """Collapse each run of whitespace in the 'tweet' column to one space.

    The column is overwritten on ``df`` itself.

    Args:
        df: DataFrame with a 'tweet' column.

    Returns:
        The same DataFrame object that was passed in, matching the other
        ``delete_*`` helpers in this file.
    """
    # Replace with a single space, not '': an empty replacement would glue
    # neighbouring words together instead of normalising the spacing.
    df['tweet'] = df['tweet'].replace(SPACES_REGEX, ' ', regex=True)
    return df


def is_lang(row, detector, lang='en'):
    """Tell whether the detector reliably identifies the row's tweet as ``lang``.

    Args:
        row: Row-like object exposing the text under the ``'tweet'`` key.
        detector: Object with a ``FindLanguage(text=...)`` method whose
            result has ``language`` and ``is_reliable`` attributes.
        lang: Language code the prediction is compared against.

    Returns:
        True only when the predicted language equals ``lang`` and the
        prediction is flagged as reliable; False otherwise.
    """
    result = detector.FindLanguage(text=row['tweet'])
    return bool(result.language == lang and result.is_reliable)


def filter_by_language(df, lang='en'):
    """Keep only the rows whose tweet ``is_lang`` accepts for ``lang``.

    Args:
        df: DataFrame with a 'tweet' column.
        lang: Language code forwarded to ``is_lang``.

    Returns:
        ``df`` indexed by the per-row boolean mask.
    """
    # One identifier instance is shared by all rows.
    identifier = gcld3.NNetLanguageIdentifier(min_num_bytes=50, max_num_bytes=2048)
    keep = df.apply(lambda row: is_lang(row, detector=identifier, lang=lang), axis=1)
    return df[keep]


def is_negative(row, sia, threshold):
    """Tell whether the row's tweet has a 'neg' score above ``threshold``.

    Args:
        row: Row-like object exposing the text under the ``'tweet'`` key.
        sia: Object with a ``polarity_scores(text)`` method returning a
            mapping that contains a ``'neg'`` entry.
        threshold: Value the ``'neg'`` score must strictly exceed.

    Returns:
        True when the score is strictly greater than ``threshold``,
        False otherwise (including when they are equal).
    """
    neg_score = sia.polarity_scores(row['tweet'])['neg']
    return bool(neg_score > threshold)


def filter_by_sentiment(df, threshold=0.4):
    """Keep only the rows whose tweet ``is_negative`` accepts.

    Args:
        df: DataFrame with a 'tweet' column.
        threshold: Value forwarded to ``is_negative``; a row is kept when
            its 'neg' score is strictly greater. Defaults to 0.4, the
            value this function previously hard-coded.

    Returns:
        ``df`` indexed by the per-row boolean mask.
    """
    # One analyzer instance is shared by all rows.
    sia = SentimentIntensityAnalyzer()

    mask = df.apply(is_negative, axis=1, sia=sia, threshold=threshold)
    return df[mask]

# Tweets filtering and processing

In [None]:
trump_df = pd.read_csv(f"{GOOGLE_DRIVE_BASE_DIR}/hashtag_donaldtrump_short.csv", sep=',')
biden_df = pd.read_csv(f"{GOOGLE_DRIVE_BASE_DIR}/hashtag_joebiden_short.csv", sep=',')

# Keep only the columns of interest
trump_df = get_selected_columns(trump_df, COLUMNS)
biden_df = get_selected_columns(biden_df, COLUMNS)

# Length filter: keep tweets with at least 50 characters.
# .copy() detaches the filtered rows from the frame they were sliced from, so
# the column assignments below act on an independent DataFrame instead of
# triggering pandas' SettingWithCopyWarning.
trump_df = trump_df[trump_df['tweet'].str.len() >= 50].copy()
biden_df = biden_df[biden_df['tweet'].str.len() >= 50].copy()

# Content clean-up
# Links
trump_df['tweet'] = trump_df['tweet'].replace(URL_REGEX, '', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace(URL_REGEX, '', regex=True)
# Reply / mention handles (@user)
trump_df['tweet'] = trump_df['tweet'].replace(TW_USERNAME_REGEX, '', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace(TW_USERNAME_REGEX, '', regex=True)
# Extra whitespace (collapsed to a single space)
trump_df['tweet'] = trump_df['tweet'].replace(SPACES_REGEX, ' ', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace(SPACES_REGEX, ' ', regex=True)
# Hashtag symbol
trump_df['tweet'] = trump_df['tweet'].replace('#', '', regex=True)
biden_df['tweet'] = biden_df['tweet'].replace('#', '', regex=True)
# English language only
trump_df = filter_by_language(trump_df, lang=LANG)
biden_df = filter_by_language(biden_df, lang=LANG)

# Sentiment filter (currently disabled)
# trump_df = filter_by_sentiment(trump_df)
# biden_df = filter_by_sentiment(biden_df)

print(f"Filtered trump tweets: {len(trump_df)}\n")
print(f"Filtered biden tweets: {len(biden_df)}\n")

# Post processing files generation

In [None]:
# Text file that receives the cleaned Trump tweets; also used as the
# fine-tuning dataset for the 'trump_tweets' run.
trump_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/trump_tweets.txt"

In [None]:
# Dump the cleaned Trump tweets, one "tweet: <text>" line per tweet.
trump_tweets = trump_df["tweet"].tolist()
with open(trump_file_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(f"tweet: {text}\n" for text in trump_tweets)

In [None]:
# Text file that receives the cleaned Biden tweets; also used as the
# fine-tuning dataset for the 'biden_tweets' run.
biden_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/biden_tweets.txt"

In [None]:
# Dump the cleaned Biden tweets, one "tweet: <text>" line per tweet.
biden_tweets = biden_df["tweet"].tolist()
with open(biden_file_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(f"tweet: {text}\n" for text in biden_tweets)

In [None]:
# Text file that receives the username/description pairs; also used as the
# fine-tuning dataset for the 'users' run.
users_file_path = f"{GOOGLE_DRIVE_BASE_DIR}/users.txt"

In [None]:
# Write a "username: ..." line and a "description: ..." line per row,
# Biden frame first, then Trump frame.
# NOTE(review): a missing description would be formatted as the text "nan";
# confirm that is acceptable for the training file.
with open(users_file_path, "w", encoding="utf-8") as out_file:
    for frame in (biden_df, trump_df):
        for screen_name, description in zip(frame['user_screen_name'], frame['user_description']):
            out_file.write(f"username: {screen_name}\ndescription: {description}\n")

# Trump tweets generation

In [None]:
# Open a TensorFlow session through gpt_2_simple and fine-tune the "124M"
# model on the Trump tweets file for 100 steps under run name 'trump_tweets'.
# Progress is printed every 10 steps; sampling and saving are set to every
# 100 steps.
# restore_from='fresh' presumably starts from the base model rather than an
# earlier checkpoint of this run; confirm against the gpt-2-simple docs.
sess = gpt2.start_tf_sess()

gpt2.finetune(sess, dataset=trump_file_path, model_name='124M', steps=100, restore_from='fresh',
              run_name='trump_tweets',
              print_every=10, sample_every=100, save_every=100)

In [None]:
# Draw 5 samples (one batch of 5) from the 'trump_tweets' run.
# NOTE(review): the training file written by this notebook starts each record
# with "tweet: ", while this prompt is a free-form instruction; confirm the
# prefix is intended.
generated_tweets = gpt2.generate(
    sess,
    run_name='trump_tweets',
    prefix="Generate tweets with at least 20 words:",
    length=100,
    temperature=0.7,
    nsamples=5,
    batch_size=5,
    return_as_list=True,
)

# Print each sample followed by a visual separator.
for sample in generated_tweets:
    print(f"{sample}\n\n<SEPARATOR/>\n\n")

In [None]:
# Copy the 'trump_tweets' run checkpoint to Google Drive (per the function
# name; the destination layout is decided by gpt_2_simple).
gpt2.copy_checkpoint_to_gdrive(run_name='trump_tweets')

## Run from checkpoint

In [None]:
# Bring the 'trump_tweets' run checkpoint back from Google Drive (per the
# function name) so it can be loaded without fine-tuning again.
gpt2.copy_checkpoint_from_gdrive(run_name='trump_tweets')

In [None]:
# Open a TensorFlow session via gpt_2_simple; rebinds the notebook-wide `sess`.
sess = gpt2.start_tf_sess()

In [None]:
# Load the 'trump_tweets' run into the session opened above.
gpt2.load_gpt2(sess, run_name='trump_tweets')

In [None]:
# Draw 5 samples (one batch of 5) from the loaded 'trump_tweets' run.
# NOTE(review): the training file written by this notebook starts each record
# with "tweet: ", while this prompt is a free-form instruction; confirm the
# prefix is intended.
generated_tweets = gpt2.generate(
    sess,
    run_name='trump_tweets',
    prefix="Generate tweets with at least 20 words:",
    length=100,
    temperature=0.7,
    nsamples=5,
    batch_size=5,
    return_as_list=True,
)

# Print each sample followed by a visual separator.
for sample in generated_tweets:
    print(f"{sample}\n\n<SEPARATOR/>\n\n")

# Biden tweets generation

In [None]:
# Open a TensorFlow session through gpt_2_simple and fine-tune the "124M"
# model on the Biden tweets file for 100 steps under run name 'biden_tweets'.
# Progress is printed every 10 steps; sampling and saving are set to every
# 100 steps.
# NOTE(review): if another model was already built in this runtime, a fresh
# session alone may not be enough; gpt-2-simple presumably needs the runtime
# restarted or the session reset first. Confirm against its docs.
sess = gpt2.start_tf_sess()

gpt2.finetune(sess, dataset=biden_file_path, model_name='124M', steps=100, restore_from='fresh',
              run_name='biden_tweets',
              print_every=10, sample_every=100, save_every=100)

In [None]:
# Draw 5 samples (one batch of 5) from the 'biden_tweets' run.
# NOTE(review): the training file written by this notebook starts each record
# with "tweet: ", while this prompt is a free-form instruction; confirm the
# prefix is intended.
generated_tweets = gpt2.generate(
    sess,
    run_name='biden_tweets',
    prefix="Generate tweets with at least 20 words:",
    length=100,
    temperature=0.7,
    nsamples=5,
    batch_size=5,
    return_as_list=True,
)

# Print each sample followed by a visual separator.
for sample in generated_tweets:
    print(f"{sample}\n\n<SEPARATOR/>\n\n")

In [None]:
# Copy the 'biden_tweets' run checkpoint to Google Drive (per the function
# name; the destination layout is decided by gpt_2_simple).
gpt2.copy_checkpoint_to_gdrive(run_name='biden_tweets')

## Run from checkpoint

In [None]:
# Bring the 'biden_tweets' run checkpoint back from Google Drive (per the
# function name) so it can be loaded without fine-tuning again.
gpt2.copy_checkpoint_from_gdrive(run_name='biden_tweets')

In [None]:
# Open a TensorFlow session via gpt_2_simple; rebinds the notebook-wide `sess`.
sess = gpt2.start_tf_sess()

In [None]:
# Load the 'biden_tweets' run into the session opened above.
gpt2.load_gpt2(sess, run_name='biden_tweets')

In [None]:
# Draw 5 samples (one batch of 5) from the loaded 'biden_tweets' run.
# NOTE(review): the training file written by this notebook starts each record
# with "tweet: ", while this prompt is a free-form instruction; confirm the
# prefix is intended.
generated_tweets = gpt2.generate(
    sess,
    run_name='biden_tweets',
    prefix="Generate tweets with at least 20 words:",
    length=100,
    temperature=0.7,
    nsamples=5,
    batch_size=5,
    return_as_list=True,
)

# Print each sample followed by a visual separator.
for sample in generated_tweets:
    print(f"{sample}\n\n<SEPARATOR/>\n\n")

# Users generation

In [None]:
# Open a TensorFlow session through gpt_2_simple and fine-tune the "124M"
# model on the users file for 100 steps under run name 'users'.
# Progress is printed every 10 steps; sampling and saving are set to every
# 100 steps.
# NOTE(review): if another model was already built in this runtime, a fresh
# session alone may not be enough; gpt-2-simple presumably needs the runtime
# restarted or the session reset first. Confirm against its docs.
sess = gpt2.start_tf_sess()

gpt2.finetune(sess, dataset=users_file_path, model_name='124M', steps=100, restore_from='fresh', run_name='users',
              print_every=10, sample_every=100, save_every=100)

In [None]:
# Draw 5 samples (one batch of 5) from the 'users' run.
# NOTE(review): the training file written by this notebook starts each record
# with "username: ", while this prompt is a free-form instruction; confirm
# the prefix is intended.
users = gpt2.generate(
    sess,
    run_name='users',
    prefix="Generate a list of usernames and descriptions:",
    length=100,
    temperature=0.7,
    nsamples=5,
    batch_size=5,
    return_as_list=True,
)

# Print each sample followed by a visual separator.
for generated_user in users:
    print(f"{generated_user}\n\n<SEPARATOR/>\n\n")

In [17]:
# Copy the 'users' run checkpoint to Google Drive (per the function name;
# the destination layout is decided by gpt_2_simple).
gpt2.copy_checkpoint_to_gdrive(run_name='users')

## Run from checkpoint

In [3]:
# Bring the 'users' run checkpoint back from Google Drive (per the function
# name) so it can be loaded without fine-tuning again.
gpt2.copy_checkpoint_from_gdrive(run_name='users')

In [4]:
# Open a TensorFlow session via gpt_2_simple; rebinds the notebook-wide `sess`.
sess = gpt2.start_tf_sess()

In [None]:
# Load the 'users' run into the session opened above.
# No trailing comma here: it would turn the statement into a one-element
# tuple expression, which the notebook then echoes as cell output.
gpt2.load_gpt2(sess, run_name='users')

In [None]:
users = gpt2.generate(sess, length=100, temperature=0.7, nsamples=5, batch_size=5,
                                  return_as_list=True, run_name='users', prefix="Generate a list of usernames and descriptions:")

for user in users:
    print(user+'\n\n<SEPARATOR/>\n\n')