# Synthetic data creation

In [41]:
import pandas as pd
import random
import csv
from faker import Faker

fake = Faker()

def load_firms_from_csv(firm_list_path) -> list:
    """Load every firm identifier from a CSV into one flat list.

    The CSV must provide the columns ``name``, ``altname``, ``abbreviation``,
    ``ticker`` and ``altticker``. Empty cells are skipped. Rows whose set of
    non-empty identifiers is an exact repeat of an earlier row are dropped.

    Args:
        firm_list_path: Path (or file-like object) accepted by
            ``pandas.read_csv``.

    Returns:
        A list of identifier strings in file order (row by row, column by
        column). Identifiers repeated across *different* rows are kept.

    Raises:
        KeyError: If any of the expected columns is missing.
    """
    columns = ['name', 'altname', 'abbreviation', 'ticker', 'altticker']

    # dtype=str keeps numeric-looking identifiers (e.g. "0700") as text instead
    # of letting pandas turn them into floats, which cannot be used as names.
    df = pd.read_csv(firm_list_path, dtype=str)
    rows = df[columns].fillna('').itertuples(index=False, name=None)

    firms = []
    seen_rows = set()
    for row in rows:
        identifiers = tuple(value for value in row if value != '')
        # Skip blank rows and exact duplicates; a seen-set (rather than
        # set()-based dedup) keeps the output order stable across runs.
        if not identifiers or identifiers in seen_rows:
            continue
        seen_rows.add(identifiers)
        firms.extend(identifiers)

    return firms

def read_tweet_templates(filepath:str) -> list:
    """Collect every non-empty cell of a CSV file as a tweet template.

    Args:
        filepath: Path to a UTF-8 encoded CSV file.

    Returns:
        A flat list of the non-empty cell values, in reading order.
    """
    templates = []
    with open(filepath, newline='', encoding='utf-8') as handle:
        for record in csv.reader(handle, quotechar='"'):
            # Empty cells (e.g. from trailing commas) are not templates.
            templates.extend(cell for cell in record if cell)

    return templates

def generate_synthetic_tweets(num_records: int, firms: list, tweet_templates: list):
    """Generate a DataFrame of synthetic tweets that mention random firms.

    For each record a firm and a template are drawn with ``random.choice`` and
    the firm is substituted into the template via ``str.format`` as its single
    positional argument.

    Args:
        num_records: Number of tweets to generate.
        firms: Non-empty list of firm identifiers to choose from.
        tweet_templates: Non-empty list of ``str.format`` templates.

    Returns:
        A DataFrame with the columns ``timestamp`` (ISO-formatted string from
        the module-level Faker instance) and ``tweet_content``. The columns
        are present even when ``num_records`` is 0.
    """
    # Fixing the columns keeps the schema stable for an empty result.
    columns = ["timestamp", "tweet_content"]

    records = []
    for _ in range(num_records):
        timestamp = fake.date_time_this_year().isoformat()
        firm = random.choice(firms)
        template = random.choice(tweet_templates)
        records.append({"timestamp": timestamp, "tweet_content": template.format(firm)})

    return pd.DataFrame(records, columns=columns)

In [42]:
# Load the two inputs: the tweet templates and the firm identifiers that get
# substituted into them.
tweet_templates = read_tweet_templates(filepath="tweet_templates.csv")
firms = load_firms_from_csv("firms.csv")

# Build the synthetic dataset of 2000 tweets.
df = generate_synthetic_tweets(
    2000,
    firms,
    tweet_templates,
)

In [43]:
# Write the generated tweets to disk. to_csv is called with its defaults, so
# the DataFrame's row index is written as an unnamed first column.
df.to_csv('synthetic.csv')