In [None]:
import json
import os
import re
from datetime import datetime
from typing import List, Tuple, Union

import instaloader
import numpy as np
import pandas as pd
import requests
import torch
from dateutil import parser as dateparser
from newspaper import Article
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
import nltk
import time

from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; the analysis cells below add columns to filtered DataFrames.
pd.options.mode.chained_assignment = None

def download_profile(
    usernames: Union[str, List[str]], root: os.PathLike = "../data", **kwargs
):
    """
    Fetch the post metadata of one or more publicly-accessible profiles.

    Pictures, videos, video thumbnails, geotags and comments are all
    switched off; only the per-post metadata is saved, as uncompressed JSON.

    Parameters
    ----------
    usernames : str, List[str]
        A single username, or a list of usernames, to download
    root : str
        Folder under which one sub-folder per profile is created. The
        ``latest_timestamp.txt`` bookkeeping file is kept here as well.
    **kwargs
        Accepted for call-site compatibility; currently unused.
    """
    # A bare string is treated as a one-element list of usernames.
    targets = [usernames] if isinstance(usernames, str) else usernames

    metadata_only_loader = instaloader.Instaloader(
        dirname_pattern=os.path.join(root, "{profile}"),
        download_pictures=False,
        download_videos=False,
        download_video_thumbnails=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=True,
        compress_json=False,
    )

    profiles = []
    for name in targets:
        profiles.append(
            instaloader.Profile.from_username(metadata_only_loader.context, name)
        )

    stamps_path = os.path.join(root, "latest_timestamp.txt")
    metadata_only_loader.download_profiles(
        profiles,
        fast_update=True,
        profile_pic=False,
        igtv=False,
        latest_stamps=instaloader.LatestStamps(stamps_path),
        stories=False,
        highlights=False,
        tagged=False,
    )


class Vectorizer:
    """
    A class for vectorizing text inputs

    Supported backends (the ``how`` argument):

    * ``"tfidf"`` / ``"bow"`` : scikit-learn ``TfidfVectorizer`` / ``CountVectorizer``
    * ``"bert"``              : ``vinai/bertweet-base`` token embeddings
    * ``"roberta"``           : ``all-distilroberta-v1`` sentence embeddings
    """

    _UNSUPPORTED_MSG = "how must be either tfidf, bow, bert or roberta"

    def __init__(self, how: str = "tfidf", ngram_range: Tuple[int, int] = (1, 1)):
        """
        Initializes the vectorizer

        Parameters
        ----------
        how : str
            How to vectorize the content. Can be either "tfidf", "bow"
            (bag of words), "bert" or "roberta"
        ngram_range : Tuple[int, int]
            Range of ngrams to use for tfidf or count vectorization

        Raises
        ------
        NotImplementedError
            If ``how`` is not one of the supported backends
        """
        self.how = how
        self.ngram_range = ngram_range
        if self.how in ["tfidf", "bow"]:
            # Fitting reads documents from disk (input="filename"); fit() and
            # fit_transform() flip this to "content" afterwards so that
            # transform() can be fed raw strings.
            self.vectorizer = (
                TfidfVectorizer if self.how == "tfidf" else CountVectorizer
            )(
                input="filename",
                strip_accents="unicode",
                ngram_range=self.ngram_range,
            )
        elif self.how == "bert":
            self.vectorizer = AutoModel.from_pretrained("vinai/bertweet-base")
            self.tokenizer = AutoTokenizer.from_pretrained(
                "vinai/bertweet-base", use_fast=False
            )
        elif self.how == "roberta":
            self.vectorizer = SentenceTransformer("all-distilroberta-v1")
        else:
            raise NotImplementedError(self._UNSUPPORTED_MSG)
        self.trained = False

    @staticmethod
    def _read_texts(text_files: List[os.PathLike]) -> List[str]:
        """Return the UTF-8 contents of every file in ``text_files`` that exists."""
        contents = []
        for file_path in text_files:
            if not os.path.exists(file_path):
                continue
            # Context manager so the handle is closed right after reading.
            with open(file_path, "r", encoding="utf-8") as handle:
                contents.append(handle.read())
        return contents

    def fit_transform(
        self, text_files: List[os.PathLike], batch_size: int = 8
    ) -> np.ndarray:
        """
        Fits the vectorizer to the given text files, and returns the vectors

        Parameters
        ----------
        text_files : List[os.PathLike]
            List of paths to the text files to fit the vectorizer to
        batch_size : int
            Batch size for bert / roberta vectorization (ignored by tfidf/bow)

        Returns
        -------
        np.ndarray
            Array containing the vectors

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1
        NotImplementedError
            If ``self.how`` is not a supported backend
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        if self.how in ["tfidf", "bow"]:
            vectors = self.vectorizer.fit_transform(text_files).toarray()
            # Switch to raw-text input so transform() accepts strings.
            self.vectorizer.input = "content"
        elif self.how == "bert":
            all_embeddings = []

            for i in tqdm(
                range(0, len(text_files), batch_size), desc="Bert vectorization"
            ):
                batch_contents = self._read_texts(text_files[i : i + batch_size])
                if not batch_contents:
                    # Every file of this batch was missing; nothing to embed.
                    continue

                tokens = self.tokenizer(
                    batch_contents, padding=True, truncation=True, return_tensors="pt"
                )

                with torch.no_grad():
                    outputs = self.vectorizer(**tokens)
                    # One (tokens, hidden) array per document: these are
                    # per-token embeddings, not pooled sentence vectors.
                    all_embeddings.extend(
                        o.numpy() for o in outputs.last_hidden_state
                    )

            # NOTE(review): padding=True presumably pads each batch to its own
            # longest sequence, so documents from different batches may have
            # different lengths and fail to stack here -- confirm.
            vectors = np.array(all_embeddings)
        elif self.how == "roberta":
            vectors = self.vectorizer.encode(
                self._read_texts(text_files), batch_size=batch_size
            )
        else:
            raise NotImplementedError(self._UNSUPPORTED_MSG)

        self.trained = True
        return vectors

    def fit(self, text_files: List[os.PathLike]) -> None:
        """
        Fits the vectorizer to the given text files

        For the pretrained "bert" and "roberta" backends there is nothing to
        fit; the call only marks the vectorizer as trained.

        Parameters
        ----------
        text_files : List[os.PathLike]
            List of paths to the text files to fit the vectorizer to

        Raises
        ------
        NotImplementedError
            If ``self.how`` is not a supported backend
        """
        if self.how in ["tfidf", "bow"]:
            self.vectorizer.fit(text_files)
            # Switch to raw-text input so transform() accepts strings.
            self.vectorizer.input = "content"
        elif self.how in ["bert", "roberta"]:
            pass
        else:
            raise NotImplementedError(self._UNSUPPORTED_MSG)

        self.trained = True

    def transform(self, text: str) -> np.ndarray:
        """
        Transforms the given text into a vector

        Parameters
        ----------
        text : str
            Text to transform

        Returns
        -------
        np.ndarray
            Array containing the vector

        Raises
        ------
        ValueError
            If the vectorizer has not been fitted yet
        NotImplementedError
            If ``self.how`` is not a supported backend
        """
        if not self.trained:
            raise ValueError("Vectorizer must be trained first")

        if self.how in ["tfidf", "bow"]:
            return self.vectorizer.transform([text]).toarray()

        if self.how == "bert":
            # NOTE(review): max_length=1 with truncation keeps a single token,
            # which looks unintended -- left unchanged, confirm with the author.
            tokens = self.tokenizer(
                [text], padding='max_length', max_length=1, truncation=True, return_tensors="pt"
            )

            with torch.no_grad():
                outputs = self.vectorizer(**tokens)
                return outputs.last_hidden_state[0].numpy()

        if self.how == "roberta":
            return self.vectorizer.encode([text])

        # Previously fell through to an unbound local; fail explicitly instead.
        raise NotImplementedError(self._UNSUPPORTED_MSG)


def fit_vectorizer(
    username: str,
    root: str = "../data",
    how: str = "tfidf",
    ngram_range: Tuple[int, int] = (1, 1),
    fit_before: Union[str, datetime, None] = None,
    fit_after: datetime = datetime(year=2000, month=1, day=1),
    batch_size: int = 8,
) -> "Vectorizer":
    """
    Fits a Vectorizer on the captions of a given profile. Assumes the download
    has already been done, and the directory is full of posts. Directory
    is expected to contain a folder named after the profile, which contains
    the text files and json metadata files for each post.

    Only caption files named ``YYYY-MM-DD_HH-MM-SS_UTC.txt`` whose timestamp
    satisfies ``fit_after < timestamp <= fit_before`` are used.

    Parameters
    ----------
    username : str
        Username of the profile to vectorize
    root : str
        Path to the folder where the posts are stored
    how : str
        How to vectorize the content. Can be either "tfidf", "bow"
        (bag of words), "bert" or "roberta"
    ngram_range : Tuple[int, int]
        Range of ngrams to use for tfidf or count vectorization
    fit_before : Union[str, datetime, None]
        Latest post timestamp (inclusive) to fit on. Strings are parsed with
        dateutil (fuzzy). ``None`` (the default) means "now", evaluated at
        call time.
    fit_after : datetime
        Earliest post timestamp (exclusive) to fit on
    batch_size : int
        Currently unused; kept for call-site compatibility

    Returns
    -------
    Vectorizer
        The fitted vectorizer, ready to ``transform`` raw text
    """
    # Resolve the upper bound at call time rather than at import time.
    if fit_before is None:
        fit_before = datetime.today()
    elif isinstance(fit_before, str):
        fit_before = dateparser.parse(fit_before, fuzzy=True)

    profile_path = os.path.join(root, username)
    text_files = []
    for file in os.listdir(profile_path):
        # Caption files end in "UTC.txt" (the original check had a "UCT" typo
        # and therefore never matched anything).
        if not file.endswith("UTC.txt"):
            continue
        posted_at = datetime.strptime(file, "%Y-%m-%d_%H-%M-%S_UTC.txt")
        if fit_after < posted_at <= fit_before:
            text_files.append(os.path.join(profile_path, file))

    vectorizer = Vectorizer(how=how, ngram_range=ngram_range)
    # Vectorizer.fit leaves the vectorizer accepting raw text afterwards.
    vectorizer.fit(text_files)

    return vectorizer


def remove_control_characters(html: str) -> str:
    """
    Drop characters (and character references) that are not valid XML 1.0,
    since `lxml` cannot parse them.
    See: https://github.com/html5lib/html5lib-python/issues/96

    Valid characters according to the XML 1.0 spec:
    Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

    The complement of that range is what gets removed:
    InvalidChar ::= #xb | #xc | #xFFFE | #xFFFF | [#x0-#x8] | [#xe-#x1F] | [#xD800-#xDFFF]

    Sources:
    https://www.w3.org/TR/REC-xml/#charsets,
    https://lsimons.wordpress.com/2011/03/17/stripping-illegal-characters-out-of-xml-in-python/

    Parameters
    ----------
    html : str
        HTML string to clean

    Returns
    -------
    str
        Cleaned, ASCII-only HTML string; non-ASCII characters are returned as
        numeric character references
    """

    def is_invalid_codepoint(codepoint):
        # Numeric form of the "InvalidChar" production above.
        if codepoint in (0xB, 0xC, 0xFFFE, 0xFFFF):
            return True
        if 0x0 <= codepoint <= 0x8:
            return True
        if 0xE <= codepoint <= 0x1F:
            return True
        return 0xD800 <= codepoint <= 0xDFFF

    def reference_filter(base):
        # Build a re.sub callback: keep valid references, drop invalid ones.
        def _replace(match):
            if is_invalid_codepoint(int(match.group(1), base)):
                return ""
            return match.group(0)

        return _replace

    # Turn every non-ascii character into an XML char-ref first, so that
    # e.g. emojis survive on narrow-unicode builds of Python.
    cleaned = html.encode("ascii", "xmlcharrefreplace").decode("utf-8")

    # Decimal references first, then hexadecimal ones.
    reference_patterns = (
        (r"&#(\d+);?", 10),
        (r"&#[xX]([0-9a-fA-F]+);?", 16),
    )
    for pattern, base in reference_patterns:
        cleaned = re.sub(pattern, reference_filter(base), cleaned)

    # Finally remove any literal characters from the invalid range.
    invalid_literals = re.compile(
        r"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
    )
    return invalid_literals.sub("", cleaned)


def get_article(url: str, timeout: float = 10.0) -> str:
    """
    Obtains the likely text from an article based on the newspaper library

    The page is fetched with ``requests`` (browser-like headers), cleaned of
    characters that ``lxml`` cannot parse, and handed to ``newspaper`` for
    extraction. Fetching is best-effort: any request failure (HTTP error
    status, connection error, timeout, invalid URL, ...) yields an empty
    string so that one bad URL does not abort a bulk ``DataFrame.apply``.

    Parameters
    ----------
    url: str
        URL of the article to fetch
    timeout: float
        Seconds to wait for the server before giving up on the request

    Returns
    -------
    str
        Most likely article text, or "" if the page could not be fetched
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4644.45 Safari/537.36",
        "Connection": "keep-alive",
    }

    try:
        # Without a timeout requests may wait on an unresponsive host forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # HTTPError, ConnectionError, Timeout, ... all derive from this class.
        return ""

    article = Article(url)
    # Feed the already-downloaded (and sanitised) HTML instead of letting
    # newspaper download the page a second time.
    article.download(input_html=remove_control_characters(response.text))
    article.parse()
    return article.text


def get_trends(
    hl="en-US",
    geo="US",
    tz=360,
    count=20,
    date: Union[datetime, str, None] = None,
    timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Fetch Google Trends daily trends and the text of their related articles

    Parameters
    ----------
    hl : str
        Language
    geo : str
        Country
    tz : int
        Timezone
    count : int
        Number of results
    date : Union[datetime, str, None]
        Date to fetch data from. Can be either a datetime object or a string
        in the format YYYY-MM-DD (fuzzy parsing is enabled, but not recommended).
        ``None`` (the default) means today, evaluated at call time.
    timeout : float
        Seconds to wait for the Google Trends endpoint

    Returns
    -------
    pd.DataFrame
        One row per article listed under the day's trending searches, with an
        extra ``text`` column holding the extracted article text. An empty
        DataFrame is returned when the endpoint reports no content (HTTP 204)
        or lists no trends/articles.

    Raises
    ------
    requests.exceptions.HTTPError
        If the endpoint answers with an error status
    """
    # Resolve the date at call time rather than at import time.
    if date is None:
        date = datetime.today()
    elif isinstance(date, str):
        date = dateparser.parse(date, fuzzy=True)

    response = requests.get(
        "https://trends.google.com/trends/api/dailytrends",
        params={
            "hl": hl,
            "tz": tz,
            "ed": date.strftime("%Y%m%d"),
            "geo": geo,
            "ns": count,
        },
        timeout=timeout,
    )

    response.raise_for_status()
    if response.status_code == 204:
        # No content for that day: nothing to parse.
        return pd.DataFrame()

    # The JSON payload is preceded by a ")]}'," guard line; drop it.
    payload = response.text.split(")]}',\n")[1]
    days = json.loads(payload)["default"]["trendingSearchesDays"]
    trends = days[0]["trendingSearches"] if days else []

    article_frames = [pd.DataFrame(trend["articles"]) for trend in trends]
    if not article_frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    dfs = pd.concat(article_frames, ignore_index=True)
    if dfs.empty:
        return dfs

    dfs["text"] = dfs["url"].apply(get_article)

    return dfs


def read_metadata_json(fp: os.PathLike) -> dict:
    """
    Reads a json file containing the metadata of an Instagram post
    and returns a dictionary with the relevant information. Expects
    the file to be named as YYYY-MM-DD_HH-MM-SS_UTC.json, which is
    the default for instaloader.

    Parameters
    ----------
    fp : os.PathLike
        Path to the json file

    Returns
    -------
    dict
        Dictionary containing the relevant metadata, with keys ``dt``
        (timestamp parsed from the file name), ``likes``, ``comments``,
        ``caption`` ("" when the post has no caption), ``comments_disabled``,
        ``is_video`` and ``tagged_users``

    Raises
    ------
    ValueError
        If the file name does not match the expected timestamp pattern
    KeyError
        If the JSON document lacks one of the expected fields
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale;
    # the handle is only needed while parsing.
    with open(fp, "r", encoding="utf-8") as f:
        metadata = json.load(f)["node"]

    # The post timestamp is taken from the file name, not from the JSON body.
    dt = datetime.strptime(os.path.basename(fp), "%Y-%m-%d_%H-%M-%S_UTC.json")

    caption_edges = metadata["edge_media_to_caption"]["edges"]
    caption = caption_edges[0]["node"]["text"] if caption_edges else ""

    return {
        "dt": dt,
        "likes": metadata["edge_media_preview_like"]["count"],
        "comments": metadata["edge_media_to_comment"]["count"],
        "caption": caption,
        "comments_disabled": metadata["comments_disabled"],
        "is_video": metadata["is_video"],
        "tagged_users": metadata["edge_media_to_tagged_user"],
    }


def get_posts(
    username: str,
    root: os.PathLike = "../data",
    get_before: Union[str, datetime, None] = None,
    get_after: datetime = datetime(year=2000, month=1, day=1),
) -> pd.DataFrame:
    """
    Reads all the posts of a given profile and returns a DataFrame
    with the relevant information

    Only metadata files named ``YYYY-MM-DD_HH-MM-SS_UTC.json`` whose
    timestamp satisfies ``get_after < timestamp <= get_before`` are read.

    Parameters
    ----------
    username : str
        Username of the profile to read
    root : os.PathLike
        Path to the folder where the posts are stored
    get_before : Union[str, datetime, None]
        Latest post timestamp (inclusive). Strings are parsed with dateutil
        (fuzzy). ``None`` (the default) means "now", evaluated at call time.
    get_after : datetime
        Earliest post timestamp (exclusive)

    Returns
    -------
    pd.DataFrame
        DataFrame containing the relevant metadata, indexed by ``dt`` and
        sorted chronologically. When no post falls in the requested window an
        empty DataFrame with the same index name and columns is returned.
    """
    # Resolve the upper bound at call time rather than at import time.
    if get_before is None:
        get_before = datetime.today()
    elif isinstance(get_before, str):
        get_before = dateparser.parse(get_before, fuzzy=True)

    profile_path = os.path.join(root, username)
    json_files = []
    for file in os.listdir(profile_path):
        if not file.endswith("UTC.json"):
            continue
        posted_at = datetime.strptime(file, "%Y-%m-%d_%H-%M-%S_UTC.json")
        if get_after < posted_at <= get_before:
            json_files.append(os.path.join(profile_path, file))

    if not json_files:
        # set_index("dt") would raise KeyError on a column-less frame, so
        # build the empty result explicitly with the columns produced by
        # read_metadata_json.
        return pd.DataFrame(
            columns=[
                "likes",
                "comments",
                "caption",
                "comments_disabled",
                "is_video",
                "tagged_users",
            ],
            index=pd.DatetimeIndex([], name="dt"),
        )

    df = pd.DataFrame([read_metadata_json(path) for path in json_files])
    df.set_index("dt", inplace=True)
    df.sort_index(inplace=True)
    return df


def ema(data: pd.Series, alpha: float = 0.99) -> List[float]:
    """
    Calculates the exponential moving average of a given series

    The first value seeds the average; every following value ``v`` updates it
    as ``ema = (v - ema) * alpha + ema``, so an ``alpha`` close to 1 tracks
    the data closely and an ``alpha`` close to 0 smooths heavily.

    Parameters
    ----------
    data : pd.Series
        Series (or any iterable of numbers) to calculate the ema for
    alpha : float
        Alpha parameter for the ema calculation

    Returns
    -------
    List[float]
        Plain Python list with one ema value per input value, in input order
        (the input's index is not carried over)
    """
    smoothed = []
    current = None  # None until the first value has seeded the average

    for value in data:
        if current is None:
            current = value
        else:
            current = (value - current) * alpha + current
        smoothed.append(current)

    return smoothed

def calc_embedding(x: pd.DataFrame, how: str = 'mean') -> np.ndarray:
    """
    Collapses the per-post embeddings of a DataFrame into a single embedding

    Parameters
    ----------
    x : pd.DataFrame
        Frame with an ``embeddings`` column holding equally-shaped arrays.
        For ``how='weighted'`` it must also have ``likes`` and ``comments``
        columns.
    how : str
        ``'mean'`` for the plain average over posts, ``'weighted'`` for an
        average weighted by each post's ``likes + comments``

    Returns
    -------
    np.ndarray
        Average embedding, with the shape of a single entry of ``embeddings``

    Raises
    ------
    ValueError
        If ``how`` is neither ``'mean'`` nor ``'weighted'``
    """
    if how not in ('mean', 'weighted'):
        # Previously an unknown mode silently returned None.
        raise ValueError("how must be either 'mean' or 'weighted'")

    # Stack the per-post arrays along a new leading "post" axis.
    embeddings = np.array(x["embeddings"].to_list())

    if how == 'mean':
        return embeddings.mean(axis=0)

    # Engagement (likes + comments) of each post is its weight.
    weights = x[['likes', 'comments']].sum(axis=1)
    return np.average(embeddings, axis=0, weights=weights)


def jaccard_similarity(str1, str2):
    """
    Jaccard similarity between the lower-cased token sets of two strings

    Both strings are tokenized with ``nltk.word_tokenize``; the score is the
    number of shared distinct tokens divided by the number of distinct tokens
    overall.

    Parameters
    ----------
    str1 : str
        First text
    str2 : str
        Second text

    Returns
    -------
    float
        Score between 0 and 1; 0.0 when neither string yields any token
    """
    tokens_a = {*nltk.word_tokenize(str1.lower())}
    tokens_b = {*nltk.word_tokenize(str2.lower())}

    shared_count = len(tokens_a & tokens_b)
    total_count = len(tokens_a | tokens_b)

    # Guard against dividing by zero when both token sets are empty.
    if total_count == 0:
        return 0.0
    return shared_count / total_count

In [None]:
# Historical articles: parse the 'Date' column (stored as YYYY/MM/DD) and keep
# only the date and the article text.
df_pre_nov = pd.read_csv("../Historical_Articles.csv")
df_pre_nov = df_pre_nov.assign(
    Date=pd.to_datetime(df_pre_nov["Date"], format="%Y/%m/%d")
)[["Date", "article_content"]]

In [None]:
# Second article set: parse 'publishedAt', keep it together with the article
# text, and rename it to 'Date' so both sets share the same columns.
df_nov = pd.read_csv("../entire_nov.csv")
df_nov["publishedAt"] = pd.to_datetime(df_nov["publishedAt"])
df_nov = df_nov[["publishedAt", "article_content"]].rename(
    columns={"publishedAt": "Date"}
)

In [None]:
# Stack both article sets. ignore_index=True gives every row a unique label;
# without it both frames contribute their own 0..n labels and later
# label-based selection (.loc) would match rows from both frames.
google_trends = pd.concat([df_pre_nov, df_nov], ignore_index=True)

# Bring every timestamp to UTC, then drop the timezone so the column can be
# compared against the naive datetimes used for the evaluation periods.
google_trends['Date'] = pd.to_datetime(google_trends['Date'], utc=True)
google_trends['Date'] = google_trends['Date'].dt.tz_localize(None)

# Rows without a date or without article text cannot be used.
google_trends = google_trends.dropna()

In [None]:
# Experiment grid: for every (vectorizer, profile, similarity metric, articles
# per day, embedding aggregation, period) combination, build a profile
# embedding from the posts published before the period, pick the closest
# articles of each day inside the period, and measure the Jaccard overlap
# between those articles and the captions actually posted in the period.
# Results are checkpointed to FINAL_RESULTS.csv after every combination.
#
# NOTE(review): the vectorizer is refit and all embeddings are recomputed for
# every combination even though they only depend on METHOD, USERNAME and the
# period; kept as-is because the timings are part of the recorded results.
results = []

ROOT = "../data"
# NOTE(review): END parses to midnight, so posts/articles published later on
# the END date fall outside `<= END`, and the next period starts a day later.
PERIODS = [
    # Weeks of Sep
    ('09/01/2023', '09/08/2023'),
    ('09/09/2023', '09/16/2023'),
    ('09/17/2023', '09/24/2023'),
    ('09/25/2023', '09/30/2023'),

    # Weeks of Oct
    ('10/01/2023', '10/08/2023'),
    ('10/09/2023', '10/16/2023'),
    ('10/17/2023', '10/24/2023'),
    ('10/25/2023', '10/31/2023'),

    # Weeks of Nov
    ('11/01/2023', '11/08/2023'),
    ('11/09/2023', '11/16/2023'),
    ('11/17/2023', '11/24/2023'),
    ('11/25/2023', '11/30/2023'),

    # Bi-weeks of Sep
    ('09/01/2023', '09/15/2023'),
    ('09/16/2023', '09/30/2023'),

    # Bi-weeks of Oct
    ('10/01/2023', '10/15/2023'),
    ('10/16/2023', '10/31/2023'),

    # Bi-weeks of Nov
    ('11/01/2023', '11/15/2023'),
    ('11/16/2023', '11/30/2023'),
]
SIMILARITIES = {
    'cosine': cosine_similarity,
    'manhattan': manhattan_distances,
    'euclidean': euclidean_distances
}
# Entries of SIMILARITIES that are distances: a SMALLER value means a closer
# match, so the ranking direction must be flipped for them.
DISTANCE_METRICS = {'manhattan', 'euclidean'}

for METHOD in ['roberta', 'tfidf', 'bow']:
    for USERNAME in ['espn', 'pubitysport', 'enews', 'forbes', 'bloomberg']:
        for SIMILARITY in SIMILARITIES:
            for N_PER_DATE in [1, 5]:
                for EMBEDDING_METRIC in ['weighted', 'mean']:
                    for START, END in PERIODS:
                        print(USERNAME, METHOD, SIMILARITY, EMBEDDING_METRIC, N_PER_DATE, START, END)
                        try:
                            START, END = datetime.strptime(START, '%m/%d/%Y'), datetime.strptime(END, '%m/%d/%Y')
                            profile_path = os.path.join(ROOT, USERNAME)
                            # Captions published strictly before the period
                            # are the training corpus of the vectorizer.
                            text_files = [
                                os.path.join(profile_path, file)
                                for file in os.listdir(profile_path)
                                if file.endswith("UTC.txt")
                                and datetime(year=2000, month=1, day=1)
                                < datetime.strptime(os.path.basename(file), "%Y-%m-%d_%H-%M-%S_UTC.txt")
                                < START
                            ]

                            # Time the vectorizer fit
                            vectorizer = Vectorizer(how=METHOD)
                            start = time.perf_counter()
                            vectorizer.fit(text_files)
                            vectorization_time = time.perf_counter() - start

                            prev_posts = get_posts(USERNAME, ROOT, get_before=START)
                            posts = get_posts(USERNAME, ROOT, get_before=END, get_after=START)
                            # .copy() so the columns added below do not write
                            # into a slice of google_trends.
                            articles = google_trends[
                                (google_trends['Date'] >= START) & (google_trends['Date'] <= END)
                            ].copy()

                            if prev_posts.empty or articles.empty:
                                print("Failed to calculate Jaccard: no previous posts or no articles for this period")
                                continue

                            # Time the embedding of the previous captions
                            start = time.perf_counter()
                            tqdm.pandas(desc="Embed previous captions")
                            prev_posts["embeddings"] = prev_posts["caption"].progress_apply(vectorizer.transform)
                            embedding = calc_embedding(prev_posts, how=EMBEDDING_METRIC)
                            embed_captions_time = time.perf_counter() - start

                            # Time the embedding of the candidate articles
                            start = time.perf_counter()
                            tqdm.pandas(desc="Embed articles")
                            articles["embeddings"] = articles["article_content"].progress_apply(vectorizer.transform)
                            embed_articles_time = time.perf_counter() - start

                            similarity_fn = SIMILARITIES[SIMILARITY]
                            articles["similarity"] = articles["embeddings"].apply(
                                lambda x: similarity_fn(embedding, x)[0][0]
                            )

                            # Rank closest-first (ascending for distances,
                            # descending for similarities), then keep the
                            # N_PER_DATE best articles of each calendar day.
                            ranked = articles.sort_values(
                                "similarity",
                                ascending=SIMILARITY in DISTANCE_METRICS,
                                kind="stable",
                            )
                            # Group on the normalised date (time of day
                            # dropped) and by position, not by index label.
                            top_k = ranked.groupby(
                                ranked["Date"].dt.normalize().to_numpy()
                            ).head(N_PER_DATE)

                            article_text = top_k['article_content'].str.cat(sep=' ')
                            posts_text = posts['caption'].str.cat(sep=' ')
                            jaccard_score = jaccard_similarity(article_text, posts_text)

                            results.append({
                                "username": USERNAME,
                                "method": METHOD,
                                "n_per_date": N_PER_DATE,
                                "similarity": SIMILARITY,
                                "embedding_metric": EMBEDDING_METRIC,
                                "num_previous_posts": len(prev_posts),
                                "num_timeframe_posts": len(posts),
                                "num_articles": len(articles),
                                "start": START,
                                "end": END,
                                "vectorization_time": vectorization_time,
                                "embed_articles_time": embed_articles_time,
                                "embed_captions_time": embed_captions_time,
                                "jaccard": jaccard_score
                            })
                            # Checkpoint after every combination so a crash
                            # does not lose the results gathered so far.
                            pd.DataFrame(results).to_csv("FINAL_RESULTS.csv")
                            print(results[-1])
                        except Exception as E:
                            # Best-effort grid: report and move on.
                            print(f"Failed to calculate Jaccard: {E}")