In [12]:
import os
from typing import List, Union, Tuple
from datetime import datetime, timedelta

import instaloader
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import json 
from itertools import takewhile, dropwhile
import requests
import torch
from transformers import AutoModel, AutoTokenizer
from dateutil import parser as dateparser
from newspaper import Article
import numpy as np
import pandas as pd

In [4]:
def download_profile(
    usernames: Union[str, List[str]], root: os.PathLike = "../data", **kwargs
):
    """
    Downloads all posts of a given publicly-accessible profile.
    Does not download images or videos, only metadata

    Parameters
    ----------
    usernames : str, List[str]
        Username(s) of the profile to download
    root : str
        Path to the folder where the posts will be stored
    """

    loader = instaloader.Instaloader(
        dirname_pattern=os.path.join(root, "{profile}"),
        download_pictures=False,
        download_videos=False,
        download_video_thumbnails=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=True,
        compress_json=False,
    )

    if isinstance(usernames, str):
        usernames = [usernames]

    profiles = [
        instaloader.Profile.from_username(loader.context, username)
        for username in usernames
    ]

    latest_stamps = instaloader.LatestStamps(os.path.join(root, "latest_timestamp.txt"))

    loader.download_profiles(
        profiles,
        fast_update=True,
        profile_pic=False,
        igtv=False,
        latest_stamps=latest_stamps,
        stories=False,
        highlights=False,
        tagged=False,
    )

In [5]:
download_profile("pubitysport")

[1/1] Downloading profile pubitysport
Retrieving posts from profile pubitysport.
[    4/12636] [SWIPE ➡ Barcelona's new money…] json 
[    5/12636] [Spurs' 24 minute implosion:  …] json 
[    6/12636] [𝐖𝐇𝐀𝐓 𝐀 𝐆𝐀𝐌𝐄 🤯🍿  — 𝗙𝗜𝗩𝗘 goals …] json 
[    7/12636] [AT LAST CHELSEA SCORE, NICHOL…] json 
[    8/12636] [SPURS ARE NOW DOWN TO 9 MEN. …] json 
[    9/12636] [ROMERO CONCEDES A PENALTY, GE…] json 
[   10/12636] [Tottenham Hotspur's Destiny U…] json 
[   11/12636] [CHELSEA GO 1-0 DOWN WITHIN TH…] json 


In [37]:
class Vectorizer:
    """
    A class for vectorizing text inputs
    """

    def __init__(self, how: str = "tfidf", ngram_range: Tuple[int, int] = (1, 1)):
        """
        Initializes the vectorizer

        Parameters
        ----------
        how : str
            How to vectorize the content. Can be either "tfidf", "bow" (bag of words), or "bert"
        ngram_range : Tuple[int, int]
            Range of ngrams to use for tfidf or count vectorization
        """
        self.how = how
        self.ngram_range = ngram_range
        if self.how in ["tfidf", "bow"]:
            self.vectorizer = (
                TfidfVectorizer
                if self.how == "tfidf"
                else CountVectorizer
            )(
                input="filename",
                strip_accents="unicode",
                ngram_range=self.ngram_range,
            )
        elif self.how == "bert":
            self.vectorizer = AutoModel.from_pretrained("vinai/bertweet-base")
            self.tokenizer = AutoTokenizer.from_pretrained(
                "vinai/bertweet-base", use_fast=False
            )
        else:
            raise NotImplementedError("how must be either tfidf, bow or bert")
        self.trained = False
        
    def fit_transform(self, text_files: List[os.PathLike], batch_size: int = 8) -> np.ndarray:
        """
        Fits the vectorizer to the given text files, and returns the vectors

        Parameters
        ----------
        text_files : List[os.PathLike]
            List of paths to the text files to fit the vectorizer to
        batch_size : int
            Batch size for bert vectorization

        Returns
        -------
        np.ndarray
            Array containing the vectors
        """
        if self.how in ["tfidf", "bow"]:
            vectors = self.vectorizer.fit_transform(text_files)
        elif self.how == "bert":
            all_embeddings = []

            for i in range(0, len(text_files), 8):
                print(text_files[i:i + 8])

                batch_contents = [open(file_path, 'r', encoding='utf-8').read() for file_path in text_files[i:i + 8] if os.path.exists(file_path)]

                tokens = self.tokenizer(batch_contents, padding=True, truncation=True, return_tensors="pt")

                with torch.no_grad():
                    outputs = self.vectorizer(**tokens)
                    embeddings = [o.numpy() for o in outputs.last_hidden_state]  # This contains the embeddings for each token in the input
                    all_embeddings.extend(embeddings)
            
            vectors = np.array(all_embeddings)
        else:
            raise NotImplementedError("how must be either tfidf, bow or bert")
        
        self.trained = True
        return vectors
    
    def transform(self, text: str) -> np.ndarray:
        """
        Transforms the given text into a vector

        Parameters
        ----------
        text : str
            Text to transform

        Returns
        -------
        np.ndarray
            Array containing the vector
        """
        if not self.trained:
            raise ValueError("Vectorizer must be trained first")
        if self.how in ["tfidf", "bow"]:
            vector = self.vectorizer.transform([text])
        elif self.how == "bert":
            tokens = self.tokenizer([text], padding=True, truncation=True, return_tensors="pt")

            with torch.no_grad():
                outputs = self.vectorizer(**tokens)
                vector = outputs.last_hidden_state[0].numpy()

        return vector

In [38]:
!pip install tqdm



['../data/pubitysport/2023-06-19_20-02-07_UTC.txt', '../data/pubitysport/2019-06-29_20-32-58_UTC.txt', '../data/pubitysport/2020-02-24_14-58-57_UTC.txt', '../data/pubitysport/2021-10-16_12-43-27_UTC.txt', '../data/pubitysport/2019-05-11_10-50-10_UTC.txt', '../data/pubitysport/2020-05-04_16-19-04_UTC.txt', '../data/pubitysport/2022-08-19_09-53-27_UTC.txt', '../data/pubitysport/2018-04-22_19-20-46_UTC.txt']
['../data/pubitysport/2019-09-06_18-43-47_UTC.txt', '../data/pubitysport/2019-07-22_11-21-36_UTC.txt', '../data/pubitysport/2023-03-06_17-36-27_UTC.txt', '../data/pubitysport/2020-10-04_19-14-11_UTC.txt', '../data/pubitysport/2022-06-14_10-30-45_UTC.txt', '../data/pubitysport/2021-12-03_22-25-20_UTC.txt', '../data/pubitysport/2019-03-23_12-13-24_UTC.txt', '../data/pubitysport/2023-06-14_19-02-45_UTC.txt']
['../data/pubitysport/2023-02-04_11-46-07_UTC.txt', '../data/pubitysport/2021-07-07_19-46-53_UTC.txt', '../data/pubitysport/2023-09-10_18-47-37_UTC.txt', '../data/pubitysport/2018-12

KeyboardInterrupt: 

In [36]:
all_embeddings[0].shape

(62, 768)

In [2]:
def fit_vectorizer(
    username: str,
    root: str = "../data",
    how: Union[str, "tfidf", "bow", "bert"] = "tfidf",
    ngram_range: Tuple[int, int] = (1, 1),
    fit_before: Union[str, datetime] = datetime.today()
) -> Tuple[np.ndarray, TfidfVectorizer]:
    """
    Vectorizes the content of a given profile. Assumes the download
    has already been done, and the directory is full of posts. Directory
    is expected to contain a folder named after the profile, which contains
    the text files and json metadata files for each post.

    Parameters
    ----------
    username : str
        Username of the profile to vectorize
    root : str
        Path to the folder where the posts are stored
    how : str
        How to vectorize the content. Can be either "tfidf", "bow" (bag of words), or "bert"
    return_vectorizer : bool
        Whether to return the vectorizer object or not
    ngram_range : Tuple[int, int]
        Range of ngrams to use for tfidf or count vectorization
    fit_before : Union[str, datetime]
        Date to fit the vectorizer before. Can be either a datetime object

    Returns
    -------
    Tuple[np.ndarray, Union[TfidfVectorizer, None]]
        Tuple containing the vectors and the vectorizer object if
        return_vectorizer is True, None otherwise
    """
    if how in ["tfidf", "bow"]:
        profile_path = os.path.join(root, username)
        text_files = [
            os.path.join(profile_path, file)
            for file in os.listdir(profile_path)
            if file.endswith(".txt")
            and datetime.fromtimestamp(os.path.getmtime(os.path.join(profile_path, file))) < fit_before
        ]
        vec_class = TfidfVectorizer if how == "tfidf" else CountVectorizer
        vectorizer = vec_class(input="filename", strip_accents="unicode", ngram_range=ngram_range)
        vectors = vectorizer.fit_transform(text_files)

    elif how == "bert":
        raise NotImplementedError("bert vectorization not implemented yet")
    else:
        raise NotImplementedError("how must be either tfidf or bert")

    # Necessary to work with raw text inputs after training on documents
    vectorizer.input = 'content'
    return vectors, vectorizer


In [3]:
def get_trends(hl='en-US', geo="US", tz=360, count=20, date: Union[datetime, str] = datetime.today()) -> pd.DataFrame:
    """
    Fetch Google Trends realtime data
    
    Parameters
    ----------
    hl : str
        Language
    geo : str
        Country
    tz : int
        Timezone
    count : int
        Number of results
    date : Union[datetime, str]
        Date to fetch data from. Can be either a datetime object or a string
        in the format YYYY-MM-DD (fuzzy parsing is enabled, but not recommended)
    
    Returns
    -------
    pd.DataFrame
        DataFrame containing the results
    """
    if isinstance(date, str):
        date = dateparser.parse(date, fuzzy=True)
    
    response = requests.get(
        "https://trends.google.com/trends/api/dailytrends",
        params={
            "hl": hl,
            "tz": tz,
            "ed": date.strftime("%Y%m%d"),
            "geo": geo,
            "ns": count,
        },
    )

    response.raise_for_status()
    if response.status_code != 204:
        data = response.text.split(")]}\',\n")[1]
        data = json.loads(data)["default"]["trendingSearchesDays"][0]["trendingSearches"]

    dfs = pd.concat([pd.DataFrame(trend['articles']) for trend in data], ignore_index=True)
           
    return dfs

In [4]:
def get_article(url: str):
    """
    Obtains the likely text from an article based on the newspaper library

    Parameters
    ----------
    url: str
        URL of the article to fetch

    Returns 
    -------
    str
        Most likely article text 
    """
    
    text = ""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4644.45 Safari/537.36",
            "Connection": "keep-alive",
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        article = Article(url)
        article.download(input_html=response.text)
        article.parse()
        text = article.text            

    except requests.exceptions.HTTPError as http_err:
        pass

    return text


In [5]:
def read_metadata_json(fp: os.PathLike):
    with open(fp, 'r') as f:
        metadata = json.loads(f.read())["node"]
    
        dt = datetime.strptime(os.path.basename(fp), "%Y-%m-%d_%H-%M-%S_UTC.json")
    
        clean_metadata = {
            "dt": dt,
            "likes": metadata["edge_media_preview_like"]["count"],
            "comments": metadata["edge_media_to_comment"]["count"],
            "caption": metadata["edge_media_to_caption"]["edges"][0]["node"]["text"] if metadata["edge_media_to_caption"]["edges"] else "",
            "comments_disabled": metadata["comments_disabled"],
            "is_video": metadata["is_video"],
            "tagged_users": metadata["edge_media_to_tagged_user"]
            
        }

        return clean_metadata

In [6]:
def get_posts(
        username: str,      
        root: os.PathLike = "../data",
    ) -> pd.DataFrame:
    """
    Reads all the posts of a given profile and returns a DataFrame
    with the relevant information

    Parameters
    ----------
    username : str
        Username of the profile to read
    root : os.PathLike
        Path to the folder where the posts are stored
    
    Returns
    -------
    pd.DataFrame
        DataFrame containing the relevant metadata
    """
    profile_path = os.path.join(root, username)
    json_files = [
        os.path.join(profile_path, file)
        for file in os.listdir(profile_path)
        if file.endswith("UTC.json")
    ]
    metadata = [read_metadata_json(file) for file in json_files]
    df = pd.DataFrame(metadata)
    df.set_index("dt", inplace=True)
    df.sort_index(inplace=True)
    return df

In [7]:
trends = get_trends()

In [8]:
trends

Unnamed: 0,title,timeAgo,source,image,url,snippet
0,**!!((S T R E A M))* Pittsburgh Steelers vs Te...,15h ago,Mageecourier-countynews,{'newsUrl': 'https://www.simpsoncounty.ms/node...,https://www.simpsoncounty.ms/node/286444,Category : Tennessee Titans vs. Pittsburgh Ste...
1,"Steelers shut door in fourth quarter, continue...",10h ago,ESPN,{'newsUrl': 'https://www.espn.com/nfl/story/_/...,https://www.espn.com/nfl/story/_/id/38804709/s...,Linebacker Kwon Alexander thwarts the Titans&#...
2,"Titans Fall 20-16 to Steelers, Drop to 3-5",9h ago,Tennessee Titans,{'newsUrl': 'https://www.tennesseetitans.com/n...,https://www.tennesseetitans.com/news/titans-fa...,The Steelers then took the lead 20-16 on a thr...
3,Titans vs. Steelers highlights: Pittsburgh win...,7h ago,FOXSports.com,{'newsUrl': 'https://www.foxsports.com/stories...,https://www.foxsports.com/stories/nfl/titans-v...,Week 9 of the NFL season begins Thursday with ...
4,Steelers and Kenny Pickett exorcise some offen...,3h ago,Yahoo Sports,{'newsUrl': 'https://sports.yahoo.com/steelers...,https://sports.yahoo.com/steelers-and-kenny-pi...,The calls for Matt Canada&#39;s job won&#39;t ...
...,...,...,...,...,...,...
126,"Hezbollah, Israel exchange fire as violence sp...",19h ago,Reuters,{'newsUrl': 'https://www.reuters.com/world/mid...,https://www.reuters.com/world/middle-east/leba...,Lebanon&#39;s Hezbollah said on Thursday it mo...
127,Lebanon-Israel border fighting picks up before...,18h ago,Aljazeera.com,{'newsUrl': 'https://www.aljazeera.com/news/20...,https://www.aljazeera.com/news/2023/11/2/leban...,Hezbollah chief Hassan Nasrallah is set to spe...
128,Hezbollah&#39;s Nasrallah to break silence as ...,4h ago,The Times of Israel,{'newsUrl': 'https://www.timesofisrael.com/hez...,https://www.timesofisrael.com/hezbollahs-nasra...,Lebanon&#39;s Hezbollah chief Hassan Nasrallah...
129,Israel-Hamas war live: Hezbollah chief Hassan ...,4h ago,Aljazeera.com,{'newsUrl': 'https://www.aljazeera.com/news/li...,https://www.aljazeera.com/news/liveblog/2023/1...,Israeli authorities send back thousands of Pal...


In [49]:
trends["text"] = trends["url"].apply(get_article)



In [40]:
vectors, vectorizer = fit_vectorizer("pubitysport", how="tfidf")

In [42]:
posts = get_posts("pubitysport")

In [43]:
posts['embedding'] = posts['caption'].apply(lambda x: vectorizer.transform([x]))

In [44]:
def ema_sparse_array(data, alpha):
    ema = []
    ema_value = None

    for value in data:
        if ema_value is None:
            ema_value = value
        else:
            ema_value = (value - ema_value) * alpha + ema_value
        ema.append(ema_value)

    return ema

In [45]:
posts['embedding_ema'] = ema_sparse_array(posts['embedding'], 0.99)

In [46]:
trends['embedding'] = trends['text'].apply(lambda x: vectorizer.transform([x]))

In [47]:
trends['score'] = trends['embedding'].apply(lambda x: cosine_similarity(x, posts['embedding_ema'].iloc[-1])[0][0])

In [48]:
trends.sort_values(by='score', ascending=False)

Unnamed: 0,title,timeAgo,source,image,url,snippet,text,embedding,score
57,World Series 2023: Live updates from Rangers p...,1h ago,The Dallas Morning News,{'newsUrl': 'https://www.dallasnews.com/news/2...,https://www.dallasnews.com/news/2023/11/03/tex...,A team of Dallas Morning News staffers will be...,The Texas Rangers have returned to their home ...,"(0, 15926)\t0.008895202478043075\n (0, 1591...",0.128359
95,"Yes, they&#39;ve already picked the Rockefelle...",23h ago,GPB,{'newsUrl': 'https://www.gpb.org/news/2023/11/...,https://www.gpb.org/news/2023/11/02/yes-theyve...,The Rockefeller Center Christmas tree is comin...,The Rockefeller Center Christmas tree is comin...,"(0, 15935)\t0.022177823251572158\n (0, 1592...",0.107895
56,City of Arlington to Host Texas Rangers World ...,16h ago,City of Arlington,{'newsUrl': 'https://www.arlingtontx.gov/news/...,https://www.arlingtontx.gov/news/my_arlington_...,Get the latest news delivered straight to your...,City of Arlington to Host Texas Rangers World ...,"(0, 15898)\t0.012901552571298495\n (0, 1589...",0.106151
52,Earth&#39;s new future is here in Kingdom of t...,23h ago,EW.com,{'newsUrl': 'https://ew.com/movies/kingdom-of-...,https://ew.com/movies/kingdom-of-the-planet-of...,"The war may be over, but the planet of the ape...","The war may be over, but the planet of the ape...","(0, 15928)\t0.03313274439783441\n (0, 15926...",0.104056
86,Tommy Tuberville&#39;s Military Blockade Is Th...,1h ago,Rolling Stone,{'newsUrl': 'https://www.rollingstone.com/poli...,https://www.rollingstone.com/politics/politics...,Is Tommy Tuberville the Most Ignorant Man in D...,Tommy Tuberville’s Republican colleagues had f...,"(0, 15988)\t0.0058343504296261344\n (0, 159...",0.102152
...,...,...,...,...,...,...,...,...,...
24,Fortnite OG: Drop Back into Battle Royale Chap...,7h ago,Epic Games,{'newsUrl': 'https://www.fortnite.com/news/for...,https://www.fortnite.com/news/fortnite-og-drop...,Drop in the original Island once again in Fort...,,,0.000000
25,Run It Back with the OG Pass and OG Shop in Fo...,7h ago,Epic Games,{'newsUrl': 'https://www.fortnite.com/news/run...,https://www.fortnite.com/news/run-it-back-with...,The time-traveling turbo Season Fortnite OG me...,,,0.000000
72,Duke Football Secures Bowl Eligibility With Wa...,3h ago,CalBearsMaven,{'newsUrl': 'https://www.si.com/college/duke/f...,https://www.si.com/college/duke/football/duke-...,Following Thursday night&#39;s battle-winning ...,,,0.000000
58,What to know about the Texas Rangers&#39; Worl...,21h ago,Axios,{'newsUrl': 'https://www.axios.com/local/dalla...,https://www.axios.com/local/dallas/2023/11/02/...,Rangers fans will finally get to experience so...,,,0.000000
