In [32]:
import requests
import time
import json
from typing import List
import pandas as pd

## Apify data procurement

In [2]:
class ApifyActorRunner:
    def __init__(self, token: str):
        self.base_url = "https://api.apify.com/v2"
        self.token = token

    def _headers(self):
        return {
            "Authorization": f"Bearer {self.token}",
            "Content-Type": "application/json",
        }

    def run_actor(self, actor_id: str, actor_input: dict) -> str:
        """Trigger the actor and return the runId."""
        url = f"{self.base_url}/acts/{actor_id}/runs"
        params = {"token": self.token}
        response = requests.post(
            url, headers=self._headers(), json=actor_input, params=params
        )
        response.raise_for_status()
        return response.json()["data"]["id"]

    def wait_for_run_to_finish(
        self, run_id: str, poll_interval: int = 5, timeout: int = 600
    ) -> str:
        """Poll for actor run status until it finishes, fails, or times out."""
        url = f"{self.base_url}/actor-runs/{run_id}"
        elapsed = 0
        while elapsed < timeout:
            response = requests.get(url, headers=self._headers())
            response.raise_for_status()
            status = response.json()["data"]["status"]
            if status in ["SUCCEEDED", "FAILED", "ABORTED", "TIMED-OUT"]:
                return status
            time.sleep(poll_interval)
            elapsed += poll_interval
        raise TimeoutError("Actor run timed out")

    def get_dataset_items(
        self, run_id: str, clean: bool = True, format: str = "json"
    ) -> list:
        """Fetch dataset items from completed actor run."""
        # First, get the dataset ID from the run
        url = f"{self.base_url}/actor-runs/{run_id}"
        response = requests.get(url, headers=self._headers())
        response.raise_for_status()
        dataset_id = response.json()["data"]["defaultDatasetId"]

        # Now fetch items from dataset
        dataset_url = f"{self.base_url}/datasets/{dataset_id}/items"
        params = {"clean": str(clean).lower(), "format": format}
        dataset_response = requests.get(
            dataset_url, headers=self._headers(), params=params
        )
        dataset_response.raise_for_status()
        return dataset_response.json()

    def run_actor_and_get_data(self, actor_id: str, actor_input: dict) -> list:
        """Full process: run, wait, and fetch data."""
        run_id = self.run_actor(actor_id, actor_input)
        status = self.wait_for_run_to_finish(run_id)
        if status != "SUCCEEDED":
            raise RuntimeError(f"Actor run did not succeed. Final status: {status}")
        return self.get_dataset_items(run_id)

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env in your project root
apify_token = os.getenv("APIFY_TOKEN")

runner = ApifyActorRunner(token=apify_token)

In [4]:
private_accounts = [
    "https://www.instagram.com/press.julian/",
    "https://www.instagram.com/ilonac.kneissler/",
    "https://www.instagram.com/ep3octaviars/",
    "https://www.instagram.com/ewe_michael/",
]

business_accounts = [
    "https://www.instagram.com/kfz_reifenservice_molfsee_/",
    "https://www.instagram.com/rogersonderegger/",
    "https://www.instagram.com/beton.gold.official/",
    "https://www.instagram.com/room.base24/",
    "https://www.instagram.com/grubenholz/",
    "https://www.instagram.com/seeluft.immobilien/",
    "https://www.instagram.com/mario.pahl.shabbygarten/",
]

In [5]:
def fetch_username_from_url(url: str) -> str:
    url = url.rstrip("/")
    return url.split("/")[-1]

In [6]:
actor_id = "apify~instagram-post-scraper"

user_names = [fetch_username_from_url(url=url) for url in private_accounts]

actor_input = {
    "resultsLimit": 20,
    "skipPinnedPosts": False,
    "username": None,
}

In [7]:
data_response = []

for username in user_names:
    actor_input["username"] = [username]
    try:
        data = runner.run_actor_and_get_data(actor_id, actor_input)
        print(f"Actor run succeeded for profile: {username}")
        data_response.append(data)
        time.sleep(5)
    except Exception as e:
        print(f"Error during actor run: {e}")

Actor run succeeded for profile: press.julian
Actor run succeeded for profile: ilonac.kneissler
Actor run succeeded for profile: ep3octaviars
Actor run succeeded for profile: ewe_michael


In [27]:
!pwd

/Users/kaushikdayalan/projects/streamlit_experiment/notebooks


In [29]:
len(data_response)

4

In [None]:
def raw_dump(username: str, data: List) -> None:
    with open(f"./data/raw_data/{username}_data.json", "w") as file:
        json.dump(data, file)


for username, data in zip(user_names, data_response):
    raw_dump(username=username, data=data)

In [68]:
def preprocess_reponses(data: List):
    processed_data = []

    for item in data:
        temp_data = {}
        temp_data["profile_url"] = item["inputUrl"]
        temp_data["profile_name"] = item["ownerUsername"]
        temp_data["media_type"] = item["type"]
        temp_data["post_url"] = item["url"]
        temp_data["caption"] = item["caption"]
        temp_data["comments_count"] = item["commentsCount"]
        temp_data["like_count"] = item["likesCount"]
        temp_data["post_timestamp"] = item["timestamp"]
        processed_data.append(temp_data)
    return processed_data

In [69]:
for username in user_names:
    with open(f"./data/raw_data/{username}_data.json", "r") as file:
        data = json.load(file)

    data = preprocess_reponses(data=data)
    df = pd.DataFrame(data)
    df.to_csv(f"./data/processed_data/{username}_data.csv", index=False)

## API data procurement

In [95]:
starter_url = "http://10.250.10.100"
base_post_url = "/instagramscraper"
download = "false"
endpoint = f"{starter_url}{base_post_url}/scraper/post_scraper"


public_usernames = [fetch_username_from_url(url=url) for url in business_accounts]

data_responses = []

for username in public_usernames:
    params = {"username": username, "download": download}
    response = requests.get(endpoint, params=params)
    followers_count = response.json()["business_discovery"]["followers_count"]
    data = pd.DataFrame(response.json()["business_discovery"]["media"]["data"])
    data["followers"] = followers_count
    data["engagement"] = data[["like_count", "comments_count"]].sum(axis=1)
    data["profile_url"] = f"https://www.instagram.com/{username}/"
    data["profile_name"] = username
    data = data.rename({"timestamp": "post_timestamp", "permalink": "post_url"}, axis=1)
    data = data[
        [
            "profile_url",
            "profile_name",
            "media_type",
            "post_url",
            "caption",
            "comments_count",
            "like_count",
            "post_timestamp",
        ]
    ]
    break
    # data.to_csv(f"./data/processed_data/{username}_data.csv", index=False)
    # time.sleep(5)

In [96]:
data

Unnamed: 0,profile_url,profile_name,media_type,post_url,caption,comments_count,like_count,post_timestamp
0,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/DEN5fqHAxfO/,Wir wünschen Euch einen guten Rutsch!🎊🎆🎈#molfs...,0,20,2024-12-30T21:12:48+0000
1,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/DD7m3BdgN3O/,"Liebe Kunden/liebe Kundinnen,\n\ndas Jahresend...",0,23,2024-12-23T18:43:38+0000
2,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/DA4AuBVgzl4/,"Ab wann Sommer­reifen, wann Winter­reifen?\n\n...",0,29,2024-10-08T19:38:27+0000
3,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/DA35-DyArqh/,Neue Änderung: Alpine - Symbol wird Pflicht\nA...,0,24,2024-10-08T18:39:28+0000
4,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/C5JvW6GIy0C/,Team Kfz&Reifenservice Molfsee wünscht frohe O...,1,39,2024-03-30T20:41:55+0000
5,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/C3ySCfzozBo/,Wir sind ein Familienbetrieb mit Perspektive!\...,1,36,2024-02-25T21:31:48+0000
6,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,VIDEO,https://www.instagram.com/reel/C1IPnBEAiAR/,"Liebe Kunden, \n\nwir bedanken uns sehr für Ih...",0,36,2023-12-21T20:39:49+0000
7,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/C00yIYwgQuu/,Wir sind ein Familienbetrieb mit Perspektive!\...,0,28,2023-12-14T07:16:31+0000
8,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/CsYjJn7IDy2/,Wir wünschen allen Vätern einen schönen Vatert...,0,36,2023-05-18T11:55:44+0000
9,https://www.instagram.com/kfz_reifenservice_mo...,kfz_reifenservice_molfsee_,IMAGE,https://www.instagram.com/p/Cm1F5GbDb5Z/,Wir wünschen Euch einen guten Rutsch!🎊🎆🎈,1,39,2022-12-31T10:49:41+0000


In [99]:
from glob import glob

files = glob("./data/processed_data/*.csv")

In [100]:
files

['./data/processed_data/seeluft.immobilien_data.csv',
 './data/processed_data/beton.gold.official_data.csv',
 './data/processed_data/kfz_reifenservice_molfsee__data.csv',
 './data/processed_data/ep3octaviars_data.csv',
 './data/processed_data/mario.pahl.shabbygarten_data.csv',
 './data/processed_data/ilonac.kneissler_data.csv',
 './data/processed_data/grubenholz_data.csv',
 './data/processed_data/rogersonderegger_data.csv',
 './data/processed_data/ewe_michael_data.csv',
 './data/processed_data/press.julian_data.csv',
 './data/processed_data/room.base24_data.csv']

In [121]:
merged_df = pd.concat([pd.read_csv(file) for file in files])

In [122]:
merged_df["like_count"] = merged_df["like_count"].fillna(0)
merged_df["comments_count"] = merged_df["comments_count"].fillna(0)
merged_df["engagement"] = merged_df[["like_count", "comments_count"]].sum(axis=1)

In [123]:
metrics_data = (
    merged_df.groupby("profile_name")[["like_count", "comments_count", "engagement"]]
    .sum()
    .reset_index()
    .sort_values("engagement", ascending=False)
)
metrics_data = metrics_data.rename(
    {
        "like_count": "total_likes",
        "comments_count": "total_comments",
        "engagement": "total_engagement",
    },
    axis=1,
)

In [124]:
posts_count = (
    merged_df.groupby("profile_name")
    .size()
    .reset_index()
    .rename({0: "total_posts"}, axis=1)
    .sort_values("total_posts", ascending=False)
)

In [125]:
kpi_data = metrics_data.merge(posts_count, on="profile_name", how="left")

In [126]:
kpi_data

Unnamed: 0,profile_name,total_likes,total_comments,total_engagement,total_posts
0,mario.pahl.shabbygarten,48298.0,1771,50069.0,500
1,beton.gold.official,17826.0,556,18382.0,431
2,grubenholz,13745.0,933,14678.0,138
3,ep3octaviars,2392.0,60,2452.0,8
4,rogersonderegger,1941.0,72,2013.0,103
5,press.julian,1847.0,66,1913.0,19
6,kfz_reifenservice_molfsee_,1785.0,18,1803.0,32
7,room.base24,635.0,1,636.0,24
8,seeluft.immobilien,259.0,44,303.0,82
9,international__vgartcurator,207.0,10,217.0,1


In [127]:
kpi_data.to_csv("./data/ranking_data/ranking_dataset.csv", index=False)

In [129]:
kpi_data.describe()

Unnamed: 0,total_likes,total_comments,total_engagement,total_posts
count,12.0,12.0,12.0,12.0
mean,7439.916667,297.333333,7737.25,114.5
std,14115.816966,545.356153,14645.684305,169.892212
min,160.0,1.0,188.0,1.0
25%,246.0,16.0,281.5,18.25
50%,1816.0,52.0,1858.0,28.0
75%,5230.25,193.0,5508.5,111.75
max,48298.0,1771.0,50069.0,500.0
