In [1]:
import requests
import time
import json
from typing import List
import pandas as pd

## Apify data procurement

In [2]:
class ApifyActorRunner:
    def __init__(self, token: str):
        self.base_url = "https://api.apify.com/v2"
        self.token = token

    def _headers(self):
        return {
            "Authorization": f"Bearer {self.token}",
            "Content-Type": "application/json",
        }

    def run_actor(self, actor_id: str, actor_input: dict) -> str:
        """Trigger the actor and return the runId."""
        url = f"{self.base_url}/acts/{actor_id}/runs"
        params = {"token": self.token}
        response = requests.post(
            url, headers=self._headers(), json=actor_input, params=params
        )
        response.raise_for_status()
        return response.json()["data"]["id"]

    def wait_for_run_to_finish(
        self, run_id: str, poll_interval: int = 5, timeout: int = 600
    ) -> str:
        """Poll for actor run status until it finishes, fails, or times out."""
        url = f"{self.base_url}/actor-runs/{run_id}"
        elapsed = 0
        while elapsed < timeout:
            response = requests.get(url, headers=self._headers())
            response.raise_for_status()
            status = response.json()["data"]["status"]
            if status in ["SUCCEEDED", "FAILED", "ABORTED", "TIMED-OUT"]:
                return status
            time.sleep(poll_interval)
            elapsed += poll_interval
        raise TimeoutError("Actor run timed out")

    def get_dataset_items(
        self, run_id: str, clean: bool = True, format: str = "json"
    ) -> list:
        """Fetch dataset items from completed actor run."""
        # First, get the dataset ID from the run
        url = f"{self.base_url}/actor-runs/{run_id}"
        response = requests.get(url, headers=self._headers())
        response.raise_for_status()
        dataset_id = response.json()["data"]["defaultDatasetId"]

        # Now fetch items from dataset
        dataset_url = f"{self.base_url}/datasets/{dataset_id}/items"
        params = {"clean": str(clean).lower(), "format": format}
        dataset_response = requests.get(
            dataset_url, headers=self._headers(), params=params
        )
        dataset_response.raise_for_status()
        return dataset_response.json()

    def run_actor_and_get_data(self, actor_id: str, actor_input: dict) -> list:
        """Full process: run, wait, and fetch data."""
        run_id = self.run_actor(actor_id, actor_input)
        status = self.wait_for_run_to_finish(run_id)
        if status != "SUCCEEDED":
            raise RuntimeError(f"Actor run did not succeed. Final status: {status}")
        return self.get_dataset_items(run_id)

In [3]:
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env in your project root
apify_token = os.getenv("APIFY_TOKEN")

runner = ApifyActorRunner(token=apify_token)

In [5]:
private_accounts = ["https://www.instagram.com/tanja.beyrau/"]

business_accounts = [
    "https://www.instagram.com/markushlubek.immobilien/",
    "https://www.instagram.com/celinevenhofen.immobilien/",
    "https://www.instagram.com/ekincemdurmaz.immobilien/",
    "https://www.instagram.com/immotraum_westkueste/",
]

In [6]:
def fetch_username_from_url(url: str) -> str:
    url = url.rstrip("/")
    return url.split("/")[-1]

In [7]:
actor_id = "apify~instagram-post-scraper"

user_names = [fetch_username_from_url(url=url) for url in private_accounts]

actor_input = {
    "resultsLimit": 20,
    "skipPinnedPosts": False,
    "username": None,
}

In [8]:
data_response = []

for username in user_names:
    actor_input["username"] = [username]
    try:
        data = runner.run_actor_and_get_data(actor_id, actor_input)
        print(f"Actor run succeeded for profile: {username}")
        data_response.append(data)
        time.sleep(5)
    except Exception as e:
        print(f"Error during actor run: {e}")

Actor run succeeded for profile: tanja.beyrau


In [27]:
!pwd

/Users/kaushikdayalan/projects/streamlit_experiment/notebooks


In [9]:
len(data_response)

1

In [10]:
def raw_dump(username: str, data: List) -> None:
    with open(f"./data/raw_data/{username}_data.json", "w") as file:
        json.dump(data, file)


for username, data in zip(user_names, data_response):
    raw_dump(username=username, data=data)

In [11]:
def preprocess_reponses(data: List):
    processed_data = []

    for item in data:
        temp_data = {}
        temp_data["profile_url"] = item["inputUrl"]
        temp_data["profile_name"] = item["ownerUsername"]
        temp_data["media_type"] = item["type"]
        temp_data["post_url"] = item["url"]
        temp_data["caption"] = item["caption"]
        temp_data["comments_count"] = item["commentsCount"]
        temp_data["like_count"] = item["likesCount"]
        temp_data["post_timestamp"] = item["timestamp"]
        processed_data.append(temp_data)
    return processed_data

In [12]:
for username in user_names:
    with open(f"./data/raw_data/{username}_data.json", "r") as file:
        data = json.load(file)

    data = preprocess_reponses(data=data)
    df = pd.DataFrame(data)
    df.to_csv(f"./data/processed_data/{username}_data.csv", index=False)

## API data procurement

In [13]:
starter_url = "http://10.250.10.100"
base_post_url = "/instagramscraper"
download = "false"
endpoint = f"{starter_url}{base_post_url}/scraper/post_scraper"


public_usernames = [fetch_username_from_url(url=url) for url in business_accounts]

data_responses = []

for username in public_usernames:
    params = {"username": username, "download": download}
    response = requests.get(endpoint, params=params)
    followers_count = response.json()["business_discovery"]["followers_count"]
    data = pd.DataFrame(response.json()["business_discovery"]["media"]["data"])
    data["followers"] = followers_count
    data["engagement"] = data[["like_count", "comments_count"]].sum(axis=1)
    data["profile_url"] = f"https://www.instagram.com/{username}/"
    data["profile_name"] = username
    data = data.rename({"timestamp": "post_timestamp", "permalink": "post_url"}, axis=1)
    data = data[
        [
            "profile_url",
            "profile_name",
            "media_type",
            "post_url",
            "caption",
            "comments_count",
            "like_count",
            "post_timestamp",
        ]
    ]

    data.to_csv(f"./data/processed_data/{username}_data.csv", index=False)
    time.sleep(5)

In [14]:
from glob import glob

files = glob("./data/processed_data/*.csv")

In [15]:
files

['./data/processed_data/immotraum_westkueste_data.csv',
 './data/processed_data/ekincemdurmaz.immobilien_data.csv',
 './data/processed_data/celinevenhofen.immobilien_data.csv',
 './data/processed_data/markushlubek.immobilien_data.csv',
 './data/processed_data/tanja.beyrau_data.csv']

In [16]:
merged_df = pd.concat([pd.read_csv(file) for file in files])

In [17]:
merged_df["like_count"] = merged_df["like_count"].fillna(0)
merged_df["comments_count"] = merged_df["comments_count"].fillna(0)
merged_df["engagement"] = merged_df[["like_count", "comments_count"]].sum(axis=1)

In [18]:
metrics_data = (
    merged_df.groupby("profile_name")[["like_count", "comments_count", "engagement"]]
    .sum()
    .reset_index()
    .sort_values("engagement", ascending=False)
)
metrics_data = metrics_data.rename(
    {
        "like_count": "total_likes",
        "comments_count": "total_comments",
        "engagement": "total_engagement",
    },
    axis=1,
)

In [19]:
posts_count = (
    merged_df.groupby("profile_name")
    .size()
    .reset_index()
    .rename({0: "total_posts"}, axis=1)
    .sort_values("total_posts", ascending=False)
)

In [20]:
kpi_data = metrics_data.merge(posts_count, on="profile_name", how="left")

In [21]:
kpi_data

Unnamed: 0,profile_name,total_likes,total_comments,total_engagement,total_posts
0,celinevenhofen.immobilien,4739.0,204,4943.0,162
1,ekincemdurmaz.immobilien,1621.0,34,1655.0,49
2,markushlubek.immobilien,982.0,42,1024.0,43
3,immotraum_westkueste,149.0,4,153.0,15
4,finanzexperten.im.norden,137.0,0,137.0,1


In [22]:
kpi_data.to_csv("./data/ranking_data/ranking_dataset.csv", index=False)

In [23]:
kpi_data.describe()

Unnamed: 0,total_likes,total_comments,total_engagement,total_posts
count,5.0,5.0,5.0,5.0
mean,1525.6,56.8,1582.4,54.0
std,1900.90631,84.292348,1983.877466,63.52165
min,137.0,0.0,137.0,1.0
25%,149.0,4.0,153.0,15.0
50%,982.0,34.0,1024.0,43.0
75%,1621.0,42.0,1655.0,49.0
max,4739.0,204.0,4943.0,162.0
