# In this notebook, I fetch the user_action data for every course category listed in all_category_ids.csv and save each course's data to its own Excel file.

In [6]:
import pandas as pd
import numpy as np
import time, json, requests, os, re

from global_functions_1 import sanitize_filepath

In [16]:
# Read the table of course categories from disk, then print its schema summary
df = pd.read_csv(filepath_or_buffer="../data/all_category_ids.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category_id  69 non-null     int64 
 1   name         69 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.2+ KB


In [17]:
# Show 10 rows picked at random; the fixed seed makes the preview repeatable
df.sample(n=10, random_state=0)

Unnamed: 0,category_id,name
63,102,Introduction to Natural Language Processing
34,65,Algorithmic Thinking in Bioinformatics
57,93,Game Theory and Strategy
66,105,Control Engineering
49,82,Operating Systems
43,74,Statistical Computing
7,25,Programming in Python
62,100,Deep Learning Practice
67,106,Digital System Design
41,72,Advanced Algorithms


In [11]:
def execute_query(query_id, query_params=None, timeout=60):
    """Run a Discourse group report page by page and collect the rows in a DataFrame.

    The report is requested with POST at
    ``{base_url}/g/{group}/reports/{query_id}/run``. Each request carries a
    ``page`` parameter (as a string, starting at "0") merged with
    ``query_params``; paging stops at the first response whose
    ``result_count`` is 0.

    Parameters
    ----------
    query_id : int or str
        Identifier of the report, interpolated into the request URL.
    query_params : dict, optional
        Extra report parameters. They are merged over the ``page`` entry, so
        a ``"page"`` key supplied here overrides the automatic page counter.
    timeout : float or tuple, optional
        Passed to ``requests`` for every call so that a stalled connection
        cannot block the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the response's
        ``columns`` list. Empty when nothing was fetched.

    Raises
    ------
    ValueError
        If ``query_params`` is given but is not a dictionary.
    RuntimeError
        If the ``API_KEY`` environment variable is missing or empty.

    Notes
    -----
    Request/parse failures are best-effort: the error is printed, paging
    stops, and the rows gathered so far are returned. Callers therefore
    cannot tell a partial result from a complete one by the return value.
    The loop only terminates on an empty page or an error, so the report is
    expected to honour the ``page`` parameter.
    """
    DISCOURSE_BASE_URL = "https://discourse.onlinedegree.iitm.ac.in"
    GROUP_NAME = "discourse_analytics"
    API_USERNAME = 'shubhamG'
    PAGE_DELAY_SECONDS = 1.4  # pause between pages; presumably to stay under API rate limits

    # Validate arguments before touching the network.
    if query_params is not None and not isinstance(query_params, dict):
        raise ValueError("Query parameters must be a dictionary.")

    # Fail fast on missing credentials instead of sending an unauthenticated
    # request and silently handing back an empty DataFrame.
    api_key = os.environ.get('API_KEY')
    if not api_key:
        raise RuntimeError(
            "API_KEY environment variable is not set; cannot authenticate with Discourse."
        )

    headers = {
        "Accept": "*/*",
        "Api-Key": api_key,
        "Api-Username": API_USERNAME,
        # NOTE(review): the body below is a raw `params=<json>` string, not a
        # real multipart document. Header kept unchanged to preserve the wire
        # format the original code used; TODO confirm what the server expects.
        "Content-Type": "multipart/form-data",
    }

    # The URL does not depend on the page, so build it once.
    request_url = f"{DISCOURSE_BASE_URL}/g/{GROUP_NAME}/reports/{query_id}/run"

    results_list = []
    page_number = 0

    while True:
        # `page` goes first so caller-supplied parameters are merged over it.
        payload = {'page': str(page_number)}
        payload.update(query_params or {})
        data_payload = 'params=' + json.dumps(payload)

        try:
            response = requests.post(
                request_url, data=data_payload, headers=headers, timeout=timeout
            )
            response.raise_for_status()  # Turn HTTP 4xx/5xx into exceptions
            json_response = response.json()

            # An empty page marks the end of the result set.
            if json_response["result_count"] == 0:
                break

            # Pair each row's values with the column names.
            for row_values in json_response['rows']:
                results_list.append(dict(zip(json_response['columns'], row_values)))

        except Exception as e:
            # Best-effort: report the problem and keep whatever was collected.
            print(f'error: {e}')
            break

        page_number += 1
        time.sleep(PAGE_DELAY_SECONDS)

    return pd.DataFrame(results_list)

In [45]:
# Export one Excel workbook of user-action rows per course category for the
# 01/01/2025 - 30/04/2025 window (report id 103).
course_specific_data_path = "../data/course_specific_data/t1_2025"
os.makedirs(course_specific_data_path, exist_ok=True)  # safe to re-run: existing folder is fine

for row in df.itertuples():
    # Bind a label before the try block so the error message below always names
    # the category being processed, even if sanitize_filepath itself fails
    # (otherwise the name would be unbound on the first row or stale afterwards).
    category_name = row.name
    try:
        category_id = row.category_id
        category_name = sanitize_filepath(row.name)  # Proper filenames without :," ", etc
        params = {"category_id": str(category_id), "start_date": "01/01/2025", "end_date": '30/04/2025'}
        user_actions_df = execute_query(103, query_params=params)
        user_actions_df.to_excel(
            os.path.join(course_specific_data_path, f"{category_name}.xlsx"),
            sheet_name="user_actions_data",
            index=False,
        )
    except Exception as error:
        # Best-effort export: report the failure and carry on with the next category.
        print(f"Error; {error} for subject: {category_name}")