# This notebook creates some basic visualizations of the data for whole discourse. The data is NOT related to any specific category.

In [43]:
import altair as alt
import pandas as pd
import numpy as np
import os
import IPython.display

def create_stacked_bar_chart(raw_metrics, subject, path_to_static_folder):
    # Define the metrics we are interested in
    metrics = ['likes_received', 'likes_given', 'topics_viewed', 'posts_read', 'days_visited', 'solutions']

    # Ensure 'username' is included when filtering
    filtered_metrics = raw_metrics.set_index(['user_id', 'username'])[metrics].loc[:, (raw_metrics[metrics] != 0).any()]

    # Transform data to long format
    long_df = filtered_metrics.reset_index().melt(id_vars=["user_id", "username"], var_name="metric", value_name="count")


    if not long_df.empty:
        # Create the Altair stacked bar chart
        chart = alt.Chart(long_df).mark_bar().encode(
            x=alt.X("count:Q", title="Total User Interactions", 
                    axis=alt.Axis(format="~s", titleFontSize=14)),  # Formatting x-axis ticks
            y=alt.Y("username:N", title="Users", sort="-x", 
                    axis=alt.Axis(titleFontSize=14)),  # Changed from user_id to username
            color=alt.Color("metric:N", title="Activity Type"),
            tooltip=["username", "metric", "count"]  # Changed tooltip from user_id to username
        ).properties(
            title=f"Most Active Users ({subject})",
            width=600,
            height=400
        )


        # Save as interactive HTML
        viz_folder = os.path.join(path_to_static_folder, "visualizations")  # Folder for visualizations
        os.makedirs(viz_folder, exist_ok=True)
        chart.save(f'{viz_folder}/most_active_users_{subject.lower().replace(" ","_")}.html')

        # Display in Jupyter Notebook
        IPython.display.display(chart)
    else:
        print("No non-zero metrics to display.")


In [37]:
path_to_static_folder = "../static"
full_path = "../data/course_excel_data_t1_2024/data_all_users.xlsx"

# Load the unnormalized scores
unnormalized_df = pd.read_excel(full_path, sheet_name="unnormalized_scores")

# Select top 10 users based on their total engagement score
top_10_users = unnormalized_df.head(10)
top_10_users

Unnamed: 0,user_id,likes_received,likes_given,topics_viewed,posts_read,days_visited,solutions,cheers,initial_score,z_score
0,24869,11,32,15654,68814,346,0,360,54800.0,139.34
1,13277,1,0,7281,38027,910,0,115,30203.6,76.78
2,27041,84,78,2674,15563,111,0,562,12300.6,31.24
3,26170,0,4,2055,14711,130,0,32,11221.5,28.49
4,22533,6,0,3026,12294,444,1,145,10176.3,25.83
5,25409,3,9,2229,12601,267,0,121,9940.6,25.23
6,26414,1,0,1808,12007,261,0,85,9337.1,23.7
7,25291,7,60,2298,11567,248,2,242,9287.9,23.57
8,25731,0,1,2224,11609,180,0,30,9151.2,23.23
9,25397,1,421,1525,10969,276,0,586,8784.4,22.29


In [38]:
id_username_mapping = pd.read_csv("../data/id_username_mapping.csv")
id_username_mapping.sample(5, random_state=42)

Unnamed: 0,user_id,username,name,email
15943,19538,be20b030,Sarthak Jain,be20b030@smail.iitm.ac.in
3836,4711,Anay,Anay,21f1005495@ds.study.iitm.ac.in
9162,12613,ayush_sharma001,Ayush Sharma,22f1001876@ds.study.iitm.ac.in
34769,38515,24f3002839,Saumya Radhanpara Jatinbhai,24f3002839@ds.study.iitm.ac.in
655,762,AVB,Boss veeraraghavan annapillai,21f1000044@ds.study.iitm.ac.in


In [42]:
top_10_users_2 = top_10_users.merge(id_username_mapping, on="user_id")
# print(top_10_users_2.drop(columns=["email"]).to_markdown())

In [44]:
# Generate Altair visualization
create_stacked_bar_chart(top_10_users_2, subject="All Users", path_to_static_folder=path_to_static_folder)

In [27]:
top_10_users_2.columns

Index(['user_id', 'likes_received', 'likes_given', 'topics_viewed',
       'posts_read', 'days_visited', 'solutions', 'cheers', 'initial_score',
       'z_score', 'username', 'name', 'email'],
      dtype='object')

In [36]:
top_10_users_2["username"].value_counts()

username
tajamul_tahseen    1
21f3000961         1
Aditi18            1
23f3003765         1
23f2003636         1
Name: count, dtype: int64

In [29]:
top_10_users_2["username"] = top_10_users_2["username"].astype(str)
top_10_users_2.dtypes

user_id             int64
likes_received      int64
likes_given         int64
topics_viewed       int64
posts_read          int64
days_visited        int64
solutions           int64
cheers              int64
initial_score     float64
z_score           float64
username           object
name               object
email              object
dtype: object