Github Analysis!

In [1]:
from spark_utils import quick_start
spark = quick_start("TestConnection")

🚀 Creating Spark session: TestConnection
📡 Connecting to: spark://spark-master:7077
🗄️  MinIO endpoint: http://minio:9000
🔺 Delta Lake: ENABLED


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/02 01:51:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/02 01:51:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


✅ Spark 4.0.0 session created successfully!
🔗 Spark UI: http://localhost:4041
💡 S3A ready for s3a://delta-lake/ operations
🔺 Delta Lake ready for delta table operations!


In [2]:
# Load your data
dfalltime = spark.read.format("delta").load("s3a://delta-lake/analytics/analytics_github_technology_trends_alltime")


25/07/02 01:51:17 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [3]:
dfalltime.schema

StructType([StructField('analysis_date', StringType(), False), StructField('data_start_date', StringType(), False), StructField('data_end_date', StringType(), False), StructField('total_days_analyzed', IntegerType(), False), StructField('technology', StringType(), False), StructField('technology_category', StringType(), False), StructField('total_mentions_alltime', LongType(), False), StructField('avg_daily_mentions_alltime', DoubleType(), False), StructField('peak_day_mentions_alltime', LongType(), False), StructField('days_with_activity_alltime', IntegerType(), False), StructField('mention_rank_alltime', IntegerType(), False), StructField('category_rank_alltime', IntegerType(), False), StructField('data_completeness_alltime', DoubleType(), False), StructField('processed_at', TimestampType(), False)])

In [7]:
# Get top 10 globally ranked technologies
df_top10_global = dfalltime.orderBy("mention_rank_alltime").limit(10)

# Show important fields
df_top10_global.select(
    "mention_rank_alltime",
    "technology",
    "technology_category",
    "total_mentions_alltime",
    "avg_daily_mentions_alltime",
    "peak_day_mentions_alltime",
    "days_with_activity_alltime"
).show(truncate=False)


                                                                                

+--------------------+----------+-------------------+----------------------+--------------------------+-------------------------+--------------------------+
|mention_rank_alltime|technology|technology_category|total_mentions_alltime|avg_daily_mentions_alltime|peak_day_mentions_alltime|days_with_activity_alltime|
+--------------------+----------+-------------------+----------------------+--------------------------+-------------------------+--------------------------+
|1                   |ethereum  |blockchain_platform|6719041               |22699.462837837837        |38638                    |296                       |
|2                   |near      |blockchain_platform|6088718               |20569.993243243243        |38521                    |296                       |
|3                   |docker    |devops             |5761786               |19465.493243243243        |36059                    |296                       |
|4                   |npm       |package_manager    |51884

25/07/02 02:18:22 WARN StandaloneSchedulerBackend$StandaloneDriverEndpoint: Skipping onDisconnected RemoveExecutor call because the scheduler is stopping


In [10]:
# Commands to check category rankings in your Spark/Jupyter environment

# 1. Load the alltime analytics table
dfalltime = spark.read.format("delta").load("s3a://delta-lake/analytics/analytics_github_technology_trends_alltime")

# 2. Check unique category_rank_alltime values
print("=== UNIQUE CATEGORY RANK VALUES ===")
dfalltime.select("category_rank_alltime").distinct().orderBy("category_rank_alltime").show()

# 3. Count of each category rank value
print("=== CATEGORY RANK VALUE COUNTS ===")
dfalltime.groupBy("category_rank_alltime").count().orderBy("category_rank_alltime").show()

# 4. Check if there are any values other than 1
print("=== NON-1 CATEGORY RANKS ===")
dfalltime.filter(dfalltime.category_rank_alltime != 1).select(
    "technology", 
    "technology_category", 
    "total_mentions_alltime", 
    "mention_rank_alltime", 
    "category_rank_alltime"
).orderBy("technology_category", "category_rank_alltime").show(20)

# 5. Show category distribution to understand the ranking
print("=== CATEGORY DISTRIBUTION ===")
dfalltime.groupBy("technology_category").count().orderBy("count", ascending=False).show()

# 6. Sample of technologies by category with their ranks
print("=== SAMPLE BY CATEGORY (TOP CATEGORIES) ===")
for category in ["ai_framework", "language", "framework", "database", "blockchain"]:
    print(f"\n--- {category.upper()} ---")
    dfalltime.filter(dfalltime.technology_category == category).select(
        "technology", 
        "total_mentions_alltime", 
        "mention_rank_alltime", 
        "category_rank_alltime"
    ).orderBy("category_rank_alltime").show(10)

# 7. Check for any category with only 1 technology (these should have category_rank = 1)
print("=== CATEGORIES WITH ONLY 1 TECHNOLOGY ===")
category_counts = dfalltime.groupBy("technology_category").count()
single_tech_categories = category_counts.filter(category_counts['count'] == 1)  # Use bracket notation
single_tech_categories.show()

# If there are single-tech categories, show them:
if single_tech_categories.count() > 0:
    print("Technologies in single-tech categories:")
    single_cats = [row.technology_category for row in single_tech_categories.collect()]
    dfalltime.filter(dfalltime.technology_category.isin(single_cats)).select(
        "technology", 
        "technology_category", 
        "category_rank_alltime"
    ).show()

=== UNIQUE CATEGORY RANK VALUES ===
+---------------------+
|category_rank_alltime|
+---------------------+
|                    1|
|                    2|
|                    3|
|                    4|
|                    5|
|                    6|
|                    7|
|                    8|
|                    9|
|                   10|
|                   11|
|                   12|
|                   13|
|                   14|
|                   15|
|                   16|
|                   17|
|                   18|
|                   19|
|                   20|
+---------------------+
only showing top 20 rows
=== CATEGORY RANK VALUE COUNTS ===
+---------------------+-----+
|category_rank_alltime|count|
+---------------------+-----+
|                    1|   16|
|                    2|   16|
|                    3|   16|
|                    4|   14|
|                    5|   13|
|                    6|   13|
|                    7|   13|
|                    8|   12

In [None]:
dfalltime.printSchema()

In [None]:
dfalltime.show(5, truncate=False)

In [None]:
df = spark.read.format("delta").load("s3a://delta-lake/analytics/analytics_github_technology_trends_7d")
df.printSchema()

In [None]:
df = spark.read.format("delta").load("s3a://delta-lake/analytics/analytics_github_technology_trends_30d")

In [None]:
df.printSchema()