In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws

# Step 1: Build (or reuse) a Spark session that pulls in the Hadoop-AWS connector package
print("🚀 Initializing Spark session...")
spark = (
    SparkSession.builder
    .appName("ConnectToS3")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .getOrCreate()
)

# Step 2: Route s3a:// paths through S3AFileSystem, take credentials from the
# AWS profile provider, and talk to the us-west-2 S3 endpoint
print("🔐 Configuring Spark to use AWS credentials...")
hadoop_conf = spark._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider"),
    ("fs.s3a.endpoint", "s3.us-west-2.amazonaws.com"),
):
    hadoop_conf.set(conf_key, conf_value)


🚀 Initializing Spark session...


25/06/24 10:40:52 WARN Utils: Your hostname, Hariprasaths-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.8 instead (on interface en0)
25/06/24 10:40:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/hari14/.ivy2/cache
The jars for the packages stored in: /Users/hari14/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-58c10ce6-7f20-47a0-bbf9-03ff1d9eb0fa;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark-3.3.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 109ms :: artifacts dl 3ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-58c10ce6-7f20-47a0-bbf9-03ff1d9eb0fa
	confs: [default]


🔐 Configuring Spark to use AWS credentials...


In [2]:
# Step 3: Get list of years from user
def _parse_years(raw_years):
    """Turn a comma-separated string of years into a clean list of year strings.

    Surrounding whitespace is stripped from every entry, blank entries
    (e.g. from a trailing comma) are skipped, and repeated years are kept
    only once, in the order they were first given.

    Args:
        raw_years: Text such as "2021, 2022,2023".

    Returns:
        A list of unique year strings, e.g. ["2021", "2022", "2023"].

    Raises:
        ValueError: If an entry contains anything other than ASCII digits,
            or if no year remains after blanks are dropped.
    """
    parsed = []
    for token in raw_years.split(","):
        token = token.strip()
        if not token:
            # A stray comma would otherwise yield an empty path segment downstream.
            continue
        if not (token.isascii() and token.isdigit()):
            raise ValueError(f"Invalid year {token!r}: expected digits only, e.g. 2021")
        if token not in parsed:
            parsed.append(token)
    if not parsed:
        raise ValueError("No years were provided; enter at least one year, e.g. 2021")
    return parsed


years_input = input("📅 Please enter years to process (comma-separated, e.g. 2021,2022,2023): ")
years = _parse_years(years_input)
channel_name = "esl_dota2"


📅 Please enter years to process (comma-separated, e.g. 2021,2022,2023):  2021,2022,2023,2024,2025


In [5]:
# Step 4: Loop through years and check schema
# For each requested year, read that year's parquet data for the channel and
# print its record count followed by its schema.
for year in years:
    print(f"\n📁 Checking schema for year {year}...")
    path = f"s3a://twitch-emotes-analytics-project/data/gold/{channel_name}/{year}/all_data_parquet"
    try:
        df = spark.read.parquet(path)
        record_count = df.count()
        print(f"Total records - ({year}): {record_count}")
        df.printSchema()
    except Exception as err:
        # Best effort: report the failure for this year and move on to the next one.
        print(f"❌ Failed to read schema for {year}: {err}")


📁 Checking schema for year 2021...


                                                                                

Total records - (2021): 295479
root
 |-- i_user_id: long (nullable = true)
 |-- i_username: string (nullable = true)
 |-- i_display_color: string (nullable = true)
 |-- i_badge_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_titles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_versions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_user_status: string (nullable = true)
 |-- j_streamer: string (nullable = true)
 |-- k_emote_name: string (nullable = true)
 |-- t_timestamp: timestamp (nullable = true)
 |-- t_time_text: string (nullable = true)
 |-- t_seconds: long (nullable = true)


📁 Checking schema for year 2022...


                                                                                

Total records - (2022): 566898
root
 |-- i_user_id: long (nullable = true)
 |-- i_username: string (nullable = true)
 |-- i_display_color: string (nullable = true)
 |-- i_badge_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_titles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_versions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_user_status: string (nullable = true)
 |-- j_streamer: string (nullable = true)
 |-- k_emote_name: string (nullable = true)
 |-- t_timestamp: timestamp (nullable = true)
 |-- t_time_text: string (nullable = true)
 |-- t_seconds: long (nullable = true)


📁 Checking schema for year 2023...


                                                                                

Total records - (2023): 617113
root
 |-- i_user_id: long (nullable = true)
 |-- i_username: string (nullable = true)
 |-- i_display_color: string (nullable = true)
 |-- i_badge_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_titles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_versions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_user_status: string (nullable = true)
 |-- j_streamer: string (nullable = true)
 |-- k_emote_name: string (nullable = true)
 |-- t_timestamp: timestamp (nullable = true)
 |-- t_time_text: string (nullable = true)
 |-- t_seconds: long (nullable = true)


📁 Checking schema for year 2024...


                                                                                

Total records - (2024): 359847
root
 |-- i_user_id: long (nullable = true)
 |-- i_username: string (nullable = true)
 |-- i_display_color: string (nullable = true)
 |-- i_badge_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_titles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_versions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_user_status: string (nullable = true)
 |-- j_streamer: string (nullable = true)
 |-- k_emote_name: string (nullable = true)
 |-- t_timestamp: timestamp (nullable = true)
 |-- t_time_text: string (nullable = true)
 |-- t_seconds: long (nullable = true)


📁 Checking schema for year 2025...


[Stage 42:>                                                         (0 + 2) / 2]

Total records - (2025): 175534
root
 |-- i_user_id: long (nullable = true)
 |-- i_username: string (nullable = true)
 |-- i_display_color: string (nullable = true)
 |-- i_badge_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_titles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_badge_versions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- i_user_status: string (nullable = true)
 |-- j_streamer: string (nullable = true)
 |-- k_emote_name: string (nullable = true)
 |-- t_timestamp: timestamp (nullable = true)
 |-- t_time_text: string (nullable = true)
 |-- t_seconds: long (nullable = true)



                                                                                