# Task 1: Big Data Analysis using PySpark
**Internship:** CODTECH

**Dataset Used:** US YouTube Trending Videos (`USvideos.csv`)

This notebook demonstrates big data analysis using PySpark, including dataset loading, exploration, and deriving insights.

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Create the Spark session (or reuse one that is already running); every
# later cell reads and queries data through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("YouTubeDataAnalysis")
    .getOrCreate()
)


In [None]:
# Load the dataset
# Parse the file as RFC 4180-style CSV: a quoted field may span several
# physical lines and may contain doubled quotes (""). Spark's defaults
# (one record per line, backslash as the escape character) would split or
# mis-read such records, which corrupts row counts and inferred column types.
# NOTE(review): assumes the free-text columns in USvideos.csv are quoted this
# way — confirm against the file. multiLine makes the file non-splittable,
# which is fine for a single modest CSV.
df = spark.read.csv(
    "USvideos.csv",
    header=True,        # first line holds the column names
    inferSchema=True,   # let Spark sample the data to pick column types
    multiLine=True,     # allow newlines inside quoted fields
    escape='"',         # treat "" inside a quoted field as a literal quote
)

# Preview the first five rows
df.show(5)


In [None]:
# Show the inferred column types, then report the DataFrame's dimensions
df.printSchema()

row_total = df.count()          # triggers a Spark job over the whole dataset
column_total = len(df.columns)  # metadata only, no job needed

print("Total rows:", row_total)
print("Total columns:", column_total)


In [None]:
# Keep only the rows in which every column holds a value: a null in any
# single column removes the whole row.
df_cleaned = df.na.drop(how="any")

cleaned_total = df_cleaned.count()
print("Cleaned row count:", cleaned_total)


In [None]:
# Top 5 most viewed videos
# A trending dataset can list the same video once per day it was trending
# (assumed for this file — TODO confirm), so ranking raw rows may show one
# video several times. Collapse to each title's peak view count first so the
# ranking contains five distinct titles.
(
    df_cleaned
    .groupBy("title")
    .agg({"views": "max"})                      # result column is named "max(views)"
    .withColumnRenamed("max(views)", "views")   # keep the original column name
    .orderBy(col("views").desc())
    .show(5)
)


In [None]:
# Mean number of likes within each category, highest average first
(
    df_cleaned
    .groupBy("category_id")
    .agg(avg("likes").alias("avg_likes"))
    .orderBy(col("avg_likes").desc())
    .show()
)


In [None]:
# Most liked videos
# A trending dataset can list the same video once per day it was trending
# (assumed for this file — TODO confirm), so ranking raw rows may show one
# video several times. Collapse to each title's peak like count first so the
# ranking contains five distinct titles.
(
    df_cleaned
    .groupBy("title")
    .agg({"likes": "max"})                      # result column is named "max(likes)"
    .withColumnRenamed("max(likes)", "likes")   # keep the original column name
    .orderBy(col("likes").desc())
    .show(5)
)


In [None]:
# Analysis finished: stop the Spark session that was started at the top of the notebook.
spark.stop()