In [1]:
# This is your first data analytics code
import pandas as pd

# Create a small dataset
data = {
    "Name": ["Amit", "Rahul", "Priya", "Neha"],
    "Score": [78, 85, 90, 88]
}

df = pd.DataFrame(data)

# Display the data
df


Unnamed: 0,Name,Score
0,Amit,78
1,Rahul,85
2,Priya,90
3,Neha,88


In [2]:
# Install PySpark
!pip install pyspark




In [3]:
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("BeginnerBigDataAnalysis").getOrCreate()

spark


In [4]:
# Download a large public dataset (movies dataset)
!wget https://raw.githubusercontent.com/datasets/movies/master/data/movies.csv


--2025-12-29 10:12:33--  https://raw.githubusercontent.com/datasets/movies/master/data/movies.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-12-29 10:12:33 ERROR 404: Not Found.



In [6]:
import pandas as pd

url = "https://raw.githubusercontent.com/vega/vega-datasets/master/data/movies.json"

# Load data from URL
pdf = pd.read_json(url)

# Save as CSV
pdf.to_csv("movies.csv", index=False)

print("movies.csv downloaded successfully")


movies.csv downloaded successfully


In [7]:
!ls


movies.csv  sample_data


In [8]:
df = spark.read.csv("movies.csv", header=True, inferSchema=True)
df.show(5)


+--------------------+---------+---------------+------------+-----------------+------------+-----------+----------------+-----------+-------------------+-----------+--------------------+--------+----------------------+-----------+----------+
|               Title| US Gross|Worldwide Gross|US DVD Sales|Production Budget|Release Date|MPAA Rating|Running Time min|Distributor|             Source|Major Genre|       Creative Type|Director|Rotten Tomatoes Rating|IMDB Rating|IMDB Votes|
+--------------------+---------+---------------+------------+-----------------+------------+-----------+----------------+-----------+-------------------+-----------+--------------------+--------+----------------------+-----------+----------+
|      The Land Girls| 146083.0|       146083.0|        NULL|        8000000.0| Jun 12 1998|          R|            NULL|   Gramercy|               NULL|       NULL|                NULL|    NULL|                  NULL|        6.1|    1071.0|
|First Love, Last ...|  10876.0|

In [10]:
df.printSchema()


root
 |-- Title: string (nullable = true)
 |-- US Gross: double (nullable = true)
 |-- Worldwide Gross: double (nullable = true)
 |-- US DVD Sales: double (nullable = true)
 |-- Production Budget: double (nullable = true)
 |-- Release Date: string (nullable = true)
 |-- MPAA Rating: string (nullable = true)
 |-- Running Time min: double (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Major Genre: string (nullable = true)
 |-- Creative Type: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Rotten Tomatoes Rating: double (nullable = true)
 |-- IMDB Rating: double (nullable = true)
 |-- IMDB Votes: double (nullable = true)



In [11]:
from pyspark.sql.functions import year, to_date

# Convert Release Date to date format and extract year
df_with_year = df.withColumn(
    "year",
    year(to_date("Release Date"))
)

df_with_year.select("Title", "Release Date", "year").show(5)


{"ts": "2025-12-29 10:21:50.503", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value 'Jun 12 1998' of the type \"STRING\" cannot be cast to \"DATE\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)", "line": "", "fragment": "to_date", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o70.showString.\n: org.apache.spark.SparkDateTimeException: [CAST_INVALID_INPUT] The value 'Jun 12 1998' of the type \"STRING\" cannot be cast to \"DATE\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"to_date\" was called from\n

DateTimeException: [CAST_INVALID_INPUT] The value 'Jun 12 1998' of the type "STRING" cannot be cast to "DATE" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"to_date" was called from
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)


In [14]:
from pyspark.sql.functions import year, to_date, col


In [15]:
df_with_year = df.withColumn(
    "year",
    year(to_date(col("Release Date"), "MMM dd yyyy"))
)

df_with_year.select("Title", "Release Date", "year").show(5)


+--------------------+------------+----+
|               Title|Release Date|year|
+--------------------+------------+----+
|      The Land Girls| Jun 12 1998|1998|
|First Love, Last ...| Aug 07 1998|1998|
|I Married a Stran...| Aug 28 1998|1998|
|Let's Talk About Sex| Sep 11 1998|1998|
|                Slam| Oct 09 1998|1998|
+--------------------+------------+----+
only showing top 5 rows


In [16]:
movies_per_year = df_with_year.groupBy("year").count().orderBy("year")
movies_per_year.show(10)


+----+-----+
|year|count|
+----+-----+
|1928|    1|
|1929|    1|
|1930|    1|
|1933|    1|
|1937|    2|
|1938|    1|
|1939|    2|
|1941|    1|
|1943|    1|
|1944|    3|
+----+-----+
only showing top 10 rows


### Big Data Analysis â€“ Insights

- A movie dataset with 3,201 records was analyzed using PySpark.
- The release year was extracted by parsing non-standard date formats.
- Group-by analysis revealed trends in movie releases across years.
- PySpark efficiently handled data transformation and aggregation, demonstrating scalability.

