# Visualisation

In [None]:
import pyspark
from pyspark.sql import SparkSession
import os
import time

# Create the notebook's Spark session, or reuse one that is already running.
# The application is registered as "MoviesRating" and asks for 10g of executor memory.
spark = (
    SparkSession.builder
    .appName("MoviesRating")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)

# Silence everything below ERROR in Spark's log output.
spark.sparkContext.setLogLevel("ERROR")

# Controls how many fields Spark SQL includes when rendering plans/schemas as strings.
# NOTE(review): "-1" is presumably meant as "no limit" — confirm it is interpreted
# that way by the Spark version in use.
spark.conf.set("spark.sql.debug.maxToStringFields", "-1")

In [None]:
# HDFS locations of the three CSV exports used by this notebook.
films_hdfs_path = "hdfs://localhost:9000/haythem/imdb_dataset/output_directory/films.csv"
crew_hdfs_path = "hdfs://localhost:9000/haythem/imdb_dataset/output_directory/crew.csv"
films_crew_hdfs_path = "hdfs://localhost:9000/haythem/imdb_dataset/output_directory/films_crew.csv"


def _load_csv(hdfs_path):
    """Read one comma-delimited CSV that has a header row, letting Spark infer column types."""
    csv_reader = spark.read.format("csv").options(delimiter=",", header=True, inferSchema=True)
    return csv_reader.load(hdfs_path)


df_films = _load_csv(films_hdfs_path)
df_crew = _load_csv(crew_hdfs_path)
df_films_crew = _load_csv(films_crew_hdfs_path)


# Spark Visualisation

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pyspark.sql.functions as F
from pyspark.sql.functions import col


# Count films per raw `genres` value and keep the ten largest groups, biggest first.
films_per_genre = df_films.groupBy('genres').count()
genre_counts = films_per_genre.orderBy(col('count').desc()).limit(10)

# Keep only crew rows where both the name and the profession are present.
has_name = col('primaryName').isNotNull()
has_profession = col('primaryProfession').isNotNull()
data_crew = df_crew.filter(has_name & has_profession)

## Bar_plot

In [None]:
# Bar chart of the ten most frequent `genres` values.
#
# Collect the rows ONCE and split them locally. Collecting `genres` and `count`
# through two separate Spark jobs runs the aggregation twice, and Spark does not
# guarantee that rows with tied counts come back in the same order both times,
# which could pair a genre with another genre's count.
genre_rows = genre_counts.collect()
genres = [row['genres'] for row in genre_rows]
# Index by name: Row is a tuple subclass, so `row.count` would be tuple.count().
counts = [row['count'] for row in genre_rows]

# Create a Plotly figure
fig = make_subplots(rows=1, cols=1, subplot_titles=['Top 10 Genres by Count of Films'])

# Add bar trace to the figure
fig.add_trace(go.Bar(x=genres, y=counts, name='Count of Films', marker_color='skyblue'), row=1, col=1)

# Update layout
fig.update_layout(title_text='Top 10 Genres by Count of Films', xaxis_title='Genres', yaxis_title='Count of Films')

# Show the plot
fig.show()


## Scatter_plot (line chart)

In [None]:
# Line chart of how many films exist per `startYear`.

# Drop films without a start year, then count films per year in ascending year order.
start_year_ = df_films.filter(col('startYear').isNotNull())
start_year_df = start_year_.groupBy('startYear').count().orderBy(F.asc('startYear'))

# Collect the aggregated rows once and split them locally, instead of running the
# same filter/groupBy/sort twice (one Spark job per column).
year_rows = start_year_df.collect()
start_years = [row['startYear'] for row in year_rows]
# Index by name: Row is a tuple subclass, so `row.count` would be tuple.count().
counts1 = [row['count'] for row in year_rows]

# Create a Plotly figure
fig = make_subplots(rows=1, cols=1, subplot_titles=['Number of Films Produced Over the Years'])

# Add line trace to the figure
fig.add_trace(go.Scatter(x=start_years, y=counts1, mode='lines', name='Number of Films Produced Over the Years', line=dict(color='skyblue')), row=1, col=1)

# Update layout
fig.update_layout(title_text='Number of Films Produced Over the Years', xaxis_title='Start Year', yaxis_title='Number of Films')

# Show the plot
fig.show()

## Pie_plot

In [None]:
# Donut chart of the ten most frequent crew professions.

# `primaryProfession` is split on commas into an array of professions.
filtered_crew_ = data_crew.withColumn('primaryProfession', F.split(data_crew['primaryProfession'], ','))
# Explode the list of professions into multiple rows (one row per person/profession pair)
filtered_crew_ = filtered_crew_.select(filtered_crew_['nconst'], F.explode(filtered_crew_['primaryProfession']).alias('profession'))

# Group by profession and count the occurrences
top_professions = filtered_crew_.groupBy('profession').count().orderBy(F.desc('count')).limit(10)

# Collect the rows ONCE and split them locally. Two separate collects would run the
# split/explode/groupBy pipeline twice, and Spark does not guarantee the same row
# order for tied counts across jobs, which could mismatch labels and values.
profession_rows = top_professions.collect()
professions = [row['profession'] for row in profession_rows]
# Index by name: Row is a tuple subclass, so `row.count` would be tuple.count().
counts_p = [row['count'] for row in profession_rows]

# Create a Plotly pie chart (hole=.3 renders it as a donut)
fig = go.Figure(data=[go.Pie(labels=professions, values=counts_p, hole=.3)])

# Update layout
fig.update_layout(
    title_text='Top 10 Professions in Film Crew',
)

# Show the plot
fig.show()

## Histogram_plot

In [None]:
# Histogram of film average ratings in unit-wide bins from 0 to 10.

# `average_ratings` is not defined by any earlier cell, so it is built here to keep
# the notebook runnable from a fresh kernel (Restart & Run All).
# NOTE(review): assumes the films export carries an `averageRating` column (the name
# IMDb uses) — confirm against films.csv and adjust RATING_COLUMN if it differs.
RATING_COLUMN = 'averageRating'
if RATING_COLUMN not in df_films.columns:
    raise KeyError(
        f"df_films has no '{RATING_COLUMN}' column; available columns: {df_films.columns}"
    )

# Keep only films that have a rating and bring that single column to the driver.
rating_rows = (
    df_films
    .filter(col(RATING_COLUMN).isNotNull())
    .select(RATING_COLUMN)
    .collect()
)
average_ratings = [row[0] for row in rating_rows]

# Create a Plotly histogram with custom partitions for average ratings
fig = go.Figure(data=[go.Histogram(x=average_ratings, xbins=dict(start=0, end=10, size=1), marker=dict(color='skyblue'))])

# Update layout
fig.update_layout(
    title_text='Distribution of Average Ratings',
    xaxis_title='Average Rating',
    yaxis_title='Frequency'
)

# Show the plot
fig.show()

# Pandas visualisation

In [None]:
import plotly.express as px
import pandas as pd

# Bring the (already top-10) Spark aggregate to the driver as a pandas DataFrame.
data_1_genres_pd = genre_counts.toPandas()

# Re-sort by count and keep ten rows so the frame matches its name.
# `genre_counts` is limited to 10 rows upstream, so this yields the same rows as
# the previous head(20) while no longer contradicting the "top 10" naming.
top_10_genres_pd = data_1_genres_pd.sort_values(by='count', ascending=False).head(10)

## Bar_plot_pd

In [None]:
# Bar chart of film counts per genre, drawn from the pandas top-genres frame.
axis_labels = {'genres': 'Genres', 'count': 'Count of Films'}
fig = px.bar(
    top_10_genres_pd,
    x='genres',
    y='count',
    title='Count of Films by Genre',
    labels=axis_labels,
    height=600,
)

# Centre the title and tilt the x tick labels so the genre names stay readable.
fig.update_layout(
    title_text='Count of Films by Genre',
    title_x=0.5,
    xaxis_title_text='Genres',
    yaxis_title_text='Count of Films',
    xaxis_tickangle=-45,
)

# Render the figure in the notebook output.
fig.show()