# Process Oscars Pageviews

In [1]:
# Environment/warning setup interleaved with the imports. The order is
# deliberate: the environment variable and the warnings filter are both in
# place before `pyspark.pandas` is imported.
import os
# Presumably silences the timezone warning pandas-on-Spark emits at import
# time when PyArrow is installed — TODO confirm against the pyspark docs.
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'
import warnings
# Installs a global 'ignore' filter, so no Python warnings are shown.
warnings.filterwarnings('ignore')
# NOTE(review): `ps` is not referenced by any cell visible in this notebook.
import pyspark.pandas as ps
from pyspark.sql import SparkSession

In [2]:
# Configure a local Spark session that uses every available core and has
# Arrow-based pandas conversion switched on, then reuse or create it.
_session_builder = SparkSession.builder.master('local[*]')
_session_builder = _session_builder.config(
    'spark.sql.execution.arrow.pyspark.enabled', 'true'
)
spark = _session_builder.getOrCreate()

# Turn Spark's own logging off entirely for this session.
spark.sparkContext.setLogLevel('OFF')

In [3]:
import pyspark.sql.functions as F

In [4]:
# Load the two pageview parquet datasets and stack them into one DataFrame.
wiki_1 = spark.read.parquet(
    "pageviews_parquet/*"
)

wiki_2 = spark.read.parquet(
    'pageviews_parquet_repeat/*'
)

# union() pairs columns purely by position; unionByName() pairs them by
# column name, so a different column order in the second dataset cannot
# silently misalign the data (a name mismatch raises instead).
wiki = wiki_1.unionByName(wiki_2)

In [5]:
# Article titles to keep. The cell that builds `oscars_daily` matches these
# exactly against column `_c1` with `isin(items)`, so every string must be
# spelled the way it appears in the data (underscores instead of spaces).
# NOTE(review): looks like pages related to the 95th Academy Awards
# (ceremony, films, people) — confirm the intended selection.
items = [
    "95th_Academy_Awards",
    "Everything_Everywhere_All_at_Once",
    "Michelle_Yeoh",
    "Ke_Huy_Quan",
    "Jamie_Lee_Curtis",
    "A24",
    "All_Quiet_on_the_Western_Front_(2022_film)",
    "The_Whale_(2022_film)",
    "Avatar:_The_Way_of_Water",
    "Black_Panther:_Wakanda_Forever",
    "The_Boy,_the_Mole,_the_Fox_and_the_Horse_(film)",
    "The_Elephant_Whisperers",
    "An_Irish_Goodbye",
    "Navalny_(film)",
    "RRR",
    "Top_Gun:_Maverick",
    "Women_Talking_(film)",
    "Daniels_(directors)",
    "Brendan_Fraser",
    "Hauschka",
    "Charlie_Mackesy",
    "M._M._Keeravani",
    "Sarah_Polley",
    "Miriam_Toews",
    "Guillermo_del_Toro",
    "Mark_Gustafson",
    "Alex_Bulkley",
    "Edward_Berger",
    "Daniel_Roher",
    "Odessa_Rae",
    "Shane_Boris",
    "Kartiki_Gonsalves",
    "Guneet_Monga",
    "Chandrabose_(lyricist)",
    "James_Mather_(sound_editor)",
    "Al_Nelson_(sound_engineer)",
    "Chris_Burdon",
    "Christian_M._Goldbeck",
    "Ernestine_Hipper",
    "James_Friend",
    "Adrien_Morot",
    "Judy_Chin",
    "Annemarie_Bradley",
    "Ruth_E._Carter",
    "Paul_Rogers_(film_editor)",
    "Joe_Letteri",
    "Richard_Baneham",
    "Eric_Saindon",
    "Daniel_Barrett_(visual_effects_supervisor)",
    "Guillermo_del_Toro's_Pinocchio"
]

In [6]:
# Build the de-duplicated daily pageview table for the titles in `items`,
# restricted to rows whose `_c0` is 'en.wikipedia'.
oscars_daily = (wiki
 # Filter on the raw column names *before* the projection: the select below
 # drops `_c0` and renames `_c1`, so a predicate placed after it would
 # reference columns that are no longer part of the frame it is applied to.
 .filter((F.col('_c0')=='en.wikipedia') & (F.col('_c1').isin(items)))
 .select(
     F.col('_c1').alias('title'),
     F.col('_c2').alias('pageid'),
     F.col('_c3').alias('mode'),
     F.col('_c4').cast('int').alias('daily_count'),
     F.col('_c5').alias('hourly_counts'),
     # The date is taken from the first standalone 8-digit run in the
     # `filename` column and parsed as yyyyMMdd.
     F.to_date(F.regexp_substr(F.col('filename'),
                               F.lit(r"\b\d{8}\b")
                              ), 'yyyyMMdd').alias('date')
 )
 # Removes rows that are identical across all projected columns.
 .dropDuplicates()
).cache()

In [7]:
# Persist the filtered table as parquet under 'oscars', replacing whatever
# output is already at that path.
(oscars_daily
 .write
 .mode('overwrite')
 .parquet('oscars'))

In [8]:
# Shut down the Spark session once the parquet output has been written.
spark.stop()