#PROJECT 1

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import re
from pyspark.sql.window import Window
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("my project 1").getOrCreate()
sc = spark.sparkContext

# Read a CSV into a dataframe
# There is a smarter version, that will first check if there is a Parquet file and use it
def load_PD_file(filename_or_dir, schema) :
    dataPath = "/mnt/ddscoursedatastorage/fwm-stb-data/" + filename_or_dir
    df = spark.read.format("csv")\
      .option("header","false")\
      .option("delimiter", "|")\
      .schema(schema)\
      .load(dataPath)
    return df

df1 = spark.read.csv("/mnt/ddscoursedatastorage/dds-students/test.csv")


# Reading the Reference Parquet files

ref_data = spark.read.parquet('/ref_data_raw').withColumnRenamed("_device-id","device_id")\
                                                .withColumnRenamed("_dma","dma")\
                                                .withColumnRenamed("_dma-code","dma_code")\
                                                .withColumnRenamed("_household-id","household_id")\
                                                .withColumnRenamed("_household-type","household_type")\
                                                .withColumnRenamed("_system-type","system_type")\
                                                .withColumnRenamed("_zipcode","zipcode")
ref_data_count = ref_data.count()
print(ref_data_count)
#ref_data.limit(5).show()


# Reading the Daily Programs CSV file

daily_prog_schema =  StructType([StructField('prog_code',StringType()),
                     StructField('title',StringType()),
                     StructField('genre',StringType()),
                     StructField('air_date',StringType()),
                     StructField('air_time',StringType()),
                     StructField('Duration',FloatType())
                                       ])
daily_prog_data = load_PD_file("Daily program data/" , daily_prog_schema  )

#daily_prog_data.limit(3).show()

# Reading the 2.5% sample of the viewing data from a Parquet file

viewing_data = spark.read.parquet('/sample_viewing_2_5percent')

print(f'There are {viewing_data.count():,} entries in viewing_data dataframe!')
#viewing_data.show(5)

# Reading the Demographic CSV file

demographic_schema =  StructType([StructField('household_id',StringType()),
                      StructField('household_size',IntegerType()),
                      StructField('num_adults',IntegerType()),
                      StructField('num_generations',IntegerType()),
                      StructField('adult_range',StringType()),
                      StructField('marital_status',StringType()),
                      StructField('race_code',StringType()),
                      StructField('presence_children',StringType()),
                      StructField('num_children',IntegerType()),
                      StructField('age_children',StringType()), #format like range - 'bitwise'
                      StructField('age_range_children',StringType()),
                      StructField('dwelling_type',StringType()),
                      StructField('home_owner_status',StringType()),
                      StructField('length_residence',IntegerType()),
                      StructField('home_market_value',StringType()),
                      StructField('num_vehicles',IntegerType()),
                      StructField('vehicle_make',StringType()),
                      StructField('vehicle_model',StringType()),
                      StructField('vehicle_year',IntegerType()),
                      StructField('net_worth',IntegerType()),
                      StructField('income',StringType()),
                      StructField('gender_individual',StringType()),
                      StructField('age_individual',IntegerType()),
                      StructField('education_highest',StringType()),
                      StructField('occupation_highest',StringType()),
                      StructField('education_1',StringType()),
                      StructField('occupation_1',StringType()),
                      StructField('age_2',IntegerType()),
                      StructField('education_2',StringType()),
                      StructField('occupation_2',StringType()),
                      StructField('age_3',IntegerType()),
                      StructField('education_3',StringType()),
                      StructField('occupation_3',StringType()),
                      StructField('age_4',IntegerType()),
                      StructField('education_4',StringType()),
                      StructField('occupation_4',StringType()),
                      StructField('age_5',IntegerType()),
                      StructField('education_5',StringType()),
                      StructField('occupation_5',StringType()),
                      StructField('polit_party_regist',StringType()),
                      StructField('polit_party_input',StringType()),
                      StructField('household_clusters',StringType()),
                      StructField('insurance_groups',StringType()),
                      StructField('financial_groups',StringType()),
                      StructField('green_living',StringType())
                                       ])

demographic_data = load_PD_file("demographic/" , demographic_schema  )  

#display(demographic_data.limit(1))


203581233
There are 130,289,194 entries in viewing_data dataframe!


#PART 2

In [0]:
ref_data = ref_data.select(['device_id','dma','household_id']).na.drop().dropDuplicates()
ref_data.cache()
daily_prog_data.cache()

#QUESTION 2.1
prog_data_sep_genre = daily_prog_data.select("prog_code","genre")\
    .withColumn("genre",explode(split("genre",",")))\
        .na.drop().dropDuplicates()

ref_dma_data = ref_data.select("device_id","dma").na.drop().dropDuplicates()

#right join to not remove the dmas without viewings   
extended_view_data = viewing_data.join(broadcast(prog_data_sep_genre),on="prog_code").join(ref_dma_data,on="device_id",how="right")

total_views_dma_genre = extended_view_data.groupBy("dma","genre")\
    .agg(countDistinct("prog_code","event_date","event_time").alias("total_views"))\
        .orderBy("dma",desc("total_views")).repartition("dma") #order by total views desc

result_2_1 = total_views_dma_genre.groupBy("dma").agg(collect_list("genre").alias("genres")) 
display(result_2_1)
print("QUESTION 2.1")
top_5 = result_2_1.where((col("dma")=="Waco-Temple-Bryan") | (col("dma")=="New York")|(col("dma")=="Washington, DC (Hagrstwn)"))
rows = top_5.collect()
for row in rows:
    print(f"DMA: {row.dma} , TOP 5 GENRES: {row.genres[:5]}")
print("")

dma,genres
Sherman-Ada,"List(Reality, News, Sitcom, Children, Talk, House/garden, Comedy, Animated, Adventure, Consumer, Documentary, Crime, Drama, Action, Crime drama, Fantasy, Entertainment, Home improvement, Newsmagazine, Music, Mystery, Sports event, Educational, Football, Special, Game show, Shopping, Travel, Suspense, Soap, Animals, Sports non-event, Religious, History, Law, Science, Science fiction, Nature, Romance, Auto, Interview, Docudrama, How-to, Outdoors, Medical, Romance-comedy, Bus./financial, Politics, Horror, Public affairs, Sports talk, Cooking, Fashion, Western, Paranormal, Health, Variety, Comedy-drama, Auto racing, Playoff sports, Biography, Musical, Weather, Basketball, Golf, Technology, Dance, Anthology, Musical comedy, Fishing, War, Hunting, Historical drama, Baseball, Parenting, Soccer, Art, Community, Pro wrestling, Collectibles, Spanish, Gay/lesbian, Martial arts, Awards, Self improvement, Exercise, Anime, Auction, Action sports, Hockey, Debate, Military, Boat, Boxing, Environment, Aviation, Event, Volleyball, Horse, Figure skating, Standup, Rugby, Rodeo, Tennis, Arts/crafts, Gaming, Shooting, Softball, Mixed martial arts, Poker, Parade, Snowboarding, Equestrian, Dog show, Motorcycle racing, Gymnastics, Skiing, Card games, Motorsports, Miniseries, Adults only, Kayaking, Track/field, Sailing, Swimming, Fundraiser, Pool, Beach volleyball, Running, Beach soccer, Agriculture, Bicycle racing, Skateboarding)"
Wheeling-Steubenville,"List(Reality, News, Sports event, Documentary, Talk, Adventure, Drama, Sports non-event, Crime drama, Action, Sitcom, Comedy, Outdoors, Mystery, Children, Animated, Suspense, Football, Crime, Shopping, Baseball, Game show, Special, Fantasy, Consumer, Sports talk, Western, Music, Basketball, House/garden, Entertainment, Nature, Cooking, Newsmagazine, Law, Playoff sports, Animals, Weather, Science fiction, History, Educational, Romance, Medical, Politics, Travel, Hunting, Public affairs, Interview, Golf, Home improvement, Horror, Science, Variety, Religious, How-to, Romance-comedy, Bus./financial, Auto, Paranormal, Auto racing, Docudrama, War, Comedy-drama, Health, Fishing, Fashion, Soap, Biography, Historical drama, Collectibles, Community, Auction, Hockey, Anthology, Musical, Dance, Soccer, Mixed martial arts, Boxing, Parenting, Musical comedy, Aviation, Art, Martial arts, Pro wrestling, Shooting, Technology, Military, Adults only, Poker, Card games, Softball, Tennis, Archery, Action sports, Agriculture, Awards, Horse, Environment, Event, Self improvement, Track/field, Debate, Anime, Drag racing, Miniseries, Figure skating, Rodeo, Bicycle racing, Equestrian, Exercise, Motorsports, Volleyball, Parade, Motorcycle racing, Gaming, Standup, Computers, Gymnastics, Motorcycle, Dog show, Beach volleyball, Wrestling, Skiing, Gay/lesbian, Bowling, Rugby, Boat, Beach soccer, Diving, Arm wrestling, Swimming, Arts/crafts, Fundraiser, Spanish, Snowboarding, Billiards, Running, Lacrosse, Snowmobile, Cheerleading, Olympics, Pool, Kayaking, Theater, Triathlon, Skateboarding, Water polo, Watersports, Weightlifting, Music special, Luge, Cricket, Performing arts, Holiday, Yacht racing, Sailing, Bobsled, Intl soccer, Field hockey, Curling, Darts, Holiday-children special, Holiday special, Boat racing)"
Little Rock-Pine Bluff,"List(Reality, News, Talk, Drama, Documentary, Sitcom, Sports event, Comedy, Adventure, Children, Action, Animated, Sports non-event, Crime drama, Suspense, Football, Fantasy, Crime, Mystery, House/garden, Special, Music, Science fiction, Entertainment, Sports talk, Consumer, Western, Cooking, Game show, Horror, Basketball, Law, Politics, Interview, Newsmagazine, Educational, Outdoors, History, Playoff sports, Shopping, Weather, Home improvement, Public affairs, Religious, Travel, Baseball, Romance, Medical, Animals, Paranormal, Science, How-to, Golf, Bus./financial, Auto, Romance-comedy, Nature, War, Comedy-drama, Health, Auto racing, Hunting, Docudrama, Fashion, Biography, Historical drama, Soap, Variety, Anthology, Pro wrestling, Mixed martial arts, Musical, Soccer, Auction, Collectibles, Dance, Community, Fishing, Musical comedy, Spanish, Hockey, Adults only, Tennis, Parenting, Martial arts, Technology, Aviation, Boxing, Awards, Art, Military, Poker, Softball, Agriculture, Card games, Action sports, Shooting, Standup, Anime, Self improvement, Event, Debate, Rodeo, Exercise, Horse, Environment, Drag racing, Bicycle racing, Miniseries, Equestrian, Arts/crafts, Gaming, Archery, Motorcycle racing, Volleyball, Gymnastics, Track/field, Motorcycle, Parade, Motorsports, Skiing, Bowling, Figure skating, Wrestling, Fundraiser, Gay/lesbian, Diving, Snowboarding, Dog show, Lacrosse, Boat, Swimming, Rugby, Arm wrestling, Pool, Computers, Performing arts, Weightlifting, Beach volleyball, Billiards, Running, Triathlon, Cheerleading, Surfing, Olympics, Holiday, Watersports, Skateboarding, Music special, Curling, Snowmobile, Kayaking, Water polo, Intl soccer, Children-special, Yacht racing, Holiday special, Beach soccer, Sailing, Boat racing, Luge, Dog racing, Cricket, Field hockey, Bobsled, Theater, Speed skating, Polo, Table tennis, Mountain biking, Holiday-children special, Darts, Skating, Fencing, Badminton, Racquet, Intl hockey)"
Monroe-El Dorado,"List(Reality, News, Talk, Sitcom, Comedy, Drama, Documentary, Sports event, Adventure, Children, Animated, Action, Sports non-event, Crime drama, Football, Suspense, Fantasy, Crime, Consumer, Mystery, Game show, Entertainment, Special, House/garden, Sports talk, Outdoors, Science fiction, Cooking, Politics, Horror, Western, Interview, Shopping, Music, Basketball, Religious, Public affairs, Newsmagazine, Educational, Playoff sports, Weather, Travel, Animals, Home improvement, Law, Romance, Nature, History, Baseball, Medical, Paranormal, How-to, Romance-comedy, Science, Hunting, Bus./financial, Golf, Comedy-drama, Health, Auto, War, Docudrama, Fashion, Soap, Auto racing, Biography, Historical drama, Fishing, Anthology, Variety, Dance, Musical, Collectibles, Pro wrestling, Auction, Parenting, Soccer, Tennis, Mixed martial arts, Community, Martial arts, Musical comedy, Spanish, Art, Aviation, Awards, Self improvement, Technology, Hockey, Boxing, Softball, Poker, Adults only, Agriculture, Card games, Action sports, Military, Shooting, Anime, Standup, Drag racing, Event, Debate, Archery, Environment, Rodeo, Miniseries, Volleyball, Horse, Gaming, Exercise, Gymnastics, Motorcycle racing, Arts/crafts, Bicycle racing, Motorsports, Parade, Skiing, Wrestling, Track/field, Motorcycle, Figure skating, Equestrian, Gay/lesbian, Arm wrestling, Performing arts, Dog show, Diving, Snowboarding, Bowling, Triathlon, Beach volleyball, Lacrosse, Rugby, Swimming, Running, Boat, Weightlifting, Holiday, Computers, Pool, Olympics, Billiards, Fundraiser, Children-talk, Surfing, Cheerleading, Skateboarding, Watersports, Sailing, Music special, Snowmobile, Curling, Yacht racing, Water polo, Dog racing, Kayaking, Theater, Cricket, Intl soccer, Luge, Boat racing, Beach soccer, Holiday special, Bobsled, Field hockey, Music talk, Holiday music special, Darts, Bicycle, Polo)"
San Antonio,List()
Philadelphia,"List(News, Reality, Sitcom, Sports event, Comedy, Talk, Drama, Documentary, Sports non-event, Children, Adventure, Action, Animated, Crime drama, Football, Special, Fantasy, Baseball, Suspense, Sports talk, Mystery, Game show, House/garden, Crime, Entertainment, Shopping, Music, Consumer, Weather, Cooking, Newsmagazine, Law, Western, Science fiction, Outdoors, Public affairs, Educational, Playoff sports, History, Home improvement, Basketball, Romance, Animals, Horror, Interview, Travel, Medical, Politics, Nature, Romance-comedy, Science, Variety, Auto racing, How-to, War, Community, Religious, Golf, Bus./financial, Paranormal, Health, Comedy-drama, Auto, Fashion, Hockey, Tennis, Soap, Hunting, Docudrama, Biography, Historical drama, Dance, Musical, Adults only, Soccer, Anthology, Fishing, Collectibles, Auction, Horse, Softball, Musical comedy, Pro wrestling, Art, Poker, Awards, Card games, Mixed martial arts, Technology, Boxing, Parenting, Military, Martial arts, Standup, Aviation, Agriculture, Action sports, Parade, Self improvement, Debate, Music special, Spanish, Event, Fundraiser, Drag racing, Shooting, Rodeo, Volleyball, Miniseries, Environment, Wrestling, Equestrian, Motorcycle racing, Cheerleading, Anime, Computers, Archery, Bowling, Figure skating, Bicycle racing, Gay/lesbian, Gaming, Exercise, Motorsports, Skiing, Track/field, Arts/crafts, Gymnastics, Lacrosse, Running, Motorcycle, Dog show, Snowboarding, Rugby, Performing arts, Arm wrestling, Holiday, Snowmobile, Boat, Diving, Billiards, Swimming, Watersports, Intl soccer, Olympics, Triathlon, Field hockey, Skateboarding, Surfing, Pool, Weightlifting, Yacht racing, Mountain biking, Boat racing, Beach volleyball, Racquet, Squash, Kayaking, Holiday special, Dog racing, Rowing, Sailing, Cricket, Water polo, Curling, Beach soccer, Table tennis, Bicycle, Holiday-children special, Theater, Darts, Bobsled, Luge)"
Los Angeles,"List(Reality, Children, Animated, Cooking, Comedy, Adventure, Sitcom, Football, Consumer, Sports non-event, Documentary, Crime, Science, News, Educational, Entertainment, Fantasy, Sports event, Sports talk, Outdoors, Music, Shopping, Special, Travel, Action, Technology, History, Fashion, How-to, Religious, Talk, Suspense, Basketball, Crime drama, Drama, Hunting, Science fiction, Pro wrestling, Mystery, Romance, Newsmagazine, Law, Historical drama, Game show, Parenting, Animals, Nature, Soccer, Musical comedy, Musical, Biography, Romance-comedy, Paranormal, Action sports, Medical, Hockey, Awards, Playoff sports, Collectibles, War, Auction, House/garden, Docudrama, Home improvement, Horror, Mixed martial arts, Bus./financial)"
Fargo-Valley City,"List(Reality, Sitcom, News, Comedy, Documentary, Children, Adventure, Drama, Animated, Sports event, Talk, Action, Sports non-event, Crime drama, Fantasy, Consumer, Suspense, Cooking, House/garden, Football, Entertainment, Special, Music, Crime, Mystery, Science fiction, Outdoors, Western, Travel, Game show, Sports talk, Home improvement, Law, Auto, Hockey, History, Playoff sports, Educational, Shopping, Weather, Newsmagazine, How-to, Romance, Science, Basketball, Nature, Interview, Public affairs, Bus./financial, Politics, Horror, Baseball, Animals, Romance-comedy, Medical, Paranormal, Comedy-drama, Religious, Auto racing, Golf, Health, Hunting, War, Fashion, Docudrama, Biography, Adults only, Mixed martial arts, Soap, Historical drama, Auction, Fishing, Pro wrestling, Collectibles, Variety, Musical, Community, Dance, Soccer, Art, Anthology, Agriculture, Musical comedy, Boxing, Poker, Card games, Awards, Parenting, Aviation, Martial arts, Military, Standup, Action sports, Technology, Spanish, Anime, Shooting, Tennis, Exercise, Environment, Rodeo, Wrestling, Softball, Gaming, Motorcycle racing, Cheerleading, Debate, Event, Motorcycle, Horse, Archery, Bicycle racing, Equestrian, Self improvement, Miniseries, Skiing, Motorsports, Volleyball, Drag racing, Dog show, Figure skating, Parade, Track/field, Arts/crafts, Gay/lesbian, Swimming, Gymnastics, Bowling, Lacrosse, Snowmobile, Arm wrestling, Beach volleyball, Snowboarding, Diving, Fundraiser, Computers, Weightlifting, Boat, Billiards, Running, Music special, Performing arts, Rugby, Watersports, Pool, Kayaking, Beach soccer, Intl soccer, Triathlon, Curling, Olympics, Surfing, Water polo, Holiday, Skateboarding, Yacht racing, Cricket, Speed skating, Holiday special, Field hockey, Luge, Bobsled, Boat racing, Racquet, Dog racing, Sailing, Holiday-children special, Skating, Squash, Polo, Rowing)"
Victoria,"List(Children, Reality, Animated, Sitcom, Comedy, Adventure, Drama, News, Crime drama, Action, Fantasy, Talk, Sports event, Educational, Mystery, Documentary, Suspense, Sports non-event, Football, Music, Consumer, Cooking, House/garden, Horror, Paranormal, Entertainment, Western, Science fiction, Sports talk, Game show, Crime, Special, Home improvement, Travel, Shopping, Medical, Basketball, Baseball, Newsmagazine, Playoff sports, Law, History, Romance, Religious, Golf, Science, Romance-comedy, Public affairs, Comedy-drama, Politics, Interview, Weather, War, How-to, Outdoors, Animals, Bus./financial, Nature, Historical drama, Docudrama, Biography, Soap, Health, Collectibles, Anthology, Auto, Pro wrestling, Fashion, Musical, Auction, Auto racing, Musical comedy, Dance, Soccer, Spanish, Hunting, Variety, Parenting, Softball, Volleyball, Fishing, Community, Art, Technology, Mixed martial arts, Adults only, Martial arts, Tennis, Anime, Awards, Hockey, Poker, Card games, Boxing, Rodeo, Environment, Action sports, Standup, Agriculture, Aviation, Event, Exercise, Track/field, Gay/lesbian, Miniseries, Military, Shooting, Wrestling, Self improvement, Drag racing, Motorcycle racing, Arts/crafts, Debate, Gymnastics, Swimming, Gaming, Figure skating, Horse, Diving, Bowling, Computers, Arm wrestling, Bicycle racing, Performing arts, Motorsports, Archery, Sailing, Parade, Fundraiser, Music special, Skiing, Motorcycle, Cheerleading, Billiards, Olympics, Beach volleyball, Rugby, Lacrosse, Pool, Running, Cricket, Holiday, Children-special, Equestrian, Triathlon, Holiday special, Boat racing, Boat, Kayaking, Snowboarding, Water polo, Surfing, Weightlifting, Snowmobile)"
Oklahoma City,"List(Reality, News, Sitcom, Comedy, Documentary, Drama, Talk, Adventure, Children, Animated, Action, Sports event, Crime drama, Suspense, Sports non-event, Fantasy, Crime, Football, Mystery, Entertainment, Special, House/garden, Science fiction, Consumer, Game show, Horror, Music, Cooking, Western, Sports talk, History, Newsmagazine, Educational, Shopping, Religious, Basketball, Travel, Law, Politics, Romance, Home improvement, Paranormal, Weather, Interview, Playoff sports, Science, Public affairs, Auto, Comedy-drama, Romance-comedy, Outdoors, Animals, Medical, War, How-to, Nature, Golf, Bus./financial, Docudrama, Fashion, Health, Biography, Historical drama, Auto racing, Baseball, Soap, Pro wrestling, Musical, Variety, Collectibles, Auction, Anthology, Soccer, Mixed martial arts, Hunting, Spanish, Musical comedy, Dance, Poker, Adults only, Martial arts, Card games, Community, Art, Boxing, Parenting, Action sports, Hockey, Technology, Fishing, Aviation, Awards, Military, Softball, Tennis, Agriculture, Self improvement, Standup, Event, Environment, Anime, Rodeo, Wrestling, Miniseries, Horse, Motorcycle, Shooting, Debate, Bicycle racing, Figure skating, Gaming, Volleyball, Motorcycle racing, Exercise, Drag racing, Gymnastics, Skiing, Gay/lesbian, Parade, Motorsports, Equestrian, Snowboarding, Track/field, Arts/crafts, Diving, Dog show, Fundraiser, Running, Bowling, Computers, Boat, Swimming, Rugby, Lacrosse, Arm wrestling, Performing arts, Archery, Pool, Billiards, Music special, Intl soccer, Surfing, Olympics, Beach volleyball, Weightlifting, Skateboarding, Holiday, Sailing, Triathlon, Cheerleading, Watersports, Yacht racing, Snowmobile, Curling, Water polo, Kayaking, Field hockey, Theater, Beach soccer, Luge, Bobsled, Holiday special, Boat racing, Speed skating, Table tennis, Cricket, Mountain biking, Badminton, Holiday-children special, Bicycle, Polo, Holiday-children, Children-special, Dog racing, Intl hockey)"


QUESTION 2.1
DMA: Waco-Temple-Bryan , TOP 5 GENRES: ['Reality', 'News', 'Sitcom', 'Talk', 'Sports event']
DMA: Washington, DC (Hagrstwn) , TOP 5 GENRES: ['Reality', 'News', 'Sitcom', 'Comedy', 'Drama']
DMA: New York , TOP 5 GENRES: ['News', 'Reality', 'Talk', 'Sitcom', 'Documentary']



In [0]:
#Part 2.2
#NUMERICAL COLUMNS FIX
demographic_wealth_data = demographic_data.select("household_id", "net_worth", "income").distinct().\
    withColumn("income",when(col('income')=='A',11).otherwise(when(col('income')=='B',11)\
        .otherwise(when(col('income')=='C',12).otherwise(when(col("income")=='D',13)\
            .otherwise(when(col("income").isNull(),0).otherwise(col("income")))))))\
                .withColumn("income",col("income").cast('int'))\
                    .withColumn("net_worth",when(col('net_worth')=='A',11).otherwise(when(col('net_worth')=='B',11)\
                        .otherwise(when(col('net_worth')=='C',12).otherwise(when(col("net_worth")=='D',13)\
                            .otherwise(when(col("net_worth").isNull(),0).otherwise(col("net_worth")))))))\
                                .withColumn("net_worth",col("net_worth").cast('int'))

#GET MAX NETWORTH AND INCOME   
max_networth = demographic_wealth_data.agg(max("net_worth")).collect()[0][0]
max_income = demographic_wealth_data.agg(max("income")).collect()[0][0]

dma_wealth_data = ref_data.select("dma","household_id").distinct()\
    .join(broadcast(demographic_wealth_data),on="household_id").repartition("dma").groupBy("dma")\
        .agg(avg('net_worth').alias('avg_net_worth'), avg('income').alias('avg_income'))

dma_normalized_avg = dma_wealth_data.withColumn("WEALTH SCORE",( (col('avg_net_worth')/max_networth) + (col('avg_income')/max_income))).withColumnRenamed("dma","DMA").select("DMA","WEALTH SCORE").orderBy(desc("WEALTH SCORE"))


all_genres = set([row.genre for row in prog_data_sep_genre.select('genre').collect()])
counter = 1
schema = StructType([
    StructField('DMA', StringType(), nullable=False),
    StructField('WEALTH SCORE', DoubleType(), nullable=False),
    StructField('ORDERED LIST OF GENRES', ArrayType(StringType()), nullable=False)
])
result_2_2 = spark.createDataFrame([],schema)
dict_3={} #key:dma name value:unique top 8 list - this dictionary is used for question 3
print("QUESTION 2.2")
for row in dma_normalized_avg.collect():
    dma_name = row["DMA"]
    dma_wealth = row["WEALTH SCORE"]
    dma_top_gen = result_2_1.filter(col('dma')==dma_name).collect()
    if(len(dma_top_gen)!=0):
        current_list = dma_top_gen[0][1]
        top_8 = [genre for genre in current_list if genre in all_genres][:8]
        all_genres = all_genres-set(top_8)
        
    else:
        top_8 = []
    dict_3[dma_name]=top_8
    new_row = spark.createDataFrame([(dma_name,dma_wealth,top_8)],schema)
    result_2_2 = result_2_2.union(new_row)
result_2_2.show(25,truncate=False)
display(result_2_2)   

QUESTION 2.2
+--------------------------+------------------+-----------------------------------------------------------------------------------------------------------+
|DMA                       |WEALTH SCORE      |ORDERED LIST OF GENRES                                                                                     |
+--------------------------+------------------+-----------------------------------------------------------------------------------------------------------+
|San Antonio               |1.623931623931624 |[]                                                                                                         |
|Baltimore                 |1.3367165242165242|[News, Reality, Talk, Comedy, Sports event, Sitcom, Drama, Documentary]                                    |
|San Francisco-Oak-San Jose|1.330195301209794 |[Music, Children, Animated, Adventure, Educational, Fantasy, Sports non-event, Crime]                      |
|Detroit                   |1.2932170709948487|[Act

DMA,WEALTH SCORE,ORDERED LIST OF GENRES
San Antonio,1.623931623931624,List()
Baltimore,1.3367165242165242,"List(News, Reality, Talk, Comedy, Sports event, Sitcom, Drama, Documentary)"
San Francisco-Oak-San Jose,1.330195301209794,"List(Music, Children, Animated, Adventure, Educational, Fantasy, Sports non-event, Crime)"
Detroit,1.2932170709948487,"List(Action, Crime drama, Suspense, House/garden, Entertainment, Consumer, Sports talk, Football)"
Austin,1.254676045163184,"List(Special, Mystery, Cooking, Politics, Newsmagazine, Interview, Game show, Western)"
Sacramnto-Stkton-Modesto,1.2187876275636091,"List(Shopping, Law, Science fiction, Baseball, Travel, Public affairs, Horror, Basketball)"
Seattle-Tacoma,1.2176017291607932,"List(Home improvement, Outdoors, History, Religious, Medical, Romance, How-to, Playoff sports)"
Philadelphia,1.199571059220182,"List(Weather, Animals, Nature, Romance-comedy, Science, Variety, Auto racing, War)"
Harrisburg-Lncstr-Leb-York,1.1981715763228369,"List(Bus./financial, Golf, Hockey, Auto, Paranormal, Comedy-drama, Health, Tennis)"
Cleveland-Akron (Canton),1.1952732452732453,"List(Soap, Hunting, Fashion, Collectibles, Anthology, Docudrama, Mixed martial arts, Fishing)"


#PART 3

In [0]:
#QUESTION 3
#create new directory with the dma name
def createNewDirectory(new_dir_name):
    result = 'Question3/' + new_dir_name
    return result
    
#the schema for the new df
daily_prog_schema =  StructType([StructField('prog_code',StringType()),
                     StructField('title',StringType()),
                     StructField('genre',StringType()),
                     StructField('air_date',StringType()),
                     StructField('air_time',StringType()),
                     StructField('Duration',FloatType()),
                     StructField('genre_asso',StringType()) #genre_asso is the genre header of the file(each genre get a file, so that specify which file genre is)
                                       ])
#iterate the dma's and their top 8
for dma,genres in dict_3.items():
    output_dir = createNewDirectory(dma)
    new_df = spark.createDataFrame([],daily_prog_schema)
    if len(genres)!=0:
        for c_genre in genres:
            from_daily = daily_prog_data.filter(array_contains(split(col('genre'), ","), c_genre))
            from_daily = from_daily.withColumn('genre_asso', lit(c_genre))
            new_df = new_df.union(from_daily).na.drop().dropDuplicates()
            new_df.write.csv(output_dir + "/" + c_genre, header=True, mode="overwrite") #create a csv file with the genre df

In [0]:
#END OF PROJECT 1