##### Usecase: Get channel data by viewers
###### The use case analyzes TV viewer data in which the show-to-channel relationship is many-to-many.
###### In other words, each show might appear on many channels, and each channel might broadcast many shows.
###### The description of the data is as below
###### TV show titles do not have spaces
###### Channels have 3 letters
###### TV show titles can appear multiple times, with different counts
###### A TV show and channel combination might appear multiple times
###### TV shows could appear on multiple channels
###### The output should have no commas or punctuation, only 1 space between the TV show title and number.


###### Solutions are given for following problem statements
###### Problem statement-1:Get the total number of viewers for shows on ABC
###### Problem statement-2:Get the number of viewers for the BAT channel
###### Problem statement-3:Get the most viewed show on ABC channel
###### Problem statement-4:Get the aired shows on  NOX, ABC channels
###### Problem statement-5:Get list of TOP10  channels by number of shows
###### Problem statement-6:Get all channels and shows with number of viewers
###### Problem statement-7:Get number of viewers for each channel
###### Problem statement-8:Get number of viewers for each show
###### Problem statement-9:Get TOP5 shows for each channel
###### Problem statement-10:Get sports programs for each channel
###### Problem statement-11:Get data of Talkshows for each channel
###### Problem statement-12:Get data of News programs for each channel

In [2]:
from pyspark.sql import SQLContext, Row, SparkSession
from pyspark.sql.functions import round,sum,avg,count,substring
from pyspark.sql.types import *
# SQLContext handle used by every query cell in this notebook.
# NOTE(review): `sc` is not defined in this file - presumably supplied by the notebook runtime.
sqlContext = SQLContext(sc)
# Obtain (or create) the Hive-enabled session used to read the input files.
spark = (
    SparkSession.builder
    .master('local')
    .appName('channeldatabyviewer')
    .enableHiveSupport()
    .getOrCreate()
)
# Set the number of shuffle partitions used by joins and aggregations to 10.
sqlContext.sql("set spark.sql.shuffle.partitions=10")

In [3]:
# Viewer counts, file A: comma-separated rows read as (showtitle, showviewer).
gennumadf = (
    spark.read
    .schema('showtitle string, showviewer bigint')
    .option('sep', ',')
    .csv("/FileStore/tables/channels/gennumA.txt")
)
# Preview the first 5 rows without truncating cell values.
gennumadf.show(n=5, truncate=False)

In [4]:
# Viewer counts, file B: comma-separated rows read as (showtitle, showviewer).
gennumbdf = (
    spark.read
    .schema('showtitle string, showviewer bigint')
    .option('sep', ',')
    .csv("/FileStore/tables/channels/gennumB.txt")
)
# Preview the first 5 rows without truncating cell values.
gennumbdf.show(n=5, truncate=False)

In [5]:
# Viewer counts, file C: comma-separated rows read as (showtitle, showviewer).
gennumcdf = (
    spark.read
    .schema('showtitle string, showviewer bigint')
    .option('sep', ',')
    .csv("/FileStore/tables/channels/gennumC.txt")
)
# Preview the first 5 rows without truncating cell values.
gennumcdf.show(n=5, truncate=False)

In [6]:
# Show-to-channel mapping, file A: comma-separated rows read as (showtitle, showchannel).
genchanadf = (
    spark.read
    .schema('showtitle string, showchannel string')
    .option('sep', ',')
    .csv("/FileStore/tables/channels/genchanA.txt")
)
# Preview the first 5 rows without truncating cell values.
genchanadf.show(n=5, truncate=False)

In [7]:
# Show-to-channel mapping, file B: comma-separated rows read as (showtitle, showchannel).
genchanbdf = (
    spark.read
    .schema('showtitle string, showchannel string')
    .option('sep', ',')
    .csv("/FileStore/tables/channels/genchanB.txt")
)
# Preview the first 5 rows without truncating cell values.
genchanbdf.show(n=5, truncate=False)

In [8]:
# Show-to-channel mapping, file C: comma-separated rows read as (showtitle, showchannel).
genchancdf = (
    spark.read
    .schema('showtitle string, showchannel string')
    .option('sep', ',')
    .csv("/FileStore/tables/channels/genchanC.txt")
)
# Preview the first 5 rows without truncating cell values.
genchancdf.show(n=5, truncate=False)

In [9]:
# Stack the three show-to-channel DataFrames into one; union() keeps duplicate rows.
genchanall = genchanadf.union(genchanbdf).union(genchancdf)
genchanall.show(600,False)
# Expose the combined data to SQL as "genchanallTbl".
# createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
genchanall.createOrReplaceTempView("genchanallTbl")

In [10]:
# Stack the three viewer-count DataFrames into one; union() keeps duplicate rows.
gennumall = gennumadf.union(gennumbdf).union(gennumcdf)
gennumall.show(600,False)
# Expose the combined data to SQL as "gennumallTbl".
# createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
gennumall.createOrReplaceTempView("gennumallTbl")

###### Problem statement-1:Get the total number of viewers for shows on ABC

In [12]:
# Total viewers per show, restricted to shows that air on ABC.
# The channel data may repeat a show/channel pair (see the data description at the
# top of this file); an inner join would then duplicate every viewer row and inflate
# the sums. LEFT SEMI JOIN only tests for the existence of a matching ABC row, so each
# viewer row is counted exactly once.
# The column alias spelling is kept as-is because it becomes the CSV header.
totalviewersforABC = sqlContext.sql('''
    select B.showtitle, sum(B.showviewer) as totoalnumber_of_viewers
    from gennumallTbl B
    LEFT SEMI JOIN genchanallTbl A
      on A.showtitle = B.showtitle and A.showchannel = 'ABC'
    GROUP BY B.showtitle
    ORDER BY B.showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
totalviewersforABC.coalesce(1).write.csv("/FileStore/tables/totalviewersforABC", mode="overwrite", compression="none", header ='true')
totalviewersforABC.show(50,False)


###### Problem statement-2:Get the total number of viewers for the BAT channel

In [14]:
# Grand total of viewers over all shows that air on BAT (single-row result).
# LEFT SEMI JOIN counts each viewer row once even when the channel data repeats a
# show/BAT pair; an inner join would multiply the viewer rows by the number of repeats.
totalviewersforBAT = sqlContext.sql('''
    select sum(B.showviewer) as totalnumberofviewersforBAT
    from gennumallTbl B
    LEFT SEMI JOIN genchanallTbl A
      on A.showtitle = B.showtitle and A.showchannel = 'BAT' ''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
totalviewersforBAT.coalesce(1).write.csv("/FileStore/tables/totalviewersforBAT", mode="overwrite", compression="none", header ='true')
totalviewersforBAT.show(50,False)

###### Problem statement-3:Get the most viewed show on ABC channel

In [16]:
# The single show with the highest total viewers among shows that air on ABC.
# LEFT SEMI JOIN keeps repeated show/ABC rows in the channel data from inflating the
# totals. The extra sort key on showtitle makes the LIMIT 1 pick deterministic when
# two shows tie on viewers.
mostviewedonABC = sqlContext.sql('''
    select B.showtitle, SUM(B.showviewer)
    from gennumallTbl B
    LEFT SEMI JOIN genchanallTbl A
      on A.showtitle = B.showtitle and A.showchannel = 'ABC'
    GROUP BY B.showtitle
    ORDER BY SUM(B.showviewer) DESC, B.showtitle LIMIT 1''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
mostviewedonABC.coalesce(1).write.csv("/FileStore/tables/mostviewedonABC", mode="overwrite", compression="none", header ='true')
mostviewedonABC.show(20,False)

###### Problem statement-4:Get the aired shows on  NOX, ABC channels

In [18]:
# Distinct (channel, show) pairs aired on ABC or NOX, limited to shows that also
# have viewer data (inner join). GROUP BY collapses repeated pairs.
# Sorting by show title within each channel makes the row order deterministic,
# matching the other per-channel queries in this notebook.
showsonABCNOX = sqlContext.sql('''
    select A.showchannel, B.showtitle
    from genchanallTbl A join gennumallTbl B on A.showtitle = B.showtitle
    where A.showchannel in ('ABC', 'NOX')
    group by A.showchannel, B.showtitle
    order by A.showchannel, B.showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
showsonABCNOX.coalesce(1).write.csv("/FileStore/tables/showsonABCNOX", mode="overwrite", compression="none", header ='true')
showsonABCNOX.show(40,False)

###### Problem statement-5:Get list of TOP10  channels by number of shows

In [20]:
# The 10 channels carrying the most shows.
# count(distinct ...) counts each show once per channel; a plain count would also
# count repeated show/channel rows, which the data description says can occur.
# The secondary sort on channel name makes the LIMIT 10 cut deterministic on ties.
TOP10channels = sqlContext.sql('''
    select A.showchannel, count(distinct A.showtitle) as numberofshows
    from genchanallTbl A
    group by A.showchannel
    order by numberofshows desc, A.showchannel limit 10''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
TOP10channels.coalesce(1).write.csv("/FileStore/tables/TOP10channels", mode="overwrite", compression="none", header ='true')
TOP10channels.show(20,False)

###### Problem statement-6:Get all channels and shows with number of viewers

In [22]:
# Total viewers for every (channel, show) pair.
# The channel side is de-duplicated first: a repeated show/channel row would
# otherwise join to every viewer row again and inflate the sum.
channelviewersdataforall = sqlContext.sql('''
    select A.showchannel, A.showtitle, sum(B.showviewer) numberofviewers
    from (select distinct showchannel, showtitle from genchanallTbl) A
    join gennumallTbl B on A.showtitle = B.showtitle
    group by A.showchannel, A.showtitle
    order by A.showchannel, A.showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
channelviewersdataforall.coalesce(1).write.csv("/FileStore/tables/channelviewersdataforall", mode="overwrite", compression="none", header ='true')
channelviewersdataforall.show(20,False)

###### Problem statement-7:Get number of viewers for each channel

In [24]:
# Total viewers per channel, summed over all shows the channel carries.
# The channel side is de-duplicated first so that a repeated show/channel row
# does not add the same show's viewers to the channel more than once.
numofviewersforeachchannel = sqlContext.sql('''
    select A.showchannel, sum(B.showviewer) as noofviewers
    from (select distinct showchannel, showtitle from genchanallTbl) A
    join gennumallTbl B on A.showtitle = B.showtitle
    group by A.showchannel
    order by A.showchannel''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
numofviewersforeachchannel.coalesce(1).write.csv("/FileStore/tables/numofviewersforeachchannel", mode="overwrite", compression="none", header ='true')
numofviewersforeachchannel.show(20,False)

###### Problem statement-8:Get number of viewers for each show

In [26]:
# Total viewers per show, for shows that appear on at least one channel.
# An inner join here would multiply each show's viewers by the number of channel
# rows for that show (a show can be on many channels). LEFT SEMI JOIN only checks
# that a channel row exists, so every viewer row is summed exactly once.
viewersdataforeachshow = sqlContext.sql('''
    select B.showtitle, sum(B.showviewer) as noofviewers
    from gennumallTbl B
    LEFT SEMI JOIN genchanallTbl A on A.showtitle = B.showtitle
    group by B.showtitle
    order by B.showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
viewersdataforeachshow.coalesce(1).write.csv("/FileStore/tables/viewersdataforeachshow", mode="overwrite", compression="none", header ='true')
viewersdataforeachshow.show(20,False)

###### Problem statement-9:Get TOP5 shows based on number of viewers for each channel

In [28]:
# Top 5 shows per channel by total viewers.
# - The channel side is de-duplicated so repeated show/channel rows do not inflate
#   the per-show totals that drive the ranking.
# - rank() assigns tied shows the same position, so a channel can return more than
#   5 rows when totals tie at the cut-off.
# - The inner ORDER BY of the original subquery had no effect on the result and is
#   dropped; the outer sort uses column names instead of ordinal positions, with
#   showtitle as a final key so tied rows come out in a stable order.
top5showsdata = sqlContext.sql('''
    select * from (
        select A.showchannel, A.showtitle, sum(B.showviewer) as number_of_viewers,
               rank() over (partition by A.showchannel order by sum(B.showviewer) desc) ranking_order
        from (select distinct showchannel, showtitle from genchanallTbl) A
        join gennumallTbl B on A.showtitle = B.showtitle
        group by A.showchannel, A.showtitle) t
    WHERE ranking_order <= 5
    ORDER BY showchannel, ranking_order, showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
top5showsdata.coalesce(1).write.csv("/FileStore/tables/top5showsdata", mode="overwrite", compression="none", header ='true')
top5showsdata.show(50,False)

###### Problem statement-10:Get data of sports programs for each channel

In [30]:
# Total viewers per (channel, show) for shows whose title contains "Sports" or
# "Games" (LIKE match as written in the query).
# The channel side is de-duplicated first so repeated show/channel rows do not
# inflate the sums.
viewersdataforsportsshow = sqlContext.sql('''
    select A.showchannel, B.showtitle, sum(B.showviewer) as numberofviewers
    from (select distinct showchannel, showtitle from genchanallTbl) A
    join gennumallTbl B on A.showtitle = B.showtitle
    where B.showtitle LIKE '%Sports%' or B.showtitle LIKE '%Games%'
    group by A.showchannel, B.showtitle
    order by A.showchannel, B.showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
viewersdataforsportsshow.coalesce(1).write.csv("/FileStore/tables/viewersdataforsportsshow", mode="overwrite", compression="none", header ='true')
viewersdataforsportsshow.show(20,False)

###### Problem statement-11:Get data of Talk shows for each channel

In [32]:
# Total viewers per (channel, show) for shows whose title contains "Talk".
# The channel side is de-duplicated first so repeated show/channel rows do not
# inflate the sums.
viewersdatafortalksshow = sqlContext.sql('''
    select A.showchannel, B.showtitle, sum(B.showviewer) as numberofviewers
    from (select distinct showchannel, showtitle from genchanallTbl) A
    join gennumallTbl B on A.showtitle = B.showtitle
    where B.showtitle LIKE '%Talk%'
    group by A.showchannel, B.showtitle
    order by A.showchannel, B.showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
viewersdatafortalksshow.coalesce(1).write.csv("/FileStore/tables/viewersdatafortalksshow", mode="overwrite", compression="none", header ='true')
viewersdatafortalksshow.show(20,False)

###### Problem statement-12:Get data of News programs for each channel

In [34]:
# Total viewers per (channel, show) for shows whose title contains "News".
# The channel side is de-duplicated first so repeated show/channel rows do not
# inflate the sums.
viewersdatafornewsprograms = sqlContext.sql('''
    select A.showchannel, B.showtitle, sum(B.showviewer) as numberofviewers
    from (select distinct showchannel, showtitle from genchanallTbl) A
    join gennumallTbl B on A.showtitle = B.showtitle
    where B.showtitle LIKE '%News%'
    group by A.showchannel, B.showtitle
    order by A.showchannel, B.showtitle''')
# coalesce(1) yields a single part file; mode="overwrite" lets the cell be re-run
# without failing because the output path already exists.
viewersdatafornewsprograms.coalesce(1).write.csv("/FileStore/tables/viewersdatafornewsprograms", mode="overwrite", compression="none", header ='true')
viewersdatafornewsprograms.show(50,False)