##### Use case: Get channel data by viewers
###### The use case works with TV show data in which the show-to-channel relationship is many-to-many.
###### In other words, each show might appear on many channels, and each channel might broadcast many shows.
###### The description of the data is as below
###### TV show titles do not have spaces
###### Channels have 3 letters
###### TV show titles can appear multiple times, with different counts
###### A TV show and channel combination might appear multiple times
###### TV shows could appear on multiple channels
###### The output should have no commas or punctuation, only 1 space between the TV show title and number.


###### Solutions are given for following problem statements
###### Problem statement-1:Get the total number of viewers for shows on ABC
###### Problem statement-2:Get the total number of viewers for the BAT channel
###### Problem statement-3:Get the most viewed show on ABC channel
###### Problem statement-4:Get the aired shows on  NOX, ABC channels
###### Problem statement-5:Get list of TOP10  channels by number of shows
###### Problem statement-6:Get all channels and shows with number of viewers
###### Problem statement-7:Get number of viewers for each channel
###### Problem statement-8:Get number of viewers for each show
###### Problem statement-9:Get TOP5 shows for each channel
###### Problem statement-10:Get sports programs for each channel
###### Problem statement-11:Get data of Talk shows for each channel
###### Problem statement-12:Get data of News programs for each channel

In [2]:
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
from pyspark.sql.functions import round,sum,avg,count,substring
# Create the Hive-enabled context that every query in this notebook runs on.
# `sc` is the SparkContext supplied by the notebook environment.
hiveContext = HiveContext(sc)
# Apply the shuffle-partition setting through the same context that executes
# the queries below, so it cannot end up on a different context's config.
hiveContext.sql("set spark.sql.shuffle.partitions=10")

In [3]:
hiveContext.sql("create database channels_db")

In [4]:
hiveContext.sql("use channels_db")

In [5]:
# Create one viewer-count table per input file (gennuma, gennumb, gennumc).
# Both columns are declared as strings; rows are comma-delimited text.
for table_suffix in ("a", "b", "c"):
    hiveContext.sql(
        "create table gennum" + table_suffix
        + "(showtitle string, showviewer string)"
        + " row format delimited fields terminated by ','"
    )

In [6]:
# Load each viewer-count file (gennumA/B/C.txt) into its matching table.
for file_letter in ("A", "B", "C"):
    hiveContext.sql(
        "load data local inpath 'dbfs:/FileStore/tables/channels/gennum"
        + file_letter + ".txt' into table gennum" + file_letter.lower()
    )

In [7]:
# Create one show-to-channel table per input file (genchana, genchanb,
# genchanc). Rows are comma-delimited text with two string columns.
for table_suffix in ("a", "b", "c"):
    hiveContext.sql(
        "create table genchan" + table_suffix
        + "(showtitle string, showchannel string)"
        + " row format delimited fields terminated by ','"
    )

In [8]:
# Load each show-to-channel file (genchanA/B/C.txt) into its matching table.
for file_letter in ("A", "B", "C"):
    hiveContext.sql(
        "load data local inpath 'dbfs:/FileStore/tables/channels/genchan"
        + file_letter + ".txt' into table genchan" + file_letter.lower()
    )

In [9]:

hiveContext.sql("show tables").show(10,False)

In [10]:
hiveContext.sql("select * from genchana").show(50)

In [11]:
hiveContext.sql("select * from genchanb").show(50)

In [12]:
hiveContext.sql("select * from genchanc").show(50)

In [13]:
hiveContext.sql("select * from gennuma").show(50)

In [14]:
hiveContext.sql("select * from gennumb").show(5)

In [15]:
hiveContext.sql("select * from gennumc").show(5)

###### Problem statement-1:Get the total number of viewers for shows on ABC

In [17]:
# Collect the shows broadcast on ABC from all three show-to-channel tables.
# The same show/channel pair may occur several times in the input, so the
# pairs are de-duplicated here: every later join against this table would
# otherwise count a show's viewers once per duplicate row.
hiveContext.sql('''create table channelname as
select distinct showtitle, showchannel from (
    select showtitle, showchannel from genchana
    union all
    select showtitle, showchannel from genchanb
    union all
    select showtitle, showchannel from genchanc
) allfiles
where showchannel = 'ABC'
''')

In [18]:
hiveContext.sql("select * from channelname").show(10)

In [19]:
hiveContext.sql("create table showchannelviewertb(showtitle string, showviewer bigint)")

In [20]:
hiveContext.sql('''INSERT into table showchannelviewertb select
gennuma.showtitle,sum(CAST(gennuma.showviewer AS BIGINT)) from channelname JOIN gennuma on
(channelname.showtitle = gennuma.showtitle) GROUP BY  gennuma.showtitle''')

In [21]:
hiveContext.sql('''INSERT into table showchannelviewertb select
gennumb.showtitle,sum(CAST(gennumb.showviewer AS BIGINT)) from channelname JOIN gennumb on
(channelname.showtitle = gennumb.showtitle) GROUP BY  gennumb.showtitle''')

In [22]:
hiveContext.sql('''INSERT into table showchannelviewertb select
gennumc.showtitle,sum(CAST(gennumc.showviewer AS BIGINT)) from channelname JOIN gennumc on
(channelname.showtitle = gennumc.showtitle) GROUP BY  gennumc.showtitle''')

In [23]:
# Total viewers per ABC show. showchannelviewertb only contains shows that
# matched the ABC list, with one partial total per viewer-count table, so the
# partial totals are summed directly. Joining back to channelname would filter
# nothing and would multiply each total by the number of matching rows there.
# The column alias is kept unchanged because it becomes the CSV header.
totalviewersforABC=hiveContext.sql('''select showtitle,sum(showviewer) as totoalnumber_of_viewers
                                      from showchannelviewertb
                                      GROUP BY showtitle
                                      ORDER BY showtitle ''')
# Export from a single partition as uncompressed CSV with a header row.
totalviewersforABC.coalesce(1).write.csv("/FileStore/tables/totalviewersforABC", compression="none", header ='true')
# Display up to 50 rows without truncating the values.
totalviewersforABC.show(50,False)


###### Problem statement-2:Get the total number of viewers for the BAT channel

In [25]:
# Collect the shows broadcast on BAT from all three show-to-channel tables.
# The same show/channel pair may occur several times in the input, so the
# pairs are de-duplicated here: every later join against this table would
# otherwise count a show's viewers once per duplicate row.
hiveContext.sql('''create table channelnameBAT as
select distinct showtitle, showchannel from (
    select showtitle, showchannel from genchana
    union all
    select showtitle, showchannel from genchanb
    union all
    select showtitle, showchannel from genchanc
) allfiles
where showchannel = 'BAT'
''')

In [26]:
hiveContext.sql("select * from channelnameBAT").show(10)

In [27]:
hiveContext.sql("create table showchannelviewertblBAT(showtitle string, showviewer bigint)")

In [28]:
hiveContext.sql('''INSERT into table showchannelviewertblBAT select
gennuma.showtitle,sum(CAST(gennuma.showviewer AS BIGINT)) from channelnameBAT JOIN gennuma on
(channelnameBAT.showtitle = gennuma.showtitle) GROUP BY  gennuma.showtitle''')

In [29]:
hiveContext.sql('''INSERT into table showchannelviewertblBAT select
gennumb.showtitle,sum(CAST(gennumb.showviewer AS BIGINT)) from channelnameBAT JOIN gennumb on
(channelnameBAT.showtitle = gennumb.showtitle) GROUP BY  gennumb.showtitle''')

In [30]:
hiveContext.sql('''INSERT into table showchannelviewertblBAT select
gennumc.showtitle,sum(CAST(gennumc.showviewer AS BIGINT)) from channelnameBAT JOIN gennumc on
(channelnameBAT.showtitle = gennumc.showtitle) GROUP BY  gennumc.showtitle''')

In [31]:
# Add up every per-show total in showchannelviewertblBAT into one figure.
bat_total_sql = "select sum(showviewer) as totanumberofviewersBAT from showchannelviewertblBAT"
totalviewersforBAT = hiveContext.sql(bat_total_sql)
# Export from a single partition as uncompressed CSV with a header row.
bat_total_writer = totalviewersforBAT.coalesce(1).write
bat_total_writer.csv(
    "/FileStore/tables/totalviewersforBAT",
    compression="none",
    header='true',
)
# Display the result without truncating the values.
totalviewersforBAT.show(50, False)

###### Problem statement-3:Get the most viewed show on ABC channel

In [33]:
hiveContext.sql("create table showchannelviewertblMAX(showtitle string, showviewer bigint)")

In [34]:
hiveContext.sql('''INSERT into table showchannelviewertblMAX select B.showtitle,sum(B.showviewer) from
channelname A JOIN showchannelviewertb B on (A.showtitle = B.showtitle) GROUP BY
B.showtitle''')

In [35]:
# Keep only the row with the highest viewer total.
most_viewed_sql = "select showtitle, showviewer from showchannelviewertblMAX ORDER BY showviewer DESC LIMIT 1"
mostviewedonABC = hiveContext.sql(most_viewed_sql)
# Export from a single partition as uncompressed CSV with a header row.
most_viewed_writer = mostviewedonABC.coalesce(1).write
most_viewed_writer.csv(
    "/FileStore/tables/mostviewedonABC",
    compression="none",
    header='true',
)
# Display the result without truncating the values.
mostviewedonABC.show(20, False)

###### Problem statement-4:Get the aired shows on  NOX, ABC channels

In [37]:
hiveContext.sql("create table channelname1 as select * from genchana where showchannel ='ABC'")
hiveContext.sql("insert into table channelname1 select * from genchanb where showchannel = 'ABC'")
hiveContext.sql("insert into table channelname1 select * from genchanc where showchannel = 'ABC'")

In [38]:
hiveContext.sql("insert into table channelname1 select * from genchana where showchannel ='NOX'")
hiveContext.sql("insert into table channelname1 select * from genchanb where showchannel = 'NOX'")
hiveContext.sql("insert into table channelname1 select * from genchanc where showchannel = 'NOX'")

In [39]:
# One row per distinct channel/show pair (the GROUP BY collapses repeated
# pairs), ordered by channel.
abc_nox_shows_sql = "select showchannel,showtitle from channelname1 group by showchannel,showtitle order by showchannel"
showsonABCNOX = hiveContext.sql(abc_nox_shows_sql)
# Export from a single partition as uncompressed CSV with a header row.
abc_nox_writer = showsonABCNOX.coalesce(1).write
abc_nox_writer.csv(
    "/FileStore/tables/showsonABCNOX",
    compression="none",
    header='true',
)
# Display up to 20 rows without truncating the values.
showsonABCNOX.show(20, False)

###### Problem statement-5:Get list of TOP10  channels by number of shows

In [41]:
# Combine the three show-to-channel tables into one table with a single row
# per show/channel pair. The input may repeat a pair; without de-duplication
# the repeats would inflate the per-channel show counts and multiply the
# viewer sums in every join against this table.
hiveContext.sql('''create table allchannels as
select distinct showtitle, showchannel from (
    select showtitle, showchannel from genchana
    union all
    select showtitle, showchannel from genchanb
    union all
    select showtitle, showchannel from genchanc
) allfiles
''')

In [42]:
# Rank channels by how many different shows they broadcast. DISTINCT is
# needed because a show/channel pair can appear more than once, and counting
# rows would then overstate the number of shows. The channel name is a
# secondary sort key so that ties at the cut-off give a stable top 10.
TOP10channels = hiveContext.sql('''select showchannel,count(distinct showtitle) as numberofshows
                        from allchannels
                        group by showchannel
                        order by numberofshows desc, showchannel limit 10''')
# Export from a single partition as uncompressed CSV with a header row.
TOP10channels.coalesce(1).write.csv("/FileStore/tables/TOP10channels", compression="none", header ='true')
# Display the result without truncating the values.
TOP10channels.show(20,False)

###### Problem statement-6:Get all channels and shows with number of viewers

In [44]:
hiveContext.sql("drop table showchannelviewerallchan ")

In [45]:
hiveContext.sql("create table showchannelviewerallchan(showchannel string,showtitle string, showviewer bigint)");

In [46]:
hiveContext.sql('''INSERT into table showchannelviewerallchan select
allchannels.showchannel,gennuma.showtitle,sum(CAST(gennuma.showviewer AS BIGINT)) from allchannels JOIN gennuma on
(allchannels.showtitle = gennuma.showtitle) GROUP BY allchannels.showchannel,gennuma.showtitle''')

In [47]:
hiveContext.sql('''INSERT into table showchannelviewerallchan select
allchannels.showchannel,gennumb.showtitle,sum(CAST(gennumb.showviewer AS BIGINT)) from allchannels JOIN gennumb on
(allchannels.showtitle = gennumb.showtitle) GROUP BY allchannels.showchannel,gennumb.showtitle''')

In [48]:
hiveContext.sql('''INSERT into table showchannelviewerallchan select
allchannels.showchannel,gennumc.showtitle,sum(CAST(gennumc.showviewer AS BIGINT)) from allchannels JOIN gennumc on
(allchannels.showtitle = gennumc.showtitle) GROUP BY  allchannels.showchannel,gennumc.showtitle''')

In [49]:
# Every row of showchannelviewerallchan, ordered by channel and then show.
all_rows_sql = "select * from showchannelviewerallchan order by 1,2"
channelviewersdataforall = hiveContext.sql(all_rows_sql)
# Export from a single partition as uncompressed CSV with a header row.
all_rows_writer = channelviewersdataforall.coalesce(1).write
all_rows_writer.csv(
    "/FileStore/tables/channelviewersdataforall",
    compression="none",
    header='true',
)
# Display the first 20 rows without truncating the values.
channelviewersdataforall.show(20, False)

###### Problem statement-7:Get number of viewers for each channel

In [51]:
# Sum the rows of showchannelviewerallchan into one viewer figure per channel.
channel_totals_sql = "select showchannel, sum(showviewer) as noofviewers from showchannelviewerallchan group by showchannel order by 1,2"
numofviewersforeachchannel = hiveContext.sql(channel_totals_sql)
# Export from a single partition as uncompressed CSV with a header row.
channel_totals_writer = numofviewersforeachchannel.coalesce(1).write
channel_totals_writer.csv(
    "/FileStore/tables/numofviewersforeachchannel",
    compression="none",
    header='true',
)
# Display the first 20 rows without truncating the values.
numofviewersforeachchannel.show(20, False)

###### Problem statement-8:Get number of viewers for each show

In [53]:
# Sum the rows of showchannelviewerallchan into one viewer figure per show.
show_totals_sql = '''select showtitle, sum(showviewer) as numberofviewers 
                                         from showchannelviewerallchan group by showtitle order by 1,2'''
viewersdataforeachshow = hiveContext.sql(show_totals_sql)
# Export from a single partition as uncompressed CSV with a header row.
show_totals_writer = viewersdataforeachshow.coalesce(1).write
show_totals_writer.csv(
    "/FileStore/tables/viewersdataforeachshow",
    compression="none",
    header='true',
)
# Display the first 20 rows without truncating the values.
viewersdataforeachshow.show(20, False)

###### Problem statement-9:Get TOP5 shows based on number of viewers for each channel

In [55]:
# Rank the shows of each channel by total viewers (rank() within a
# per-channel window, highest total first) and keep ranks 1 to 5.
# Output is ordered by channel and then by rank.
top5_sql = '''select * from (select scvc.showchannel, scvc.showtitle, sum(scvc.showviewer) as number_of_viewers,
                                                  rank() over(partition by scvc.showchannel order by sum(scvc.showviewer) desc) ranking_order
                                                  from showchannelviewerallchan scvc
                                                  group by scvc.showchannel, scvc.showtitle
                                                  order by scvc.showchannel)t
                               WHERE ranking_order<=5 ORDER BY 1,4'''
top5showsdata = hiveContext.sql(top5_sql)
# Export from a single partition as uncompressed CSV with a header row.
top5_writer = top5showsdata.coalesce(1).write
top5_writer.csv(
    "/FileStore/tables/top5showsdata",
    compression="none",
    header='true',
)
# Display up to 50 rows without truncating the values.
top5showsdata.show(50, False)

###### Problem statement-10:Get data of sports programs for each channel

In [57]:
# Viewer totals per channel and show, restricted to titles that contain
# "Sports" or "Games".
sports_sql = '''select showchannel, showtitle, sum(showviewer) as numberofviewers
                                         from showchannelviewerallchan
                                         where showtitle LIKE '%Sports%' or showtitle LIKE '%Games%'
                                         group by showchannel,showtitle
                                         order by showchannel,showtitle'''
viewersdataforsportsshow = hiveContext.sql(sports_sql)
# Export from a single partition as uncompressed CSV with a header row.
sports_writer = viewersdataforsportsshow.coalesce(1).write
sports_writer.csv(
    "/FileStore/tables/viewersdataforsportsshow",
    compression="none",
    header='true',
)
# Display the first 20 rows without truncating the values.
viewersdataforsportsshow.show(20, False)

###### Problem statement-11:Get data of Talk shows for each channel

In [59]:
# Viewer totals per channel and show, restricted to titles that contain "Talk".
talk_sql = '''select showchannel, showtitle, sum(showviewer) as numberofviewers
                                         from showchannelviewerallchan
                                         where showtitle LIKE '%Talk%'
                                         group by showchannel,showtitle
                                         order by showchannel,showtitle'''
viewersdatafortalksshow = hiveContext.sql(talk_sql)
# Export from a single partition as uncompressed CSV with a header row.
talk_writer = viewersdatafortalksshow.coalesce(1).write
talk_writer.csv(
    "/FileStore/tables/viewersdatafortalksshow",
    compression="none",
    header='true',
)
# Display the first 20 rows without truncating the values.
viewersdatafortalksshow.show(20, False)

###### Problem statement-12:Get data of News programs for each channel

In [61]:
# Viewer totals per channel and show, restricted to titles that contain "News".
news_sql = '''select showchannel, showtitle, sum(showviewer) as numberofviewers
                                         from showchannelviewerallchan
                                         where showtitle LIKE '%News%'
                                         group by showchannel,showtitle
                                         order by showchannel,showtitle'''
viewersdatafornewsprograms = hiveContext.sql(news_sql)
# Export from a single partition as uncompressed CSV with a header row.
news_writer = viewersdatafornewsprograms.coalesce(1).write
news_writer.csv(
    "/FileStore/tables/viewersdatafornewsprograms",
    compression="none",
    header='true',
)
# Display up to 50 rows without truncating the values.
viewersdatafornewsprograms.show(50, False)