In [1]:
#//***************************************
#//*** Apply Common Cleanup operations
#//***************************************
#//*** In anticpation that I'll be re-using text cleanup code. I'm adding some robustness to the function.
#//*** Adding kwargs to disable features that default to true.
#//*** Whether an action is skipped or executed is based on a boolean value stored in action_dict.
#//*** Key values will default to true. If code needs to be defaulted to False, a default_false list can be added later
#//*** All Boolean kwarg keya are stored in kwarg list. This speeds up the coding of the action_dict.
#//*** As Kwargs are added 
def mr_clean_text(input_series, input_options={}):
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Add some data validation. I'm preparing this function for additional use. I'm checking if future users (ie future me)
    #//*** may throw some garbage at this function. Experience has taught me to fail safely wherever possible.

    #//*** All kwargs are listed here. These initialize TRUE by default.
    key_list = [ "lower", "newline", "html", "remove_empty", "punctuation" ]
    
    #//*** Build Action Dictionary
    action_dict = { } 
    
    #//*** Build the keys from kwarg_list and default them to TRUE
    for key in key_list:
        action_dict[key] = True
        
    #//*** Loop through the input kwargs (if any). Assign the action_dict values based on the kwargs:
    for key,value in input_options.items():
        print(key,value)
        action_dict[key] = value
    
    
    #//*************************************************************************
    #//*** The Cleanup/Processing code is a straight lift from DSC550 - Week02
    #//*************************************************************************
    #//*** Convert to Lower Case, Default to True
    if action_dict["lower"]:
        input_series = input_series.str.lower()
    
   
    #//*** Remove New Lines
    if action_dict["newline"]:
        #//*** Rmove \r\n
        input_series = input_series.str.replace(r'\r?\n',"")

        #//*** Remove \n new lines
        input_series = input_series.str.replace(r'\n',"")

    #//*** Remove html entities, observed entities are &gt; and &lt;. All HTML entities begin with & and end with ;.
    #//*** Let's use regex to remove html entities
    if action_dict["html"]:
        input_series = input_series.str.replace(r'&.*;',"")

    #//*** Remove the empty lines
    if action_dict["remove_empty"]:
        input_series = input_series[ input_series.str.len() > 0]

    #//*** Remove punctuation
    if action_dict["punctuation"]:
        #//*** Load libraries for punctuation if not already loaded.
        #//*** Wrapping these in a try, no sense in importing libraries that already exist.
        #//*** Unsure of the cost of reimporting libraries (if any). But testing if library is already loaded feels
        #//*** like a good practice
        try:
            type(sys)
        except:
            import sys

        try:
            type(unicodedata)
        except:
            import unicodedata
        
        #//*** replace Comma and Period with a space.
        for punct in [",",".","$"]:
            input_series = input_series.str.replace(punct," ")

        #//*** Remove punctuation using the example from the book
        punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P') )
        input_series = input_series.str.translate(punctuation)

    print(f"Text Cleaning Time: {time.time() - start_time}")

    return input_series
#//*** Remove Stop words from the input list
def remove_stop_words(input_series):
    
    #//*** This function removes stop_words from a series.
    #//*** Works with series.apply()
    def apply_stop_words(input_list):

        #//*** Load Stopwords   
        for word in input_list:
            if word in stop_words:
                input_list.remove(word)
        return input_list

    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk
        
    stopwords = nltk.corpus.stopwords

    #//*** Stopwords requires an additional download
    try:
        type(stopwords)
    except:
        nltk.download('stopwords')


    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()


    #//*** The stop_words include punctuation. Stop Word Contractions will not be filtered out.
    stop_words = []

    #//*** Remove apostrophies from the stop_words
    for stop in stopwords.words('english'):
        stop_words.append(stop.replace("'",""))

    
    #//*** Remove Stop words from the tokenized strings in the 'process' column
    #input_series = input_series.apply(remove_stop_words,stop_words)
    
    input_series = input_series.apply(apply_stop_words)

    print(f"Stop Words Time: {time.time() - start_time}")
    
    return input_series
#//*** Tokenize a Series containing Strings.
#//*** Breaking this out into it's own function for later reuse.
#//*** Not a lot of code here, but it helps to keep the libraries localized. This creates standarization for future
#//*** Stoneburner projects. Also has the ability to add functionality as needed.

def tokenize_series(input_series):
    
    try:
        type(nltk)
    except:
        import nltk
    
    word_tokenize = nltk.tokenize.word_tokenize 
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Start Timing the process
    start_time = time.time()
    try:
        input_series = input_series.apply(word_tokenize)
    except:
        #//*** Try again is punkt not downloaded
        nltk.download('punkt')
        input_series = input_series.apply(word_tokenize)
        
    
    print(f"Tokenize Time: {time.time() - start_time}")
    
    return input_series




In [2]:
#//*** This thread helped with the connection
#//***https://stackoverflow.com/questions/37692780/error-28000-login-failed-for-user-domain-user-with-pyodbc

#//**** Update the ODBC Driver
#//**** https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver15
import pyodbc
import datetime
import json
from datetime import date, timedelta
import pandas as pd
print(pyodbc.drivers())

from sklearn.feature_extraction.text import TfidfVectorizer
import time

#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

# Some other example server values are
# server = 'localhost\sqlexpress' # for a named instance
# server = 'myserver,port' # to specify an alternate port
server = 'tcp:OM-CASF-DB01' 
server = 'OM-CASF-DLSQL' 
# server = '10.218.97.2'
database = 'DaletDB' 

with open('./ignore_folder/misc.json') as f:
    data = json.loads(f.read())

username = data["user"] 
password = data["password"]
del data
#cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
cnxn = pyodbc.connect('Trusted_Connection=yes;DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
cursor = cnxn.cursor()

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client 11.0']


In [3]:
#//************************************************************
#//************************************************************
#//*** get all Rundowns in given Quarter
#//************************************************************
#//************************************************************

#//*** Table: titles
#//*** Search Titles by Date to get Rundown IDs
#//*** Titles Contains All titled Objects including scripts rundowns and MOS objects
#//*** title_type_id = 25 <--- Rundown Objects
#//*** duration > 900000 <--- Rundowns Longer than 15 minutes (15 * 60 Seconds * 1000 ms)
#//***                        Automatically filters out short content like cutins
title_cols = ['title_id','title_type_id','title','duration','start_date']
import datetime


tgt_year = 2021
quarter = "Q4"

#//*** Build the BETWEEN portion of the query based on quarter and YEAR

quarter_query = ""
if quarter == "Q1":
    quarter_query = f"'{tgt_year}-01-01T00:00:00' AND '{tgt_year}-03-31T23:59:59'"

elif quarter == "Q2":
    quarter_query = f"'{tgt_year}-04-01T00:00:00' AND '{tgt_year}-06-30T23:59:59'"

elif quarter == "Q3":
    quarter_query = f"'{tgt_year}-07-01T00:00:00' AND '{tgt_year}-09-30T23:59:59'"

elif quarter == "Q4":
    quarter_query = f"'{tgt_year}-09-01T00:00:00' AND '{tgt_year}-12-31T23:59:59'"

query = f"""
SELECT *
FROM titles 
WHERE title_type_id = 25 
    AND duration > 900000
    AND start_date BETWEEN {quarter_query}
    AND title <> 'Prepak'
    AND title <> 'Tool Kit'
    AND title <> 'Tricaster'
    AND title <> 'PRODUCER HOLD'
    AND title <> 'Promo'
    AND title <> 'Breaking News'
    AND title <> 'Dalet OD XPression'
    AND title <> 'PRACTICE'
    AND title <> '7AM DIGITAL'
"""


cursor.execute(query)
results = cursor.fetchall()
#results = cursor.fetchmany(100)

all_rundowns_df = pd.read_sql(query,cnxn)

print(all_rundowns_df.columns)

all_rundowns_df = all_rundowns_df[title_cols]

#//*** Filter out shows that SAY DO NOT USE
all_rundowns_df = all_rundowns_df[all_rundowns_df['title'].str.contains('DO NOT USE')==False]

#//*** Convert Start_Date to Date only
all_rundowns_df['start_date'] = all_rundowns_df['start_date'].apply(lambda x: x.date())

all_dates = all_rundowns_df['start_date'].unique()

#//*** Pick one day to test with
one_day_df = all_rundowns_df[all_rundowns_df['start_date'] == all_dates[-3]]


print("Quarterly Shows : ", len(all_rundowns_df))
print("Quarterly Show Hours: ", all_rundowns_df['duration'].sum() / 60000 / 60)
all_rundowns_df.iloc[-40:]
all_rundowns_df

Index(['title_id', 'title_type_id', 'title_interface_id', 'user_id', 'title',
       'interpret', 'author', 'client', 'duration', 'is_online', 'is_recorded',
       'record_date', 'start_date', 'end_date', 'kill_date', 'audio_duration',
       'use_manual_duration', 'soundfile_id', 'year', 'keywords',
       'site_origin', 'no_overwrite', 'is_rotational_cart', 'replacement',
       'last_words', 'is_opener', 'weight', 'beats_pm', 'master_record',
       'compagny_id', 'compagny_disp_name', 'album_disp_name', 'package_id',
       'album_id', 'song_id', 'day_part_rest_id', 'keep_date'],
      dtype='object')
Quarterly Shows :  765
Quarterly Show Hours:  579.8536830555555


Unnamed: 0,title_id,title_type_id,title,duration,start_date
0,32556826,25,5PM Weekday,1680011,2021-09-01
1,32557644,25,6PM Weekday,3515979,2021-09-01
2,32560085,25,11PM WEEKDAY,2113044,2021-09-01
3,32564498,25,5AM Weekday,3435031,2021-09-01
4,32566026,25,6AM Weekday,3630093,2021-09-01
...,...,...,...,...,...
766,34016510,25,3PM Getting Answers,1799998,2021-12-01
767,34016644,25,4PM Weekday,3674003,2021-12-01
768,34017262,25,5PM Weekday,1680011,2021-12-01
769,34017676,25,6PM Weekday,3515979,2021-12-01


In [4]:
def convert_duration(x):
    mins = str(x // 60)
    secs = str(x % 60)
    
    if len(secs) == 1:
        secs = "0" + secs
    
    return mins + ":" + secs

In [21]:
#//***************************************************************************
#//***************************************************************************
#//*** Scrape All Stories For the Given Quarter and export to XLS
#//***************************************************************************
#//***************************************************************************
qtr_df = pd.DataFrame()
for date in all_rundowns_df['start_date'].unique():
    one_day_df = all_rundowns_df[all_rundowns_df['start_date'] == date]
    print(date, "/", all_rundowns_df['start_date'].unique()[-1])
    for row in one_day_df.iterrows():
        loop_rundown = row[1]
        loop_title_id = loop_rundown['title_id']
        #print("Rundown ID: ", loop_title_id, loop_rundown['title'])

        #print(loop_rundown)
        #//*** Get the Blocks associated with the Selected Rundown
        query = f"""
        SELECT block_id
        FROM items 

        WHERE clock_id = '{loop_title_id}'
        """
        loop_blocks = pd.read_sql(query,cnxn)['block_id'].unique()

        #//*** Find the A-Block of Selected Rundown
        #//**** Block Query gets just the A BLOCK from the Rundown Block
        #//*** Combines all Blocks into a single query and returns only the A-Block
        block_query = ""
        for block_id in loop_blocks:
            if block_id == loop_blocks[0]:
                block_query += f"(block_id='{block_id}' "
            else:
                block_query += f"OR block_id='{block_id}' "
        block_query += ") AND title='A BLOCK'"

        query = f"""
        SELECT *
        FROM blocks 
        WHERE {block_query}
        """

        tdf = pd.read_sql(query,cnxn)
        #//*** A Block of Selected Rundown
        if len(tdf) == 0:
            print("================")
            print("NO A-Block Found")
            print("================")
            print(one_day_df)
            continue

        a_block = tdf["block_id"].values[0]
        #print("A Block:",a_block)

        #//******************************
        #//*** Get A-Block Story Titles 
        #//******************************
        query = f"""
        SELECT title_id
        --SELECT block_id,item_id,title_id
        FROM spots 
        WHERE block_id = '{a_block}'
        """

        title_id_list = pd.read_sql(query,cnxn)['title_id'].values


        #//****************************************************
        #//*** Get title_id, Story Slug, and Duration
        #//*** Only get Stories with Greater than 0 Duration
        #//****************************************************

        #//*** Build Single Query to get all Titles from Selected Rundown

        title_id_query = ""
        for title_id in title_id_list:
            if title_id == title_id_list[0]:
                title_id_query += f"title_id='{title_id}' "
            else:
                title_id_query += f"OR title_id='{title_id}' "

        query = f"""
        SELECT title_id,title_type_id,title,duration
        FROM titles 
        WHERE {title_id_query}
        """

        titles_df = pd.read_sql(query,cnxn)

        #//*** Remove Stories with Zero Duration
        #//*** This Removes stories with no text
        titles_df = titles_df[titles_df['duration'] > 0]

        #//*** Remove stories with the word tease in slug
        titles_df = titles_df[titles_df['title'].str.lower().str.contains('tease') == False]
        #print(titles_df)
        #//*** Get StoryBody Text
        story_id_list = titles_df['title_id'].values
        story_id_query = ""
        for story_id in story_id_list:
            if story_id == story_id_list[0]:
                story_id_query += f"TitleId='{story_id}' "
            else:
                story_id_query += f"OR TitleId='{story_id}' "

        query = f"""
        SELECT TitleId,StoryText
        FROM StoryContent
        WHERE {story_id_query}
        """

        text_df = pd.read_sql(query,cnxn)
        #print(text_df)

        #//*** Build Stories_df List of all Stories in show with Text and Title
        #//*** Merge titles_df and text_df 
        stories_df = titles_df.merge(text_df,left_on='title_id',right_on='TitleId')

        #//*** Delete the duplicate column
        del stories_df['TitleId']

        #//********************
        #//*** Clean the Text
        #//********************

        #//*** Remove Brackets [[ ]]
        stories_df['StoryText'] = stories_df['StoryText'].str.replace('\[\[\*\*\*.*?\*\*\*\]\]','\\n',regex=True)

        #//*** Remove paranthesis (( ))
        stories_df['StoryText'] = stories_df['StoryText'].str.replace('\(\(.*?\)\)','\\n',regex=True)

        #//*** Covert \r\n to \n
        stories_df['StoryText'] = stories_df['StoryText'].str.replace('\\r\\n','\\n',regex=True)

        #//*** Convert Multiple \n to single \n
        stories_df['StoryText'] = stories_df['StoryText'].str.replace('\\n\\W*\\n','\\n',regex=True)

        #//*** Delete leading \n
        stories_df['StoryText'] = stories_df['StoryText'].str.replace('^\\W*\\n','',regex=True)
        #print(stories_df.iloc[7]['StoryText'])

        #//*** Rename title column to storyslug
        stories_df.columns = ['storyslug' if x=='title' else x for x in list(stories_df.columns)]


        #//*** Start Date, Title, from the current rundown. This Column will have the same value for all rows.
        static_cols = ['start_date','title']

        for col in static_cols:
            stories_df[col] = loop_rundown[col]

        #//*** Shift the static_cols to the beginning of the column list for readability
        cols = (static_cols + list(stories_df.columns))[:(len(static_cols)*-1)]

        stories_df = stories_df[cols]


        #//*** Rename title column to storyslug
        stories_df.columns = ['rundown' if x=='title' else x for x in list(stories_df.columns)]


        #print(cols)
        #print(stories_df)
        qtr_df = pd.concat([qtr_df,stories_df],ignore_index=True)
        #//*** End Get Single Rundown Stories
    
    #//*** End Each Day of Rundowns

#//*** Remove Stories with 0 length Text
qtr_df['StoryText'] = qtr_df['StoryText'].astype(str)
qtr_df['length'] = qtr_df['StoryText'].apply(lambda x : len(x))
qtr_df = qtr_df[qtr_df['length'] > 20]

if 'title_type_id' in qtr_df.columns:
    del qtr_df['title_type_id']
    
if 'length' in qtr_df.columns:
    del qtr_df['length']
#tdf = qtr_df
qtr_df['duration'] = (qtr_df['duration'] /1000).astype(int)    

qtr_df['time'] = qtr_df['duration'].apply(lambda x: convert_duration(x))

#//**** Move StoryText to the last Column
cols = list(qtr_df.columns)
cols.remove('StoryText')
cols.append('StoryText')
qtr_df = qtr_df[cols]

#//*** Drop Duplicate Scripts
qtr_df = qtr_df.drop_duplicates(subset=['StoryText'])

# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter(f'{tgt_year}_{quarter}_Stories.xlsx', engine='xlsxwriter')
qtr_df.to_excel(writer,sheet_name='sheet1')
writer.save()

qtr_df
#print("Done")

    
    
    
    

2021-09-01 / 2021-12-02
2021-09-02 / 2021-12-02
2021-09-03 / 2021-12-02
2021-09-04 / 2021-12-02
2021-09-05 / 2021-12-02
2021-09-06 / 2021-12-02
2021-09-07 / 2021-12-02
2021-09-08 / 2021-12-02
2021-09-09 / 2021-12-02
2021-09-10 / 2021-12-02
2021-09-11 / 2021-12-02
2021-09-12 / 2021-12-02
2021-09-13 / 2021-12-02
2021-09-14 / 2021-12-02
2021-09-15 / 2021-12-02
2021-09-16 / 2021-12-02
2021-09-17 / 2021-12-02
2021-09-18 / 2021-12-02
2021-09-19 / 2021-12-02
2021-09-20 / 2021-12-02
2021-09-21 / 2021-12-02
2021-09-22 / 2021-12-02
2021-09-23 / 2021-12-02
2021-09-24 / 2021-12-02
2021-09-25 / 2021-12-02
2021-09-26 / 2021-12-02
2021-09-27 / 2021-12-02
2021-09-28 / 2021-12-02
2021-09-29 / 2021-12-02
2021-09-30 / 2021-12-02
2021-10-01 / 2021-12-02
2021-10-02 / 2021-12-02
2021-10-03 / 2021-12-02
2021-10-04 / 2021-12-02
2021-10-05 / 2021-12-02
2021-10-06 / 2021-12-02
2021-10-07 / 2021-12-02
2021-10-08 / 2021-12-02
2021-10-09 / 2021-12-02
2021-10-10 / 2021-12-02
2021-10-11 / 2021-12-02
2021-10-12 / 202

Unnamed: 0,start_date,rundown,title_id,storyslug,duration,time,StoryText
7,2021-09-01,5PM Weekday,32556910,5PCOLDOPEN 1,9,0:09,[[QUOTE: we're going to start to see the conta...
8,2021-09-01,5PM Weekday,32556916,5P HELLO,7,0:07,"Good Evening, I'm Ama Daetz.\nand I'm Dan Ash..."
9,2021-09-01,5PM Weekday,32556923,5P HELLO FRAMES,2,0:02,"Good Evening, I'm Ama Daetz.\nand I'm Dan Ash..."
13,2021-09-01,5PM Weekday,32570152,5P DIXIE FIRE OC-VO,24,0:24,The massive Dixie Fire is showing no signs of ...
16,2021-09-01,5PM Weekday,32570960,5P TAHOE FIRE PKG,65,1:05,HERE ARE SOME OF THE WORDS WE HAVE HEAR FROM F...
...,...,...,...,...,...,...,...
23109,2021-12-01,8AM Saturday,34004914,STORY1,5,0:05,Good morning. I'm Liz Kreutz.\nABC7 News at 9 ...
23115,2021-12-01,5AM Sunday,34006163,5aTOSS oc,10,0:10,GOOD MORNING..\nIT'S [[ date]].\nYou're watchi...
23119,2021-12-01,6AM Weekday,34007603,6A HELLO,10,0:10,"\tGood Morning.\n\tIt's Friday, December 3rd.\..."
23124,2021-12-01,Program,34009179,6aTOSS oc,10,0:10,GOOD MORNING..\nIT'S [[ date]].\nThis is ABC7 ...


In [29]:
#tgt_year = 2021
#quarter = "Q4"

qtr_df = pd.read_excel(f'{tgt_year}_{quarter}_Stories.xlsx')
#//*** Trim unnamed First Column (original Index)
del qtr_df[qtr_df.columns[0]]

#qtr_df['search'] = qtr_df['StoryText'].str.lower()
qtr_df

Unnamed: 0,start_date,rundown,title_id,storyslug,duration,time,StoryText
0,2021-09-01,5PM Weekday,32556910,5PCOLDOPEN 1,9,0:09,[[QUOTE: we're going to start to see the conta...
1,2021-09-01,5PM Weekday,32556916,5P HELLO,7,0:07,"Good Evening, I'm Ama Daetz.\nand I'm Dan Ash..."
2,2021-09-01,5PM Weekday,32556923,5P HELLO FRAMES,2,0:02,"Good Evening, I'm Ama Daetz.\nand I'm Dan Ash..."
3,2021-09-01,5PM Weekday,32570152,5P DIXIE FIRE OC-VO,24,0:24,The massive Dixie Fire is showing no signs of ...
4,2021-09-01,5PM Weekday,32570960,5P TAHOE FIRE PKG,65,1:05,HERE ARE SOME OF THE WORDS WE HAVE HEAR FROM F...
...,...,...,...,...,...,...,...
7535,2021-12-01,8AM Saturday,34004914,STORY1,5,0:05,Good morning. I'm Liz Kreutz.\nABC7 News at 9 ...
7536,2021-12-01,5AM Sunday,34006163,5aTOSS oc,10,0:10,GOOD MORNING..\nIT'S [[ date]].\nYou're watchi...
7537,2021-12-01,6AM Weekday,34007603,6A HELLO,10,0:10,"\tGood Morning.\n\tIt's Friday, December 3rd.\..."
7538,2021-12-01,Program,34009179,6aTOSS oc,10,0:10,GOOD MORNING..\nIT'S [[ date]].\nThis is ABC7 ...


In [209]:
def harvest_terms(input_df,terms,**kwargs):
    
    post_story_count = 0
    post_filter = ""
    post_filter_field = ""
    post_filter_all = False
    format_title_id = True
    
    for key,value in kwargs.items():
        if key == "post_story_count":
            post_story_count = value
        
        if key == "post_filter":
            post_filter = value
        
        if key == "post_filter_field":
            post_filter_field = value
        
        if key == "post_filter_all":
            post_filter_all = value
        
        if key == "format_title_id":
            format_title_id = value
    
    harvest_field = 'StoryText'
    df = input_df.copy()
    df['search'] = df[harvest_field].str.lower()
    
    combined_dex = []
    
    #//*** Terms are an OR operation.
    #//*** Get index values for each term, combine all index results to build dataframe
    for term in terms:
        term = term.lower()
        combined_dex = combined_dex + list(df[df['search'].str.contains(term)].index)
        
    #print(combined_dex)
    df = input_df.loc[combined_dex]
    
    origdex = df.index
    #//*** gather adjacent stories
    if post_story_count > 0:
        
        
        newdex = []

        for index in df.index:
            newdex.append(index)

            for x in range(1,post_story_count+1):
                newdex.append(index+x)
        df = input_df.loc[newdex]
        df = df.drop_duplicates()  
    
        #//*** Check for Post Filters
        if len(post_filter) > 0:

            #//*** Validate post_filter_field
            if post_filter_field not in df.columns:
                print("Need valid post_filter_field.")
                print(f"post_filter_field={list(df.columns)}")
                return
            
            if post_filter_all:
                df = df[df[post_filter_field].str.contains(post_filter)]
            else:
                post_df = df[df[post_filter_field].str.contains(post_filter)]
                #print(list(origdex))
                #print(list(post_df.index))

                mergedex = sorted(list(origdex) + list(post_df.index))
                df = input_df.loc[mergedex]
    
    #//*** Post Filter Single Result
    else:
        #//*** Check for Post Filters
        if len(post_filter) > 0:

            #//*** Validate post_filter_field
            if post_filter_field not in df.columns:
                print("Need valid post_filter_field.")
                print(f"post_filter_field={list(df.columns)}")
                return
        
            df = df[df[post_filter_field].str.contains(post_filter)]
        
        
    
    #//*** Format title_id with a formatted header for easy copy and paste
    if format_title_id:
        #print(dir(df['start_date'].iloc[0]))
        months = df['start_date'].apply(lambda x: x.month_name())
        dow = df['start_date'].apply(lambda x: x.day_name())
        day = df['start_date'].apply(lambda x: x.day ).astype(str)
        year = df['start_date'].apply(lambda x: x.year ).astype(str)
        df['title_id'] = dow + ", " + months + " " + day + ", " + year + ", ABC7 News " 
        df['title_id'] = df['title_id'] + df['rundown'].str.replace("Weekday","")
        df['title_id'] = df['title_id'] + "(" + df['time'] + ")"
        
        dm = df['start_date'].apply(lambda x: x.month ).astype(str)
        
        df['start_date'] = dm + "-" + day + "-" + year
        
    
    df['StoryText'] = df['StoryText'].apply(lambda x: x.title())
        
    df = df.drop_duplicates()        
    return df

harvest_terms(qtr_df,["melanie woodrow","dan noyes"],
              post_story_count=0, 
              post_filter = "PKG",
              post_filter_field="storyslug",
              
              )

#babba_df = qtr_df[qtr_df['StoryText'].str.contains("building a better bay area")].copy()

#for index in babba_df.index:
    


Unnamed: 0,start_date,rundown,title_id,storyslug,duration,time,StoryText
165,9-2-2021,5PM Weekday,"Thursday, September 2, 2021, ABC7 News 5PM (2:18)",5P UCSF DOCTOR PKG,138,2:18,(Anchor: As The San Francisco Board Of Educati...
1496,9-16-2021,4PM Weekday,"Thursday, September 16, 2021, ABC7 News 4PM (3...",4P COCO SCHOOL VACCINE PKG,214,3:34,(Anchor: The Contra Costa County Health Office...
1590,9-18-2021,4PM Weekday,"Saturday, September 18, 2021, ABC7 News 4PM (2...",4P RELIGIOUS EXEMPTION PKG,132,2:12,((Anchor: More Than Half Of San Francisco'S Un...
1759,9-21-2021,4PM Weekday,"Tuesday, September 21, 2021, ABC7 News 4PM (2:04)",4P OAK CRIME OVERVIEW PKG,124,2:04,": For The Second Consecutive Year, The City Of..."
1860,9-22-2021,5PM Weekday,"Wednesday, September 22, 2021, ABC7 News 5PM (...",5P BAYVIEW RV PROBLEM PKG,139,2:19,"(Anchor: Human Waste, Garbage And Needles .. T..."
2088,9-24-2021,5PM Weekday,"Friday, September 24, 2021, ABC7 News 5PM (2:10)",5P ZOGG FIRE PKG,130,2:10,"(Anchor: Nearly One Year Ago, The Zogg Fire Cl..."
2337,9-26-2021,5PM Weekday,"Sunday, September 26, 2021, ABC7 News 5PM (2:48)",5P HATE CRIME ARREST PKG,168,2:48,(Anchor: The San Francisco District Attorney A...
2435,9-27-2021,4PM Weekday,"Monday, September 27, 2021, ABC7 News 4PM (2:27)",4P COVID AIR TRAVEL PKG,147,2:27,(Anchor: Travelers Could Be Looking At New Cov...
2636,9-29-2021,5PM Weekday,"Wednesday, September 29, 2021, ABC7 News 5PM (...",5P EVICTION MORITORIUM PKG,92,1:32,"""We Have Arrived At The Moment That Many Have ..."
2925,10-3-2021,5PM Weekday,"Sunday, October 3, 2021, ABC7 News 5PM (2:14)",5P FRUITVALE SHOOTING PKG,134,2:14,"On September 13Th At 3:22 In The Afternoon, An..."


In [210]:
out = {
    "Race and Justice" : harvest_terms(qtr_df,["race and social justice"], 
              post_story_count=3, 
              post_filter = "PKG", 
              post_filter_field="storyslug"
              ),
    "Education" : harvest_terms(qtr_df,["education reporter"],
              post_story_count=3, 
              post_filter = "PKG", 
              post_filter_field="storyslug"
              ),
    "7OYS" : harvest_terms(qtr_df,["Michael Finney"],
              post_story_count=3, 
              #post_filter = "VO", 
              #post_filter_field="storyslug"
              ),
    "Climate" : harvest_terms(qtr_df,["climate"],
              post_story_count=0, 
              post_filter = "PKG",
              post_filter_field="storyslug",
              #post_filter_all = True,
              ),
    "I-Team" : harvest_terms(qtr_df,["melanie woodrow","dan noyes"],
              post_story_count=0, 
              post_filter = "PKG",
              post_filter_field="storyslug",
              ),
    "BABBA" : harvest_terms(qtr_df,["building a better bay area"], 
              post_story_count=3, 
              post_filter = "PKG", 
              post_filter_field="storyslug"
              ),

    "COVID" : harvest_terms(qtr_df,["covid"],
              post_story_count=0, 
              post_filter = "PKG",
              post_filter_field="storyslug",
              #post_filter_all = True,
              ),

}



# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter(f'{tgt_year}_{quarter}_Collected_Stories.xlsx', engine='xlsxwriter')

for sheet,df in out.items():
    df.to_excel(writer,sheet_name=sheet)
#qtr_df.to_excel(writer,sheet_name='sheet1')
writer.save()

In [17]:

babba_df = qtr_df[qtr_df['StoryText'].str.contains("building a better bay area")].copy()
babba_df

#qtr_df[qtr_df['StoryText'].str.contains("race and culture")]['StoryText'].iloc[0]

race_df = qtr_df[qtr_df['StoryText'].str.contains("race and social justice")]
race_df

education_df = qtr_df[qtr_df['StoryText'].str.contains("education reporter")]
education_df

oys_df = qtr_df[qtr_df['StoryText'].str.contains("seven on your side")]
oys_df

covid_df = qtr_df[qtr_df['StoryText'].str.contains("covid")]
covid_df = covid_df[covid_df['storyslug'].str.contains("PKG")]
covid_df


climate_df = qtr_df[qtr_df['StoryText'].str.contains("climate")]
climate_df = climate_df[climate_df['storyslug'].str.contains("PKG")]
climate_df

iteam_df = qtr_df[qtr_df['StoryText'].str.contains("i-team")]
iteam_df = qtr_df[(qtr_df['StoryText'].str.contains("melanie woodrow")) | (qtr_df['StoryText'].str.contains("dan noyes"))]
iteam_df = iteam_df[iteam_df['storyslug'].str.contains("PKG")]
#iteam_df.loc[3501]['StoryText']
iteam_df
#tdf[tdf['duration']> 60]


Unnamed: 0,start_date,rundown,title_id,storyslug,duration,time,StoryText
209,2021-09-02,5PM Weekday,32591116,5P UCSF DOCTOR PKG,138,2:18,(anchor: as the san francisco board of educati...
1788,2021-09-16,4PM Weekday,32826770,4P COCO SCHOOL VACCINE PKG,214,3:34,(anchor: the contra costa county health office...
1824,2021-09-17,6PM Weekday,32827622,6P AFGHAN HELP PKG,114,1:54,((coldopen vo: one cell phone image tells the ...
1896,2021-09-18,4PM Weekday,32982082,4P RELIGIOUS EXEMPTION PKG,132,2:12,((anchor: more than half of san francisco's un...
2086,2021-09-21,4PM Weekday,32881173,4P OAK CRIME OVERVIEW PKG,124,2:04,": for the second consecutive year, the city of..."
2197,2021-09-22,5PM Weekday,32902361,5P BAYVIEW RV PROBLEM PKG,139,2:19,"(anchor: human waste, garbage and needles .. t..."
2505,2021-09-24,5PM Weekday,32939510,5P ZOGG FIRE PKG,130,2:10,"(anchor: nearly one year ago, the zogg fire cl..."
2793,2021-09-26,5PM Weekday,32998656,5P HATE CRIME ARREST PKG,168,2:48,(anchor: the san francisco district attorney a...
2906,2021-09-27,4PM Weekday,33016678,4P COVID AIR TRAVEL PKG,147,2:27,(anchor: travelers could be looking at new cov...
2921,2021-09-27,5PM Weekday,33016429,5P COVID AIR TRAVEL PKG,147,2:27,(anchor: travelers could be looking at new cov...
