# <center><b>RSS Cache Detection Challenge</b></center>


## **Normal publish times**

In [None]:
import pandas as pd
import re
import os
import glob

from pathlib import Path

# Directory holding one CSV export per feed, and the file the summary is written to.
path = r"YourFilepath\RSSCacheDetection\ACCOUNT_ARTICLE_SOURCES\NORMAL"
filepath = Path(r"YourFilepath\companyx_normal.csv")

# Position of the creation-time column inside each CSV.
# NOTE(review): the column is selected by position; presumably this is the
# `time_created` column -- confirm against the data, then select it by name.
TIME_COLUMN_INDEX = 2

# A feed is only reported as cached when its interval is strictly greater than
# this many seconds.
# NOTE(review): an interval of exactly 600 s is rejected -- confirm whether the
# intended rule is "> 600" or ">= 600".
MIN_CACHING_INTERVAL = 600

# The feed identifier is the part of the file name between "sources_" and ".csv".
FEED_ID_PATTERN = re.compile(r"sources_(.*)\.csv$")


def detect_caching(timestamps, min_interval=MIN_CACHING_INTERVAL):
    """Decide whether a feed's creation times look like they come from a cache.

    A feed is considered cached when every gap between its distinct creation
    times is an exact multiple of the smallest gap, and that smallest gap is
    larger than ``min_interval``.

    Args:
        timestamps: iterable of numeric creation times (duplicates allowed,
            any order).
        min_interval: smallest gaps up to and including this value are not
            treated as caching.

    Returns:
        A ``(cached, caching_interval)`` tuple. ``caching_interval`` is the
        smallest gap when ``cached`` is True and 0 otherwise.
    """
    unique_times = sorted(set(timestamps))

    # Fewer than two distinct creation times: there is no gap to measure.
    if len(unique_times) < 2:
        return False, 0

    # Every pairwise difference is a sum of adjacent gaps, so checking the
    # adjacent gaps of the sorted values is equivalent to checking all pairs.
    gaps = [later - earlier for earlier, later in zip(unique_times, unique_times[1:])]
    smallest_gap = min(gaps)

    if any(gap % smallest_gap != 0 for gap in gaps):
        return False, 0

    # Too short to count as caching; report 0 like every other "not cached" case.
    if smallest_gap <= min_interval:
        return False, 0

    return True, smallest_gap


def feed_identifier_from(csv_path):
    """Return the feed identifier encoded in a ``sources_<id>.csv`` file name.

    Only the file name is inspected, never the directory part. When the name
    does not follow the ``sources_<id>.csv`` pattern, the name without its
    extension is returned instead.
    """
    file_name = os.path.basename(csv_path)
    match = FEED_ID_PATTERN.search(file_name)
    if match:
        return match.group(1)
    return os.path.splitext(file_name)[0]


# collect every csv file in the source directory
csv_files = glob.glob(os.path.join(path, "*.csv"))

# result table: column name -> list of values, one entry per feed
table = {'feed_identifier': [], 'Considered to be Cached?': [], 'Caching Interval (secs)': []}

for f in csv_files:
    try:
        # missing values are dropped so they cannot take part in the gap arithmetic
        time_created = pd.read_csv(f).iloc[:, TIME_COLUMN_INDEX].dropna().tolist()
    except ValueError:
        # an unreadable or empty file yields no creation times -> not cached
        time_created = []

    cached, caching_interval = detect_caching(time_created)

    table["feed_identifier"].append(feed_identifier_from(f))
    table["Considered to be Cached?"].append(cached)
    table["Caching Interval (secs)"].append(caching_interval)

# convert table from dictionary to pandas dataframe
df = pd.DataFrame.from_dict(table)

# export table as file to given file path
df.to_csv(filepath, index=False)
df

## **Bad publish times**

In [None]:
import pandas as pd
import re
import os
import glob

from pathlib import Path

# Directory holding one CSV export per feed, and the file the summary is written to.
path = r"YourFilepath\RSSCacheDetection\ACCOUNT_ARTICLE_SOURCES\BAD_PUBLISH_TIME"
filepath = Path(r"YourFilepath\companyx_bad.csv")

# Position of the creation-time column inside each CSV.
# NOTE(review): the column is selected by position; presumably this is the
# `time_created` column -- confirm against the data, then select it by name.
TIME_COLUMN_INDEX = 2

# A feed is only reported as cached when its interval is strictly greater than
# this many seconds.
# NOTE(review): an interval of exactly 600 s is rejected -- confirm whether the
# intended rule is "> 600" or ">= 600".
MIN_CACHING_INTERVAL = 600

# The feed identifier is the part of the file name between "sources_" and ".csv".
FEED_ID_PATTERN = re.compile(r"sources_(.*)\.csv$")


def detect_caching(timestamps, min_interval=MIN_CACHING_INTERVAL):
    """Decide whether a feed's creation times look like they come from a cache.

    A feed is considered cached when every gap between its distinct creation
    times is an exact multiple of the smallest gap, and that smallest gap is
    larger than ``min_interval``.

    Args:
        timestamps: iterable of numeric creation times (duplicates allowed,
            any order).
        min_interval: smallest gaps up to and including this value are not
            treated as caching.

    Returns:
        A ``(cached, caching_interval)`` tuple. ``caching_interval`` is the
        smallest gap when ``cached`` is True and 0 otherwise.
    """
    unique_times = sorted(set(timestamps))

    # Fewer than two distinct creation times: there is no gap to measure.
    if len(unique_times) < 2:
        return False, 0

    # Every pairwise difference is a sum of adjacent gaps, so checking the
    # adjacent gaps of the sorted values is equivalent to checking all pairs.
    gaps = [later - earlier for earlier, later in zip(unique_times, unique_times[1:])]
    smallest_gap = min(gaps)

    if any(gap % smallest_gap != 0 for gap in gaps):
        return False, 0

    # Too short to count as caching; report 0 like every other "not cached" case.
    if smallest_gap <= min_interval:
        return False, 0

    return True, smallest_gap


def feed_identifier_from(csv_path):
    """Return the feed identifier encoded in a ``sources_<id>.csv`` file name.

    Only the file name is inspected, never the directory part. When the name
    does not follow the ``sources_<id>.csv`` pattern, the name without its
    extension is returned instead.
    """
    file_name = os.path.basename(csv_path)
    match = FEED_ID_PATTERN.search(file_name)
    if match:
        return match.group(1)
    return os.path.splitext(file_name)[0]


# collect every csv file in the source directory
csv_files = glob.glob(os.path.join(path, "*.csv"))

# result table: column name -> list of values, one entry per feed
table = {'feed_identifier': [], 'Considered to be Cached?': [], 'Caching Interval (secs)': []}

for f in csv_files:
    try:
        # missing values are dropped so they cannot take part in the gap arithmetic
        time_created = pd.read_csv(f).iloc[:, TIME_COLUMN_INDEX].dropna().tolist()
    except ValueError:
        # an unreadable or empty file yields no creation times -> not cached
        time_created = []

    cached, caching_interval = detect_caching(time_created)

    table["feed_identifier"].append(feed_identifier_from(f))
    table["Considered to be Cached?"].append(cached)
    table["Caching Interval (secs)"].append(caching_interval)

# convert table from dictionary to pandas dataframe
df = pd.DataFrame.from_dict(table)

# export table as file to given file path
df.to_csv(filepath, index=False)
df