In [1]:
#import libraries / dependencies
import openpyxl
from openpyxl import load_workbook, Workbook
import csv
from pyexcel.cookbook import merge_all_to_a_book, merge_csv_to_a_book
import os
import re

In [2]:
#FUNCTIONS BLOCK

#get_filenames: 
#Arguments: raw data filepath (absolute)
#Returns: A tuple of strings representing the files that will be used to 
#represent the data at each step of refinement
def get_filenames(rdfile):
    #EXAMPLE '/Users/default/Downloads/Videology Reports/TruOptik - Monthly Usage - June 2018.xlsx'
    rd = rdfile.split('/')
    path = '/'.join(rd[:-1])
    extractedfile = path+'/Extracted-'+rd[-1][:-4]+'csv'
    correctedfile = path+'/Corrected-'+rd[-1][:-4]+'csv'
    finalfile = path+'/Final-'+rd[-1]
    correctedfile_excel = path+'/Corrected-'+rd[-1]
    return extractedfile, correctedfile, finalfile, correctedfile_excel

#replacements:
#Argument: the videology category (string)
#Returns: the category with all necessary corrections (string)
#Basically there are duplicates in the categories that are *slightly* different
#and this takes care of those. it's pretty ugly but I haven't thought of a better way yet
def replacements(category):
    category = category.replace('Financial','Financial/Insurance Attributes & Behaviors')
    category = category.replace('Financial/Insurance Attributes & Behaviors/Insurance Attributes & Behaviors','Financial/Insurance Attributes & Behaviors')
    category = category.replace('"','')
    category = category.replace('Home & Garden','Home Imp/Décor/Home & Garden/Home&Fam/DIY').replace('Home Imp/Décor/Home Imp/Décor/Home & Garden/Home&Fam/DIY/Home&Fam/DIY','Home Imp/Décor/Home & Garden/Home&Fam/DIY')
    category = category.replace('Demos','Demo')
    category = category.replace('Sports & Outdoors','Sporting & Healthy Living + Sporting Goods/Outdoor')
    category = category.replace('Education/Career','Education & Career')
    category = category.replace('Entertainment/Media','Entertainment & Media')
    category = category.replace('Fashion & Style', 'Fashion/Style; Apparel; Accessories')
    category = category.replace('Health','Health & Fitness/Wellness').replace('Health & Fitness/Wellness & Fitness/Wellness','Health & Fitness/Wellness')
    return category

In [3]:
#BLOCK 1: EXTRACT THE ATTRIBUTES, IDS, AND USAGE DATA FROM THE EXCEL SHEET and get unique values. Then store in CSV

#absolute filepath to raw_data (Videology Report)
raw_data = '/Users/default/Downloads/Videology Reports/filedump/TruOptik - Monthly Usage - April 2018.xlsx'

#This program will create the edited files in the same location as the original, so look there for the data
extractedfile, correctedfile, finalfile, correctedfile_excel = get_filenames(raw_data)

#use openpyxl library to create a workbook object from the xlsx file so we can work w it
rd_wb = load_workbook(raw_data)

#get the worksheet with the data
#usually the sheet is named 'Behavioral Cost', but sometimes there is a trailing space or its named something else
#because Videology is allergic to consistency. Either way changing the string will get you the desired worksheet
rd_ws = rd_wb['Behavioral Cost']

#the header has the names for each column in the worksheet
header = [x.value for x in list(rd_ws.rows)[0]]
print(header)

#we want the index of the columns for the following: 
#Retargeting Attribute
#Attribute ID (sometimes called External Retargeting ID so just change string)
#Usage (sometimes called Billable Usage)
ra_ind = header.index('Retargeting Attribute')
erid_ind= header.index('Attribute ID')
usage_ind = header.index('Usage')

print([ra_ind, erid_ind, usage_ind])

#The following block of code creates a dictionary of unique attributes
#Dictionary format is "unique_records[attribute] = usage"

unique_records = {}
for item in [[i.value for i in x] for x in rd_ws.iter_rows(min_row=2)]:
    #edit the row if it exists
    if item[ra_ind] is not None: 
        row = [item[ra_ind].replace(',',';').replace('  ',' '),item[erid_ind],item[usage_ind]]
    else:
        pass
    #check for membership in the dictionary keys
    if row[0] in list(unique_records.keys()):
        #if the usage value is an int, add it to the dictionary (this gets rid of excel formulas)
        if isinstance(item[usage_ind],int):
            unique_records[row[0]] += int(item[usage_ind])
    #otherwise create new entry
    elif row[0] not in list(unique_records.keys()):
        if isinstance(item[usage_ind],int):
            unique_records[row[0]] = int(item[usage_ind])

#write this unique record data to a csv
extracted_data = open(extractedfile,'w')
for kv in unique_records.items():
    write = csv.writer(extracted_data)
    write.writerow(kv)
extracted_data.close()

['Provider', 'Country', 'Retargeting Attribute', 'Attribute ID', 'Device Type', 'Format Type', 'CPU', 'Usage', 'Data Costs', 'Rate Adjustment', 'Adjusted Cost (PC/Mobile)']
[2, 3, 7]


In [4]:
#BLOCK 1A: IF THE DATA ISN'T IN A FRIENDLY FORMAT IN THE RAW DATA, INSTEAD USE A SLIGHTLY EDITED VERSION AND CONTINUE FROM THERE
#This block was written because in 1 videology report they had the attribute separated into multiple columns instead of their
#normal format
refined_data = '/Users/default/Documents/PythonMisc/Videology Reports - Completed/Edited - Dec2017Categories copy.xlsx'
wb = load_workbook(refined_data)
ws = wb.active
category_dict = {}
for line in [[i.value for i in x] for x in ws.iter_rows()]:
    if line[1].strip() in list(category_dict.keys()):
        category_dict[line[1].strip()] += int(line[2])
    else:
        category_dict[line[1].strip()] = int(line[2])
for key, value in category_dict.items():
    index = list(category_dict.keys()).index(key) + 1
    ws.cell(row = index, column = 4).value = key
    ws.cell(row = index, column = 5).value = value
wb.save(refined_data)

In [5]:
#BLOCK 2: CATEGORIZE THE UNIQUE VALUES USING PREVIOUS DATA
#load reference file and create a list of lists of its values

#this block loads an excel file and creates a dictionary in the form: dict[attribute] = category
#with this dictionary we take care of a lot of repeated values so recategorization is easier 
#some new attributes won't be in the database so they might be blank but i'm working on fixing that with another method
db_path = '/Users/default/Documents/PythonMisc/VideologyDatabase.xlsx'
db_file = load_workbook(db_path)
db_ws = db_file.active
db_contents = {}
for attribute, category in [[i.value for i in x] for x in db_ws.iter_rows(max_col=2)]:
    db_contents[attribute] = category
    
print(len(db_contents))

#load file with missing categories and create list of values
edit_file = open(extractedfile,'r')
corrected_vals = []
for line in edit_file:
    #preprocessing the line
    edited_line = line.replace('"','').strip().split(',',1)
    attribute = edited_line[0]
    usage = edited_line[1]
    #add to corrected values if it is in the database otherwise leave it blank
    try:
        corrected_vals.append([attribute,db_contents[attribute], usage]) 
    except KeyError:
        corrected_vals.append([attribute,'',usage])
        
#write to another file
corrected_file = open(correctedfile,'w')
for value in corrected_vals:
    write = csv.writer(corrected_file)
    write.writerow(value)
corrected_file.close()

1410


In [6]:
#BLOCK 3: CREATE DICTIONARY BASED ON UNIQUE CATEGORIES AND WRITE TO XL FILE
#ONLY RUN THIS BLOCK AFTER EDITS HAVE BEEN MADE TO CSV CREATED IN PREVIOUS BLOCK

#convert corrected csv to excel file so we can write into it
merge_csv_to_a_book([correctedfile],correctedfile_excel)


#create dictionary
category_dict = {}
corrected_csv = open(correctedfile)
for line in corrected_csv:
    #more preprocessing
    line = line.replace('"""Fashion/Style, Apparel, Accessories"""', ',Fashion/Style; Apparel; Accessories').replace(',,',',').strip()
    #Some categories have commas which fuck up reading the csv so i replaced them w semicolons
    if len(line.split(',')) > 3:
        line = line.replace('Style,','Style;').replace('Apparel,','Apparel;')
    line = line.split(',')
    
    attribute,category = line[:2]
    usage=int(line[2])
    category = replacements(category)
    
    #same dictionary logic as before (should probably be a fcn)
    if category in list(category_dict.keys()):
        category_dict[category] += usage
    elif category not in list(category_dict.keys()):
        category_dict[category] = usage

#now write to an excel file with the dictionary
wb = load_workbook(correctedfile_excel)
ws = wb.active
for key, value in category_dict.items():
    index = list(category_dict.keys()).index(key)+1
    ws.cell(row = index, column = 6).value = key
    ws.cell(row = index, column = 7).value = value

wb.save(finalfile)




In [39]:
#SIDE PROJECT: CREATE A DATABASE OF VALUES THAT HAVE CORRECT LABELS
#RUN ONLY ONCE: THIS WILL OVERWRITE THE CURRENT DATABASE
folder_path = '/Users/default/Documents/PythonMisc/Videology Reports - Completed'
folder_contents = os.listdir(folder_path)

db_path = '/Users/default/Documents/PythonMisc/VideologyDatabase.xlsx'
db_file = load_workbook(db_path)
db_ws = db_file.active
db_contents = {}
pattern = '^[Ee]'

print(len(db_contents))
for attribute, category in [[i.value for i in x] for x in db_ws.iter_rows(max_col=2)]:
    db_contents[attribute] = category

for file in folder_contents:
    if re.match(pattern, file):
        wb = load_workbook(folder_path + '/' + file)
        ws = wb.active

        #only want the first 2 columns (attribute and category)
        for attribute, category in [[i.value for i in x] for x in ws.iter_rows(max_col=2)]:
            if attribute is None:
                attribute = 'TEST'
                category = 'TEST'
            if '>' == attribute[-1]:
                attribute = attribute[:-2]
            try:
                category = replacements(category)
            except:
                print('no bueno')
            db_contents[attribute] = category

print(len(db_contents))

#Now, we write the completed dictionary to the database file again
for i, (key, value) in enumerate(db_contents.items()):
    db_ws.cell(row=i+1, column = 1).value = key
    db_ws.cell(row=i+1, column = 2).value = value
    
db_file.save(db_path)

0


NameError: name 'pattern' is not defined