# How to run cells:

Click the run button (▶) in the toolbar above, or press <kbd>Shift</kbd> + <kbd>Enter</kbd>.

## Cell 1: The Spreadsheets App 

Run the cell below and a file picker will appear. Open it and select your `.xlsx` or `.csv` files; duplicate rows in each file are detected automatically, displayed, and offered for download as CSV or Excel.

In [11]:
# Install openpyxl so we can read and save excel spreadsheets in pandas.
# The installs below only run when `micropip` can be imported.
# NOTE(review): micropip is presumably only available in Pyodide-based kernels
# (e.g. JupyterLite) — confirm.
print("Starting!\n\n")
try:
    import micropip
    # Top-level `await` and the `%pip` magic are notebook (IPython) syntax;
    # this cell is not runnable as a plain Python script.
    await micropip.install('ipywidgets')
    %pip install openpyxl
except ImportError:
    # micropip could not be imported: skip both installs. The imports that
    # follow then need ipywidgets and openpyxl to be installed already.
    pass
# Import the needed libraries. The key one here is pandas, which lets us operate on "DataFrames" (spreadsheet-like objects).
from pathlib import Path
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
from IPython.display import Markdown
import io
import base64
import traceback

def download_button(buffer, filename: str, button_description: str):
    """Build an HTML widget containing a button that downloads ``buffer``.

    The bytes read from ``buffer`` are base64-encoded and embedded directly in
    the link as a ``data:`` URI.

    Args:
        buffer: Binary file-like object. It is read from its current position
            to the end, so callers should ``seek(0)`` first if needed.
        filename: File name suggested to the browser for the download. Its
            extension (case-insensitive) selects the MIME type of the data
            URI; unknown extensions fall back to ``application/octet-stream``.
        button_description: Text shown on the button.

    Returns:
        A ``widgets.HTML`` instance wrapping the download link and button.
    """
    import html  # function-scope import keeps this block self-contained

    mime_by_extension = {
        "csv": "text/csv",
        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    }
    extension = filename.rsplit(".", 1)[-1].lower()
    mime_type = mime_by_extension.get(extension, "application/octet-stream")

    payload = base64.b64encode(buffer.read()).decode()

    # The file name originates from an uploaded file, so escape it (and the
    # label) before interpolating into markup; quotes would otherwise break
    # out of the `download` attribute.
    safe_filename = html.escape(filename, quote=True)
    safe_description = html.escape(button_description)

    html_button = f"""<html>
    <head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    </head>
    <body>
    <a download="{safe_filename}" href="data:{mime_type};base64,{payload}" >
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">{safe_description}</button>
    </a>
    </body>
    </html>
    """
    return widgets.HTML(html_button)


# Container that on_file_chosen fills with one Output widget per uploaded file.
vbox = widgets.VBox()
# Upload control restricted to Excel/CSV files; several may be picked at once.
file_picker = widgets.FileUpload(multiple=True, accept=".xlsx,.csv")

def on_file_chosen(change):
    """Rebuild the results area for the files currently held by ``file_picker``.

    For every uploaded file a fresh ``Output`` widget is appended to ``vbox``.
    Inside it the file is loaded without a header row, every row that occurs
    more than once is shown, and download buttons for those rows (CSV and
    Excel) are rendered. Errors for one file are printed in that file's output
    area and do not stop the remaining files from being processed.

    Args:
        change: Change notification passed by ``observe``. It is not used;
            the current selection is read from ``file_picker.value``, so
            ``None`` is accepted as well.
    """
    # Always drop results from a previous selection first.
    vbox.children = ()

    uploads = file_picker.value
    if not uploads:
        # Nothing selected (e.g. the initial call): no search to announce.
        return

    print("Searching files...")

    for file in uploads:
        out = widgets.Output()
        vbox.children += (out,)
        with out:
            try:
                print()
                display(Markdown(f"#### Loading file: {file.name}"))

                # Use pathlib so "my.data.csv" keeps its full stem ("my.data")
                # and "REPORT.XLSX" is recognised regardless of case.
                upload_path = Path(file.name)
                ext = upload_path.suffix.lower().lstrip(".")
                name = upload_path.stem

                if ext == "xlsx":
                    df = pd.read_excel(io.BytesIO(file.content), header=None, engine="openpyxl")
                elif ext == "csv":
                    df = pd.read_csv(io.BytesIO(file.content), header=None)
                else:
                    # print (not display) so the notice is shown as plain text
                    # rather than as a quoted string repr.
                    print(f"Skipping: {file.name}, unsupported format '.{ext}'\n")
                    continue

                print("Duplicates:")
                # keep=False marks every member of a duplicate group, not just
                # the second and later occurrences.
                duplicates = df[df.duplicated(keep=False)]
                display(duplicates)
                print(f"Total duplicate rows: {len(duplicates.index)} out of {len(df.index)}")

                # The files were read with header=None, so the column labels
                # are just positions; do not write them back out.
                csv = io.BytesIO()
                duplicates.to_csv(csv, header=False)
                csv.seek(0)
                display(download_button(csv, f"{name}-duplicates.csv", "Download as CSV"))

                excel = io.BytesIO()
                duplicates.to_excel(excel, header=False)
                excel.seek(0)
                display(download_button(excel, f"{name}-duplicates.xlsx", "Download as Excel"))
            except Exception as e:
                # Per-file boundary: report the failure and carry on with the
                # next upload.
                print("An error occured:", repr(e))
                traceback.print_exc()
          
# Re-run the duplicate search whenever the set of uploaded files changes.
file_picker.observe(on_file_chosen, names="value")
# Run the handler once up front with the (still empty) selection.
on_file_chosen(None)
print("Select your files using the file selector below:")
# Show the upload control with the results container stacked beneath it.
display(widgets.VBox(children=[file_picker, vbox]))

Starting!


Searching files...
Select your files using the file selector below:


VBox(children=(FileUpload(value=(), accept='.xlsx,.csv', description='Upload', multiple=True), VBox()))