In [23]:
from PIL import Image, ImageDraw
from pytesseract import pytesseract
import numpy as np
import cv2
from matplotlib import pyplot as plt
import pandas as pd
import re
from ediblepickle import checkpoint
#from string import Template
import pickle
import os

In [2]:
def convert_to_bw(image):
    """
    Converts images to black and white instead of colored, which helps
    tesseract read them
    """
    monocolor = image.convert('L')
    return monocolor.point(lambda x: 0 if x < 240 else 255, '1')

In [14]:
def get_snorkel_text(image_path):
    """
    Gets text (specifically, date, decription, and numbers) from snorkel reports
    and returns them as a list of text lines. There are two formats (.jpg/jpeg and .png), which have
    slightly different placements data placements. 
    
    Input: image file
    Returns: Raw text output from tesseract in the form:
            (date, description, [number 1, number 2, number 3])
    """
    img = Image.open(image_path)
    height, width = img.size
#    print(image_path)
    
    # Tesseract doesn't read columns of numbers well, so numbers are cropped into seperate boxes.
    
    if '.png' in image_path:
        num_left = width * 75 / 100
        num_right = width * 98 / 100

        num1_top = width * 25 / 100
        num1_bottom = width * 42 / 100

        num2_top = width * 45 / 100
        num2_bottom = width * 62 / 100

        num3_top = width * 66 / 100
        num3_bottom = width * 83 / 100

        text_left = 0
        text_right = width * 74.5 / 100
        text_top = width * 21 / 100
        text_bottom = width * 84 / 100

        date_left = width * 52 /100
        date_right = width * 99 / 100
        date_top = height * 5 / 100
        date_bottom = height * 12 / 100
        
    else:
        num_left = width * 74.5 / 100
        num_right = width * 88 / 100

        num1_top = width * 18 / 100
        num1_bottom = width * 35 / 100

        num2_top = width * 38 / 100
        num2_bottom = width * 55 / 100

        num3_top = width * 58 / 100
        num3_bottom = width * 75 / 100

        text_left = 0
        text_right = num_left
        text_top = width * 18 / 100
        text_bottom = width * 80 / 100

        date_left = width * 11.5 /100
        date_right = width * 50 / 100
        date_top = height * 12 / 100
        date_bottom = text_top
    
    im_description = img.crop((text_left, text_top, text_right, text_bottom))
    im_1_number = img.crop((num_left, num1_top, num_right, num1_bottom))
    im_2_number = img.crop((num_left, num2_top, num_right, num2_bottom))
    im_3_number = img.crop((num_left, num3_top, num_right, num3_bottom))
    im_date = img.crop((date_left, date_top, date_right, date_bottom))
    
 
    im_text_description = pytesseract.image_to_string(im_description).strip()
    im_text_date = pytesseract.image_to_string(im_date, config='--psm 7').strip()
    
    # Limit tesseract readings to numbers and period and only reads one line (or char?) of text
    # Results in a more accurate text conversion
    num_config = "--psm 10 -c tessedit_char_whitelist=0123456789."
    im_text_1number = pytesseract.image_to_string(convert_to_bw(im_1_number), config=num_config).strip()
    im_text_2number = pytesseract.image_to_string(convert_to_bw(im_2_number), config=num_config).strip()
    im_text_3number = pytesseract.image_to_string(convert_to_bw(im_3_number), config=num_config).strip()
    
    return im_text_date, im_text_description, [im_text_1number, im_text_2number, im_text_3number]

In [4]:
def get_snorkel_text_new_format(image_path):
    """
    Gets text (specifically, date, decription, and numbers) from the new format snorkel reports
    and returns them as a list of text lines. 
    
    Input: image file
    Returns: Raw text output from tesseract in the form:
            (date, description, [number 1, number 2, number 3])
    """
    img = Image.open(image_path)
    height, width = img.size
    
    # Tesseract doesn't read columns of numbers well, so numbers are cropped into seperate boxes.
#     num_left = width * 78 / 100
#     num_right = width * 94 / 100

#     num1_top = width * 27 / 100
#     num1_bottom = width * 40 / 100

#     num2_top = width * 47 / 100
#     num2_bottom = width * 60 / 100

#     num3_top = width * 68 / 100
#     num3_bottom = width * 81 / 100

    num_left = width * 75 / 100
    num_right = width * 98 / 100

    num1_top = width * 25 / 100
    num1_bottom = width * 42 / 100

    num2_top = width * 45 / 100
    num2_bottom = width * 62 / 100

    num3_top = width * 66 / 100
    num3_bottom = width * 83 / 100

    text_left = 0
    text_right = width * 74.5 / 100
    text_top = width * 21 / 100
    text_bottom = width * 84 / 100

    date_left = width * 56 /100
    date_right = width * 98 / 100
    date_top = height * 5 / 100
    date_bottom = height * 12 / 100

    im_description = img.crop((text_left, text_top, text_right, text_bottom))
    im_1_number = img.crop((num_left, num1_top, num_right, num1_bottom))
    im_2_number = img.crop((num_left, num2_top, num_right, num2_bottom))
    im_3_number = img.crop((num_left, num3_top, num_right, num3_bottom))
    im_date = img.crop((date_left, date_top, date_right, date_bottom))
    
    im_text_description = pytesseract.image_to_string(im_description).strip()
    im_text_date = pytesseract.image_to_string(im_date, config='--psm 7').strip()
    
    # Limit tesseract readings to numbers and period and only reads one line (or char?) of text
    # Results in a more accurate text conversion
    num_config = "--psm 10 -c tessedit_char_whitelist=0123456789."
    im_text_1number = pytesseract.image_to_string(convert_to_bw(im_1_number), config=num_config).strip()
    im_text_2number = pytesseract.image_to_string(convert_to_bw(im_2_number), config=num_config).strip()
    im_text_3number = pytesseract.image_to_string(convert_to_bw(im_3_number), config=num_config).strip()
    
    return im_text_date, im_text_description, [im_text_1number, im_text_2number, im_text_3number]

In [25]:
get_snorkel_text_new_format(cmd_path + 'new_format.png')

('Saturday, March 11 2023',
 "Northwest (Napili, Kapalua, Honolua)\ne Smallest waves inside Kapalua Bay\n\ne Powerful waves & currents today\n\ne Mostly sunny with light winds\n\nKa'anapali (Black Rock, Kahekili-Airport Beach)\ne Kahekili & Black Rock both fantastic!\n\ne Still some surf along the shoreline\n\ne Sunny skies & calm winds likely\n\nSouth Shore (Olowalu, Kihei, Makena Landing)\ne Mile Marker 14 is the most calm\n\ne Biggest waves in Makena & Wailea\n\ne Sunshine & fairly calm winds expected",
 ['2.0', '7.5', '4.0'])

In [5]:
def img_output_to_list(image_to_text):
    """
    inputs description text into a formatted list of lists. Inner list has the format
    columns = ['date', south_description', 'south_rating', 'kaanapali_description', 'kaanapali_rating',
                'northwest_description', 'northwest_rating']
    
    """
    im_text_date, im_text_description, im_numbers = image_to_text
    # split_descripts = im_text_description.split('\n\n')
    
    # Assigns numbers and descriptions to each region according to their index within the image description
    # Find kaanapali via 'anapa' due to spelling error in images
    region_list = [(im_text_description.find('South'), 'south'), (im_text_description.find('anapa'), 'kaanapali'), \
                   (im_text_description.find('West' or 'North'), 'northwest') ]
    
    region_score = {}
    for i, region in enumerate(sorted(region_list, key=lambda x: x[0])):
        region_score[region[1]] = im_numbers[i]
    
    # Remove misc characters from date
    #print(im_text_date)
#    date = re.findall(r"([MTWFS]\S+day, [JFMASOND]\S+ \d\d? 20\d\d)", im_text_date)[0]
#    date = re.findall(r"([JFMASOND]\S+ \d\d? 20\d\d)", im_text_date)[0]
    
    return im_text_date, im_text_description, region_score['south'], region_score['kaanapali'], region_score['northwest']
    

In [6]:
def get_months_data(cmd_path, month, year):
    if os.path.isfile(cmd_path + 'cache/' + str(year) + '_' + month + '.p'):
        with open(cmd_path + 'cache/' + str(year) + '_' + month + '.p', 'rb') as f:
            return pickle.load(f)
    
    data = []
    path = cmd_path + "Maui_Snorkel_Report_" + str(year) + '/' + month + '/'
    files = os.listdir(path)
    
    for file in files:
        if not file.startswith('.'):
            data.append(img_output_to_list(get_snorkel_text(path + file)))
        
    with open(cmd_path + 'cache/' + str(year) + '_' + month + '.p', 'wb') as f:
        pickle.dump(data, f)
              
    return data

In [79]:
def get_months_data_new_format(cmd_path, month, year):
    if os.path.isfile(cmd_path + 'cache/' + str(year) + '_' + month + '.p'):
        with open(cmd_path + 'cache/' + str(year) + '_' + month + '.p', 'rb') as f:
            return pickle.load(f)
    
    data = []
    path = cmd_path + "Maui_Snorkel_Report_" + str(year) + '/' + month + '/'
    files = os.listdir(path)
    
    for file in files:
        if not file.startswith('.'):
            data.append(img_output_to_list(get_snorkel_text_new_format(path + file)))
        
    with open(cmd_path + 'cache/' + str(year) + '_' + month + '.p', 'wb') as f:
        pickle.dump(data, f)
              
    return data

In [6]:
cmd_path = r"/Users/jkharada/Documents/Data_incubator/Capstone_project/snorkel_reports/"
old_years = [2017, 2018, 2019, 2020, 2021, 2022]
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
new_years = [2022, 2023]

data = []
# Get data in the old format
for yr in old_years:
    for mo in months:
        data += (get_months_data_old_format(cmd_path, mo, yr))
        
# Get data in new format

In [7]:
df = pd.DataFrame(data, columns=['date', 'description', 'south_rating', 'kaanapali_rating', 'northwest_rating'])
df_reduced = df[['date','south_rating','kaanapali_rating','northwest_rating']]
df_reduced.drop(list(range(2032, 2095)), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced.drop(list(range(2032, 2095)), inplace=True)


In [None]:
df_reduced.drop([2012, 2013, 2014,2015, 2018, 2021, 2024, 2027, 2028], inplace = True)
df_reduced[2000:]

In [28]:
new_data = []

for yr in new_years:
    for mo in months:
        new_data += (get_months_data_new_format(cmd_path, mo, yr))

In [None]:
#df_new_data = pd.DataFrame(new_data, columns=['date', 'description', 'south_rating', 'kaanapali_rating', 'northwest_rating'])
df_new_reduced = df_new_full[['date','south_rating','kaanapali_rating','northwest_rating']]
df_new_reduced

In [None]:
df_new_data.drop(list(range(278,365)), inplace=True)
df_new_full = pd.concat([df_new_data, df_oct_2022, df_nov_2022, df_dec_2022])

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(df_new_full)

In [78]:
df_new_full.to_csv(cmd_path + 'new_format_data_Jan2022_current.csv', index=False)

In [None]:
data_2016 = []
months = ['June', 'July', 'August', 'September', 'October', 'November', 'December']
#path = cmd_path + "Maui_Snorkel_Report_2016/" + month + '/'
#files = os.listdir(path)

for mo in months:
    data_2016 += (get_months_data(cmd_path, mo, 2016))

In [64]:
dec_2022_data = get_months_data_new_format(cmd_path, 'December', 2022)

In [73]:
df_dec_2022 = pd.DataFrame(dec_2022_data, columns=['date', 'description', 'south_rating', 'kaanapali_rating', 'northwest_rating'])
#df_oct_2022 = df_oct_2022[['date', 'south_rating', 'kaanapali_rating', 'northwest_rating']]
#df_oct_2022

In [17]:
df_2016 = pd.DataFrame(data_2016, columns=['date', 'description', 'south_rating', 'kaanapali_rating', 'northwest_rating'])
df_2016.to_csv(cmd_path+'data_2016.csv', index=False)

In [43]:
'.png' in '/Users/jkharada/Documents/Data_incubator/Capstone_project/snorkel_reports/Maui_Snorkel_Report_2022/October/OCTOBER-17-2022.jpg'

False

In [12]:
cmd_path = r"/Users/jkharada/Documents/Data_incubator/Capstone_project/snorkel_reports/"

old_format_full = pd.read_csv('/Users/jkharada/Documents/Data_incubator/Capstone_project/old_format_dataset.csv')

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(df_2016)

In [108]:
# import hand edited dataset (edited to fix dates, some missing values)

full_dataset = pd.read_csv(cmd_path + 'full_dataset_hand_edited.csv')

In [109]:
# Get uniformly formatted dates (those in datasets have misc periods and extra values, mispellings, missing spaces, etc)
full_dataset['parsed_date'] = full_dataset['date'].apply(lambda x: ' '.join(list(re.findall(r"([JFMASOND][a-z]+),? ?(\d\d?).?,? ?(20\d\d)", x)[0]))
                                if re.search(r"([JFMASOND][a-z]+),? ?(\d\d?).?,? ?(20\d\d)", x) else None)

In [110]:
full_dataset['cleaned_south_rating'] = full_dataset['south_rating'].apply(lambda x: x/10 if x > 10 else x)
full_dataset['cleaned_kaanapali_rating'] = full_dataset['kaanapali_rating'].apply(lambda x: x/10 if x > 10 else x)
full_dataset['cleaned_northwest_rating'] = full_dataset['northwest_rating'].apply(lambda x: x/10 if x > 10 else x)

In [111]:
#rename columns and standardize data
cleaned_usable_dataset = full_dataset[['parsed_date', 'cleaned_south_rating', 'cleaned_kaanapali_rating', 'cleaned_northwest_rating']].dropna()
cleaned_usable_dataset.rename(columns= {'parsed_date':'date', 'cleaned_south_rating':'south_rating', 
                                        'cleaned_kaanapali_rating':'kaanapali_rating', 'cleaned_northwest_rating':'northwest_rating'},
                             inplace=True)
cleaned_usable_dataset['date'] = pd.to_datetime(cleaned_usable_dataset['date'])

In [113]:
#output cleaned data to csv
cleaned_usable_dataset.to_csv(cmd_path+'cleaned_usable_dataset.csv', index=False)

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(cleaned_usable_dataset)