In [2]:
import pandas as pd
import numpy as np
import re

### Clean and standardize the data

In [18]:
file_path = "ufo_data.csv"
df = pd.read_csv(file_path)

df_dropped_nas = df.dropna(subset=["shape", "duration", "occurred_date_time"])


# Cleaning function takes in dataframe, column, and patterns to filter out worthless and/or inconsistent values
# Function also standardizes the date and filters out the time in "occurred_date_time" column
def clean_dataframe(df, column_name, patterns):
    for pattern in patterns:
        df = df[~df[column_name].str.contains(pattern, regex=True, na=False)]

    df[column_name] = pd.to_datetime(df[column_name], errors="coerce", format="%m/%d/%y %H:%M")

    df = df.dropna(subset=[column_name])

    df[column_name] = df[column_name].dt.strftime('%m/%d/%y')
    
    return df

column_name = "occurred_date_time"
patterns = [r'\?', r'unknown', r'&', r'ongoing']

df_cleaned_dates = clean_dataframe(df_dropped_nas, column_name, patterns)

# Cleaning function to standardize time measurement in "duration" column to seconds
def clean_duration(df, column_name, filter_values):
    pattern = "|".join(filter_values)
    df_cleaned = df[~df[column_name].str.contains(pattern, case=False, na=False)]

    def convert_to_seconds(duration):
    
        duration = duration.lower()

        match = re.match(r'(\d+)\s*(seconds|minutes|hours?)', duration)

        if match:
            value = int(match.group(1))
            unit = match.group(2)

            if 'second' in unit:
                return value
            elif 'minute' in unit:
                return value * 60
            elif 'hour' in unit:
                return value * 3600
        return None
        
    df_cleaned.loc[:, column_name] = df_cleaned[column_name].apply(convert_to_seconds)
    df_cleaned = df_cleaned.dropna(subset=[column_name])
    return df_cleaned

filter_values = ["months", "years", "constant", "few", "ongoing"]

df_cleaned_duration = clean_duration(df_cleaned_dates, "duration", filter_values)

# Cleaning function to filter out non-US locations
def clean_city(df, column_name, filter_values):
    pattern = "|".join(filter_values)
    df = df[~df[column_name].str.contains(pattern, case=False, regex=True, na=False)]

    df = df.dropna(subset=[column_name])

    return df

city_filters = ["(Canada)", "(Portugal)", "(Spain)", "(Germany)", "(UK)", "(England)", "(Brazil)", "(Ecuador)", 
                "(Estonia)", "(Puerto Rico)", "(South Africa)", "(France)", "(Philippines)", "(Nigeria)", 
                "(Holland)", "(Australia)", "(Ireland)", "(Colombia)", "(Scotland)", "(Cyprus)", "(South Korea)",
                "(Norway)", "(Croatia)", "(Italy)", "(Singapore)", "(Chile)", "(Malta)", "(Greece)", "(Syria)", 
                "(Sweden)", "(Kyrgyzstan)", "(Myanmar)", "(Japan)", "(Mexico)", "(Argentina)", "(Egypt)", "(Poland)",
                "(Turkey)", "(Iraq)", "(India)", "(Jamaica)", "(Malaysia)", "(Venezuela)", "(Israel)", "(Kosovo)", 
                "(Belize)", "(Belgium)", "(Jordan)", "(Costa Rica)", "(Netherlands)", "(The Netherlands)", "(New Zealand)",
                "(Corsica)", "(in former Yugoslavia)", "(Bahamas)", "(location unspecified)", "(Serbia)"] 

df_cleaned_city = clean_city(df_cleaned_duration, "city", city_filters)

df_cleaned_city.dropna()

output_path = "cleaned_data.csv"
df_cleaned_city.to_csv(output_path, index=False)

  df = df[~df[column_name].str.contains(pattern, case=False, regex=True, na=False)]
