
## Part 2 Data Discovery

You will be given a set of datasets and you are tasked to perform data discovery on the data sets.

<b>The datasets are provided in the group lockers on brightspace. Let me know if you are having trouble accessing the datasets</b>

The process is to have the goal of finding datasets that are related to each other, finding relationships between the datasets.

The relationships that we are primarily working with are Join and Union relationships.

So please implement two methods for allowing us to find those pesky Join and Union relationships.

Try to do this with the datasets as is and no processing.



In [1]:
import csv
import pandas as pd
from pathlib import Path
import os
import statistics

from random import shuffle
from time import time
import pickle

import itertools
from collections import Counter
from difflib import SequenceMatcher
import unicodedata

In [32]:

INFO = True







### CSV Reading



In [3]:

def try_sniffer(filepath, sample_size=8192, delimiters=",;\t|:"):
    """Try csv.Sniffer on a bigger sample; return delimiter or None."""
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        sample = f.read(sample_size)
    try:
        dialect = csv.Sniffer().sniff(sample, delimiters)
        return dialect.delimiter
    except csv.Error:
        return None


In [4]:

def detect_delimiter_counts(filepath, candidates=["_",",",";","\t","|",":"], n_lines=20):

    lines = []
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for _ in range(n_lines):
            line = f.readline()
            lines.append(line)
    if not lines:
        return None

    best = None
    best_score = 0.0
    for d in candidates:
        counts = [line.count(d) for line in lines]
        nonzero = sum(1 for c in counts if c > 0)
        if nonzero == 0:
            continue
        # score = median count * fraction of lines that contain the delimiter
        med = statistics.median([c for c in counts if c > 0])
        score = med * (nonzero / len(lines))
        if score > best_score:
            best_score = score
            best = d
    return best


In [1]:

def read_csv_flexible(filepath, verbose=True):
    p = Path(filepath)
    if verbose:
        print(f"Reading {p} ...")


    # 1) Heuristic by counting delimiter occurrences
    delim = detect_delimiter_counts(p)
    if delim:
        try:
            df = pd.read_csv(p, sep=delim, engine="python", on_bad_lines="skip")
            if verbose:
                print(f" -> Heuristic chose delimiter: {repr(delim)}")
            print(len(df.columns))
            return df
        except Exception as e:
            if verbose:
                print(f"    reading with heuristic delimiter failed: {e}")

    # 2) Last resort: read entire file as single column (one line per row)
    if verbose:
        print(" -> Falling back to reading file as single column (no delimiter).")
    df = pd.read_csv(p, sep="\n", header=None, names=["raw_line"], engine="python")
    return df



### Set Similarity Measures


In [None]:

def levenshtein(x, y):
    if len(x) == 0 or len(y) == 0:
        return len(x) + len(y)
    
    if x[0] == y[0]:
        return levenshtein( x[1:], y[1:] )
    return 1+levenshtein( x[1:], y[1:] )
    

In [None]:

# We return zero when both sets are empty because even though it's normally undefined
# we want to avoid over-filling our pairs of similar sets with useless pairs
# as such if we set their similarity to zero they will never be treated as similar
def jaccard(x,y):
    if len(x) == 0 and len(y)==0:
        return 0
    return len( x & y ) / len( x | y )
    


### Union Finding


In [None]:

def bipartite_match( node, seen, match, adjacency ):
    for node2 in adjacency[node]:
        if node2 in seen:
            continue
        seen.add( node2 )
        if node2 not in match or bipartite_match( match[node2], seen, match, adjacency ):
            match[node2] = node
            return True
    return False


In [None]:

def exists_mapping( pairs, threshold=0.8 ):
    adjacency = {}
    cols1 = set()
    cols2 = set()
    for p in pairs:
        c1, c2 = p['col1'], p['col2']
        if not c1 in adjacency:
            adjacency[c1] = []
        adjacency[c1].append(c2)
        cols1.add(c1)
        cols2.add(c2)

    matches = 0
    for node in adjacency:
        if bipartite_match( node, set(), {}, adjacency ):
            matches += 1
    
    if INFO:
        print( f"Matched: {matches}\tNeeded {len(cols1)}" )
    return matches >= threshold*len( cols1 )



### Lazo


In [8]:

def k_shingle( doc, k=2 ):
    if len( doc ) < k:
        return set( [ doc ] )
    shingles = set()
    for i in range( len(doc) + 1 - k ):
        shingles.add( doc[i:i+k] )
    return shingles


In [9]:

def dframe_shingles( dframe, k=2 ):
    shingles = {}
    for column in dframe.columns:
        shingles[column] = set([])
        for datum in dframe[column]:
            shingles[column].update( k_shingle( str(datum), k=k ) )
    return shingles
        

In [10]:

def shingle_dframes( dframes, shingle_size=2 ):
    vocab_file = f"./cache/vocab.{shingle_size}.pickle"
    vocab_loaded = False
    if os.path.exists( vocab_file ):
        with open( vocab_file, 'rb' ) as file:
            vocab = pickle.load( file )
        vocab_loaded = True
    else:
        vocab = set([])

    shingles = {}
    for title, dframe in dframes.items():
        cache_file = f"./cache/{title}.{shingle_size}.pickle"
        if os.path.exists( cache_file ):
            with open( cache_file, 'rb' ) as file:
                shingles[title] = pickle.load( file )
        else:
            shingles[title] = dframe_shingles( dframe, k=shingle_size )
            with open( cache_file, 'wb' ) as file:
                pickle.dump( shingles[title], file, pickle.HIGHEST_PROTOCOL )
        
        if not vocab_loaded:
            for _, col_shingles in shingles[title].items():
                vocab.update( col_shingles )
                
    if not vocab_loaded:
        with open( vocab_file, 'wb' ) as file:
            pickle.dump( vocab, file, pickle.HIGHEST_PROTOCOL )

    return shingles, vocab
        

In [11]:

def onehot_shingles( shingles, vocab ):

    onehot = { title: { col: [] for col in shingles[title] } for title in shingles }
    
    for shingle in vocab:
        for title in shingles:
            for col in shingles[title]:
                if shingle in shingles[title][col]:
                    onehot[title][col].append( 1 )
                else:
                    onehot[title][col].append( 0 )

    return onehot
    

In [12]:

def one_perm_hashing( onehot, vocab_length, num_hashes=100, n_bands=10 ):
    assert num_hashes % n_bands == 0
    assert num_hashes > n_bands
    rows_per_band = num_hashes // n_bands
    bin_size = vocab_length // num_hashes
    
    idx = list( range( vocab_length ) )
    shuffle( idx )
    
    banded_sig = [ [ -1 for _ in range( rows_per_band ) ] for _ in range( n_bands ) ]
    for b in range( num_hashes ):
        start = b * bin_size
        end = (b + 1) * bin_size
        if b == num_hashes - 1:
            end = vocab_length
        bin_indices = idx[start:end]

        band_id = b // rows_per_band
        for i in bin_indices:
            if onehot[i] == 1:
                j = b % rows_per_band
                banded_sig[band_id][j] = i
                break

    return banded_sig
    

In [13]:

def hash_dframes( onehot, vocab_length, signature_length=100, num_bands=10 ):
    buckets = [ {} for _ in range(num_bands) ]
    for title in onehot:
        for column in onehot[title]:
            bands = one_perm_hashing( onehot[title][column], vocab_length, num_hashes=signature_length, n_bands=num_bands )
        
            for band_idx, band in enumerate( bands ):
                band_hash = hash( tuple(band) )
                if band_hash not in buckets[band_idx]:
                    buckets[band_idx][band_hash] = [ { 'title': title, 'column': column } ]
                else:
                    buckets[band_idx][band_hash].append( { 'title': title, 'column': column } )

    return buckets


In [16]:

def Lazo( documents, shingle_size=2, signature_length=100, num_bands=10, similarity_threshold = 0.7, union_threshold=0.8 ):
    """Function should be able to perform data discovery to find related datasets
    Possible Input: List of datasets
    Output: List of pairs of related datasets
    """

    t_start_shingling = time()
    
    shingles, vocab = shingle_dframes( documents, shingle_size=shingle_size  )
    vocab_length = len(vocab)

    t_end_shingles = time()
    if INFO:
        print( f"Finished shingling ({1000*(t_end_shingles-t_start_shingling):.4f}ms)" )

    onehot = onehot_shingles( shingles, vocab )
    
    t_end_onehot = time()
    if INFO:
        print( f"Finished onehot ({1000*(t_end_onehot-t_end_shingles):.4f}ms)" )

    buckets = hash_dframes( onehot, vocab_length )

    t_end_hashing = time()
    if INFO:
        print( f"Finished hashing ({1000*(t_end_hashing-t_end_onehot):.4f}ms)" )

    
    similarities = {}
    for band_idx, band_buckets in enumerate(buckets):
        #print( f"Band {band_idx+1} buckets:" )
        for key in band_buckets:
            #print( f"Bucket {key}:" )
            bucket = band_buckets[key]
            if len( bucket ) < 2:
                continue
            
            for i in range( len(bucket) ):
                
                doc1 = bucket[i]
                title1 = doc1['title']
                col1 = doc1['column']
                #print( f"{title1}, ", end="" )
                for j in range( i ):
                    doc2 = bucket[j]
                    title2 = doc2['title']
                    col2 = doc2['column']
                    similarity = jaccard( shingles[title1][col1], shingles[title2][col2] )
                    #print( f"{title1}\t{title2}:\tSimilarity: {similarity}")
                    if similarity < similarity_threshold:
                        continue
                    
                    #print( f"{col1}\t{col2}:\tSimilarity: {similarity}")
                    if title1 not in similarities:
                        similarities[title1] = {}
                    if title2 not in similarities[title1]:
                        similarities[title1][title2] = []
                    similarities[title1][title2].append( { 'col1': col1, 'col2': col2, 'similarity': similarity } )
                #print("")
        #print("")
    
    #print( similarities )
    join_candidates = []
    union_candidates = []
    for table1 in similarities:
        for table2 in similarities[table1]:
            if table1 == table2:
                continue
            #print( table1, table2 )
            
            if documents[table1].shape[1] != documents[table2].shape[1]:
                join_candidates.append( [table1, table2] )
                continue
            
            if exists_mapping( similarities[table1][table2], union_threshold=union_threshold ):
                union_candidates.append( [table1, table2] )
            else:
                join_candidates.append( [table1, table2] )

    t_end_bucketing = time()
    if INFO:
        print( f"Finished bucketing ({1000*(t_end_bucketing-t_end_hashing):.4f}ms)" )
    
    return { 'join': join_candidates, 'union': union_candidates }
    


### Josie


In [None]:

def normalize_string(s: str) -> str:
    return unicodedata.normalize("NFKC", str(s)).lower().strip()


In [None]:

def table_signature(df: pd.DataFrame):
    sig = []
    for col in df.columns:
        if col.startswith("__"):  # skip metadata
            continue
        if pd.api.types.is_integer_dtype(df[col]):
            sig.append("int")
        elif pd.api.types.is_float_dtype(df[col]):
            sig.append("float")
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            sig.append("datetime")
        else:
            sig.append("string")
    return sig


In [None]:

def column_similarity(col1, col2, sample_size=10):
    vals1 = col1.dropna().astype(str).map(normalize_string)
    vals2 = col2.dropna().astype(str).map(normalize_string)

    if sample_size and len(vals1) > sample_size:
        vals1 = vals1.sample(sample_size, random_state=42)
    if sample_size and len(vals2) > sample_size:
        vals2 = vals2.sample(sample_size, random_state=42)

    set1, set2 = set(vals1), set(vals2)
    jaccard_sim = jaccard( set1, set2 )

    name_sim = SequenceMatcher(None, normalize_string(col1.name), normalize_string(col2.name)).ratio()
    return 0.5 * jaccard_sim + 0.5 * name_sim


In [None]:

# Union check
def unionable_by_type(sig1, sig2, threshold=0.8):
    if not sig1 or not sig2:
        return False
    counter1, counter2 = Counter(sig1), Counter(sig2)
    common_count = sum(min(counter1[t], counter2[t]) for t in counter1)
    overlap_ratio = common_count / max(len(sig1), len(sig2))
    return overlap_ratio >= threshold


In [None]:

# Relationship finder
def find_relationships(datasets, join_threshold=0.6, union_threshold=0.8):
    relationships = {"joins": [], "unions": []}

    for (i, (name1, df1)), (j, (name2, df2)) in itertools.combinations(enumerate(datasets.items()), 2):
        sig1 = table_signature(df1)
        sig2 = table_signature(df2)

        if unionable_by_type(sig1, sig2, threshold=union_threshold):
            print(f"[UNION] {name1} <--> {name2} (overlap ≥ {union_threshold})")
            relationships["unions"].append({
                "table1": name1,
                "table2": name2,
                "signature1": sig1,
                "signature2": sig2
            })

        joinable_cols = []
        found = False
        # takes a lot of time, so I only picked a few columns for speed, accuracy should
        # theoretically increase for a higher threshold and more columns
        for col1 in df1.columns[:10]:
            if found:
                break
            for col2 in df2.columns[:10]:
                if found:
                    break
                dtype1 = "string"
                dtype2 = "string"
                if pd.api.types.is_integer_dtype(df1[col1]): dtype1 = "int"
                elif pd.api.types.is_float_dtype(df1[col1]): dtype1 = "float"
                elif pd.api.types.is_datetime64_any_dtype(df1[col1]): dtype1 = "datetime"
                if pd.api.types.is_integer_dtype(df2[col2]): dtype2 = "int"
                elif pd.api.types.is_float_dtype(df2[col2]): dtype2 = "float"
                elif pd.api.types.is_datetime64_any_dtype(df2[col2]): dtype2 = "datetime"

                if dtype1 != dtype2:
                    continue

                sim = column_similarity(df1[col1], df2[col2])
                if sim >= join_threshold:
                    print(f"[JOIN] {name1}.{col1} <--> {name2}.{col2} (dtype={dtype1}, sim={sim:.2f})")
                    joinable_cols.append({
                        "col1": col1,
                        "col2": col2,
                        "similarity": sim,
                        "dtype": dtype1
                    })
                    found = True
                    break

        if joinable_cols:
            relationships["joins"].append({
                "table1": name1,
                "table2": name2,
                "joinable_columns": joinable_cols
            })

    return relationships





### Testing on Unclean Data


In [19]:

file_names = [ f[:-4] for f in os.listdir('./lake49') if os.path.isfile( os.path.join('./lake49', f ) ) ]

swamps = {}
for swamp_name in file_names:
    file_path = f"./lake49/{swamp_name}.csv" 
    data, bad_rows = read_csv( file_path )
    #with open( file_path, mode='r', encoding='utf-8' ) as file:
    #    data = pd.read_csv( file, delimiter=delimiter, header=None, low_memory=False, dtype=str )
    swamps[swamp_name] = pd.DataFrame( data )
    if INFO:
        print( f" {len(data)}, { len( bad_rows ) }" )

candidates = Lazo( swamps, shingle_size=5, signature_length=100, num_bands=25, similarity_threshold=0.5, union_threshold=0.8 )

print( "\n\n====== Lazo ======" )
print( f"Join Candidates: {len(candidates['join'])}" )
for candidate in candidates['join']:
    print( f"{candidate[0]}, {candidate[1]}, JOIN" )

print( f"\nUnion Candidates: {len(candidates['union'])}" )
for candidate in candidates['union']:
    print( f"{candidate[0]}, {candidate[1]}, UNION" )


candidates = find_relationships( swamps, join_threshold=0.6, union_threshold=0.8)

print( "\n====== Josie ======" )
print( f"Join Candidates: {len(candidates['join'])}" )
for candidate in candidates['join']:
    print( f"{candidate[0]}, {candidate[1]}, JOIN" )

print( f"\nUnion Candidates: {len(candidates['union'])}" )
for candidate in candidates['union']:
    print( f"{candidate[0]}, {candidate[1]}, UNION" )


Error: Could not determine delimiter

### Sanity Check

In [20]:

## Sanity check
data = pd.read_csv("./adult.csv", header=0, na_values='?')

if INFO:
    print( data.shape )
    print( data.columns )

# Check for overly empty feature columns
for column in data.columns:
    nans = data[column].isna()
    if nans.sum() >= 0.6*data.shape[0]:
        print( f"Dropping {column}" )
        data.drop( column )

# Drop any rows with NaN
# Try imputation
data.dropna( inplace=True )


# Capital loss and capital gain represent essentially the exact same information
# Capital delta can represent both at the same time without increasing dimensionality
data['capital-delta'] = data['capital-gain'] - data['capital-loss']


# Education is the same as education-num if it were ordinally encoded.
# Capital loss/gain already encoded with capital delta
data.drop( columns=[ 'education', 'capital-gain', 'capital-loss' ], inplace=True )

if INFO:
    print( data.shape )

slicepoint = data.shape[0]//2
train, test = data[:slicepoint], data[slicepoint:]

documents = { 'train': train, 'test': test }

candidates = Lazo( documents, shingle_size=5, signature_length=100, num_bands=25, similarity_threshold=0.5, union_threshold=0.8 )

print( "\n\n====== Lazo ======" )
print( f"Join Candidates: {len(candidates['join'])}" )
for candidate in candidates['join']:
    print( f"{candidate[0]}, {candidate[1]}, JOIN" )

print( f"\nUnion Candidates: {len(candidates['union'])}" )
for candidate in candidates['union']:
    print( f"{candidate[0]}, {candidate[1]}, UNION" )


candidates = find_relationships( documents, join_threshold=0.6, union_threshold=0.8)

print( "\n====== Josie ======" )
print( f"Join Candidates: {len(candidates['join'])}" )
for candidate in candidates['join']:
    print( f"{candidate[0]}, {candidate[1]}, JOIN" )

print( f"\nUnion Candidates: {len(candidates['union'])}" )
for candidate in candidates['union']:
    print( f"{candidate[0]}, {candidate[1]}, UNION" )

Files: test, train, UNION


You would have noticed that the data has some issues in them.
So perhaps those issues have been troublesome to deal with.

Please try to do some cleaning on the data.

After performing cleaning see if the results of the data discovery has changed?

Please try to explain this in your report, and try to match up the error with the observation.

### Data Cleaning

In [19]:

def detect_column_dtype( column, samples=1000 ):
    vals = set( column )
    step = len( vals ) // samples

    if len( vals ) == 2:
        return 'bool'
    
    likely_dtypes = {
        'int': 0,
        'float': 0,
        'datetime': 0,
        'string': 1
    }
    
    count = 0
    for val in vals:
        if count == samples:
            break
        
        count += 1
        
        try:
            tmp = pd.to_datetime( val )
            likely_dtypes['datetime'] += 1
            continue
        except (ValueError, TypeError):
            pass
    
        try:
            tmp = int( val )
            likely_dtypes['int'] += 1
            continue
        except (ValueError, TypeError):
            pass
            
        try:
            tmp = float( val )
            likely_dtypes['float'] += 1
            continue
        except (ValueError, TypeError):
            pass

        likely_dtypes['string'] += 1

    if likely_dtypes['float'] > 0.1*likely_dtypes['int']:
        likely_dtypes['float'] += likely_dtypes['int']
    
    dtype = 'string'
    most_likely = likely_dtypes['string']
    if likely_dtypes['float'] >= most_likely:
        most_likely = likely_dtypes['float']
        dtype = 'float'
    if likely_dtypes['int'] >= most_likely:
        most_likely = likely_dtypes['int']
        dtype = 'int'
    if likely_dtypes['datetime'] >= most_likely:
        dtype = 'datetime'

    if INFO:
        print ( likely_dtypes )
    return dtype
    

In [20]:

def find_header( column, dtype ):
    counter = 0
    for val in column:
        if dtype == 'int':
            try:
                tmp = int( val )
                break
            except (ValueError, TypeError):
                counter += 1
        
        if dtype == 'float':
            try:
                tmp = float( val )
                break
            except (ValueError, TypeError):
                counter += 1
        
        if dtype == 'datetime':
            try:
                tmp = pd.to_datetime( val )
                break
            except (ValueError, TypeError):
                counter += 1

    return counter


In [21]:

def change_column_type( column, dtype ):
    if dtype in ('int', 'float'):
        column = pd.to_numeric( column, errors='coerce' )
        return column
    if dtype in ('datetime'):
        column = pd.to_datetime( column, errors='coerce' )
        return column
    if dtype in ('bool'):
        column = column.map( lambda x:
                             True if str(x).lower() in ('true','1') else
                             False if str(x).lower() in ('false','0') else
                             pd.NA
                           )
        return column
    return column.astype(str)
        

In [42]:

def make_dframe( file_path, na_row_thresh=0.4, na_col_thresh=0.4 ):
    data, bad_rows = read_csv( file_path )
    lake = pd.DataFrame( data )
    counter = 0
    
    header_lengths = set()
    for c_name, column in lake.items():
        dtype = detect_column_dtype( column )
        if dtype not in ( 'string', 'bool' ):
            header_lengths.add( find_header( column, dtype ) )
    if len( header_lengths ) == 0:
        header_len = 0
    else:
        header_len = min( header_lengths )

    if header_len == 0:
        return lake

    header = pd.MultiIndex.from_arrays( lake.iloc[:header_len].values )
    header = [ ' '.join( map(str, col)).strip() for col in header.values ]
    
    lake = lake.iloc[header_len:]
    lake.columns = header
    
    for c_name, column in lake.items():
        dtype = detect_column_dtype( column )
        lake[c_name] = change_column_type( column, dtype )

    #print( lake.head() )

    if INFO:
        print( f"Initial shape: {lake.shape}")
    na_thresh = int( ( 1 - na_row_thresh ) * lake.shape[1] )
    if INFO:
        print( f"Row thresh: {na_thresh}" )
        
    lake.dropna( thresh=na_thresh, inplace=True )
    if INFO:
        print( f"Drop Na rows shape: {lake.shape}")
        
    lake.drop_duplicates( inplace = True )
    if INFO:
        print( f"Drop duplicate rows shape: {lake.shape}")
    
    lake_T = lake.T
    na_thresh = int( ( 1 - na_col_thresh ) * lake_T.shape[1] )
    if INFO:
        print( f"Column thresh: {na_thresh}" )
        
    lake_T.dropna( thresh=na_thresh, inplace=True )
    if INFO:
        print( f"Drop Na cols shape: {lake_T.shape}")
        
    lake_T.drop_duplicates( inplace = True )
    if INFO:
        print( f"Drop duplicate cols shape: {lake_T.shape}")
    
    lake = lake_T.T

    return lake


In [23]:
## Cleaning data, scrubbing, washing, mopping

def cleaningData( datasets ):
    """Function should be able to clean the data
    Possible Input: List of datasets
    Output: List of cleaned datasets
    """
    
    lakes = {}
    for lake_name in file_names:
        file_path = f"./lake49/{lake_name}.csv"
        lake_name = f"{lake_name}_clean"
        cache_path = f"./clean/{lake_name}.csv"
        print( f"{lake_name}" )
        if os.path.exists( cache_path ):
            with open( cache_path, 'r' ) as file:
                lakes[lake_name] = pd.read_csv( cache_path, header=0 )
        else:
            lakes[lake_name] = make_dframe( file_path )
            #delimiter, _ = find_delimiter( file_path )
            #lakes[lake_name] = pd.read_csv( file_path, delimiter=delimiter )
            with open( cache_path, 'w' ) as file:
                lakes[lake_name].to_csv( cache_path, header=True, index=False )
        print( lakes[lake_name].shape )
        
    return lakes


### Testing on Clean Data

In [43]:
file_names = [ f[:-4] for f in os.listdir('./lake49') if os.path.isfile( os.path.join('./lake49', f ) ) ]
    
lakes = cleaningData( file_names )

candidates = Lazo( swamps, shingle_size=5, signature_length=100, num_bands=25, similarity_threshold=0.5, union_threshold=0.8 )

print( "\n\n====== Lazo ======" )
print( f"Join Candidates: {len(candidates['join'])}" )
for candidate in candidates['join']:
    print( f"{candidate[0]}, {candidate[1]}, JOIN" )

print( f"\nUnion Candidates: {len(candidates['union'])}" )
for candidate in candidates['union']:
    print( f"{candidate[0]}, {candidate[1]}, UNION" )


candidates = find_relationships( swamps, join_threshold=0.6, union_threshold=0.8)

print( "\n====== Josie ======" )
print( f"Join Candidates: {len(candidates['join'])}" )
for candidate in candidates['join']:
    print( f"{candidate[0]}, {candidate[1]}, JOIN" )

print( f"\nUnion Candidates: {len(candidates['union'])}" )
for candidate in candidates['union']:
    print( f"{candidate[0]}, {candidate[1]}, UNION" )

table_17_clean
Candidates:
{',': [], '"': [], '.': []}
Delimiters: {',': {20}, '"': {3}, '.': {5}}
Delimiter: ','	Frequency: 20
{'int': 98, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 4, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 10, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 2, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 509, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 2, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 0, 'float': 0, 'datetime': 0, 'string': 1001}
{'int': 60, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 10, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 0, 'float': 330, 'datetime': 0, 'string': 2}
{'int': 0, 'float': 386, 'datetime': 0, 'string': 2}
{'int': 44, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 2, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 2, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 4, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 5, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 2, 'float': 0, 'datetime': 0, 'str

  column = pd.to_datetime( column, errors='coerce' )


{'int': 0, 'float': 0, 'datetime': 1000, 'string': 1}
{'int': 0, 'float': 2, 'datetime': 1, 'string': 1}
{'int': 0, 'float': 63, 'datetime': 0, 'string': 1}
{'int': 172, 'float': 0, 'datetime': 30, 'string': 2}
{'int': 0, 'float': 1000, 'datetime': 0, 'string': 1}
{'int': 0, 'float': 0, 'datetime': 0, 'string': 2}
Initial shape: (26145, 40)
Row thresh: 24
Drop Na rows shape: (22538, 40)
Drop duplicate rows shape: (22538, 40)
Column thresh: 13522
Drop Na cols shape: (28, 22538)
Drop duplicate cols shape: (26, 22538)
(22538, 26)
table_11_clean
Candidates:
{'_': [], ' ': [], '[': [], ']': [], '.': []}
Delimiters: {'_': {12, 6}, ' ': {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 22, 24}, '[': {1, 3}, ']': {1, 3}, '.': {2, 4, 7}}
Delimiter: '_'	Frequency: 12
{'int': 0, 'float': 0, 'datetime': 22, 'string': 2}
{'int': 0, 'float': 0, 'datetime': 0, 'string': 16}
{'int': 0, 'float': 0, 'datetime': 0, 'string': 16}
{'int': 0, 'float': 0, 'datetime': 0, 'string': 5}
{'int': 0, 'float

  column = pd.to_datetime( column, errors='coerce' )


Drop duplicate cols shape: (23, 18157)
(18157, 23)
table_6_clean
Candidates:
{',': [], '.': [], ' ': [], '/': []}
Delimiters: {',': {37}, '.': {1, 2, 3}, '/': {1}}
Delimiter: ','	Frequency: 37
{'int': 6, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 6, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 5, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 0, 'float': 49, 'datetime': 1, 'string': 2}
{'int': 2, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 3, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 3, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 6, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 2, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 3, 'float': 0, 'datetime': 0, 'string': 3}
{'int': 2, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 4, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 0, 'float': 0, 'datetime': 0, 'string': 9}
{'int': 5, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 3, 'float': 0, 'datetime': 0, 'string': 2}
{'int': 0, 'float': 0, 'datetime': 0, 'st

In [26]:
find_delimiter( "./lake49/table_14.csv" )
with open( "./lake49/table_14.csv", 'r' ) as csvfile:
    delimiter = csv.Sniffer().sniff(csvfile.read(1024))
print(delimiter)

Candidates:
{',': [], ' ': [], '"': [], '.': []}
Delimiters: {',': {18}, ' ': {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}, '.': {9}}
<class 'csv.Sniffer.sniff.<locals>.dialect'>


### Pandas Sanity Check

In [32]:

file_names = [ f[:-4] for f in os.listdir('./lake49') if os.path.isfile( os.path.join('./lake49', f ) ) ]

lakes = {}
for lake_name in file_names:
    file_path = f"./lake49/{lake_name}.csv"
    delimiter, _ = find_delimiter( file_path )
    lake_name = lake_name + '_pdclean'
    lakes[lake_name] = pd.read_csv( file_path, delimiter=delimiter )

join_candidates, union_candidates = Lazo( lakes, shingle_size=5, signature_length=100, num_bands=25, similarity_threshold=0.5 )

print( f"Join Candidates: {len(join_candidates)}" )
for candidate in join_candidates:
    print( f"{candidate[0]}, {candidate[1]}, JOIN" )

print( f"\nUnion Candidates: {len(union_candidates)}" )
for candidate in union_candidates:
    print( f"{candidate[0]}, {candidate[1]}, UNION" )

  lakes[lake_name] = pd.read_csv( file_path, delimiter=delimiter )


Join Candidates: 95
table_0_pdclean, table_17_pdclean, JOIN
table_8_pdclean, table_17_pdclean, JOIN
table_8_pdclean, table_0_pdclean, JOIN
table_7_pdclean, table_17_pdclean, JOIN
table_7_pdclean, table_0_pdclean, JOIN
table_7_pdclean, table_8_pdclean, JOIN
table_10_pdclean, table_17_pdclean, JOIN
table_10_pdclean, table_0_pdclean, JOIN
table_10_pdclean, table_8_pdclean, JOIN
table_9_pdclean, table_17_pdclean, JOIN
table_9_pdclean, table_0_pdclean, JOIN
table_9_pdclean, table_8_pdclean, JOIN
table_9_pdclean, table_10_pdclean, JOIN
table_9_pdclean, table_7_pdclean, JOIN
table_1_pdclean, table_17_pdclean, JOIN
table_1_pdclean, table_8_pdclean, JOIN
table_1_pdclean, table_7_pdclean, JOIN
table_1_pdclean, table_9_pdclean, JOIN
table_1_pdclean, table_10_pdclean, JOIN
table_16_pdclean, table_17_pdclean, JOIN
table_16_pdclean, table_0_pdclean, JOIN
table_16_pdclean, table_8_pdclean, JOIN
table_16_pdclean, table_7_pdclean, JOIN
table_16_pdclean, table_9_pdclean, JOIN
table_16_pdclean, table_1_p

## Discussions

1)  Different aspects of the data can effect the data discovery process. Write a short report on your findings. Such as which data quality issues had the largest effect on data discovery. Which data quality problem was repairable and how you choose to do the repair.

<!-- For the set of considerations that you have outlined for the choice of data discovery methods, choose one and identify under this new constraint, how would you identify and resolve this problem? -->

Max 400 words