# Data Cleaning

In [None]:
import pandas as pd 
import numpy as np
from datetime import datetime

# Load the raw dataset listing from a CSV file in the working directory.
pre_cleaned_df = pd.read_csv('UC_Irvine_ML_datasets.csv')

# Inspect dataframe information (column names, dtypes, non-null counts) before any cleaning.
pre_cleaned_df.info()

In [None]:
# Keep only the columns used by the cleaning and analysis steps that follow;
# every other column of the raw listing is discarded here.
pre_cleaned_df = pre_cleaned_df.loc[:, [
    'header',
    'DataSetCharacteristics',
    'NumberofInstances',
    'Area',
    'AttributeCharacteristics',
    'NumberofAttributes',
    'DateDonated',
    'AssociatedTasks',
    'MissingValues',
    'NumberofWebHits',
]]

# Re-check the schema after narrowing the frame.
pre_cleaned_df.info()

## Phase 1 Data Cleaning

In [None]:
import data_cleaning
from data_cleaning import get_Univ_Loc_match

# Start by filling NA values in the column-reduced frame.
cleandata = data_cleaning.fillna(pre_cleaned_df)

# Remaining transformations, applied strictly in this order; each one is
# called with the current frame and its return value becomes the next input.
_cleaning_steps = (
    # de-normalize categorical columns into indicator dummy variables
    data_cleaning.create_characteristics_columns,
    data_cleaning.create_attribute_columns,
    data_cleaning.create_tasks_columns,
    # convert 'DateDonated' to real date value
    data_cleaning.convert_to_datetime,
)
for _cleaning_step in _cleaning_steps:
    cleandata = _cleaning_step(cleandata)

# Drop all records with NA values; `cleandata` keeps the pre-drop result.
cleanest_data = data_cleaning.final_na_drop(cleandata)

cleanest_data.info()

## Phase 2 Data Cleaning

In [None]:
# Re-import the Phase 1 output from disk.
# NOTE(review): no visible cell writes 'cleanest_data.csv', and the Phase 1
# column selection does not include 'Source', which is read below. Presumably
# the file is produced inside `data_cleaning` or by a separate export step --
# TODO confirm, otherwise this cell fails on a fresh kernel / clean checkout.
src_df = pd.read_csv('cleanest_data.csv', encoding="latin-1")

# Add lookup-text-search institution / location values derived from 'Source'.
src_df['source_institution_places'] = src_df['Source'].apply(get_Univ_Loc_match)

# Add the year taken from the 'DateDonated' column.
# Each value is parsed on its own so one row's date layout never constrains
# another's. The deprecated `infer_datetime_format` flag is intentionally not
# passed: pandas 2.x infers the format by default and warns when it is given.
src_df['YearAdded'] = src_df['DateDonated'].apply(lambda x: pd.to_datetime(x).year)

# Age of the dataset in years, measured against the fixed reference year 2020.
src_df['DatasetAge'] = 2020 - src_df['YearAdded']


def calc_num_cells(x):
    """Return NumberofInstances * NumberofAttributes (rows * columns = cells).

    `x` may be a single row or a whole DataFrame: the multiplication is
    element-wise, so a frame yields one product per row.
    """
    return x['NumberofInstances'] * x['NumberofAttributes']


# Column-wise multiplication over the whole frame instead of a per-row apply.
src_df['DatapointCount'] = calc_num_cells(src_df)

src_df.info()

# Geographic Analysis

In [None]:

import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo

In [None]:

def _count_places_by_country(places):
    """Tally institution entries per (Country, CODE) pair.

    Each non-null value in `places` is split on "|" into entries, and each
    entry on ";" into the fields University, City, Country and CODE.

    Returns:
        DataFrame with columns 'Country', 'CODE' and 'Dataset Count', one row
        per distinct (Country, CODE) pair.
    """
    records = [
        entry.split(";")
        for cell in places.dropna()
        for entry in cell.split("|")
    ]
    parsed = pd.DataFrame(records, columns=['University', 'City', 'Country', 'CODE'])
    # Naming the size column directly avoids positional renaming and the stray
    # 'index' column that `as_index=False` + `reset_index()` leaves behind on
    # newer pandas versions.
    return (
        parsed.groupby(['Country', 'CODE'])
        .size()
        .reset_index(name='Dataset Count')
    )


def worldmap(df, codes=None):
    """Build a world choropleth of where the datasets' source institutions are.

    Args:
        df: DataFrame with a 'source_institution_places' column whose values
            are "University;City;Country;CODE" entries joined by "|".
        codes: DataFrame with at least 'CODE' and 'Country' columns, one row
            per country to draw. When omitted, the module-level
            `country_codes` frame is used.

    Returns:
        A plotly `go.Figure` coloured by 'Dataset Count Pct': the number of
        institution entries for a country as a percentage of the datasets that
        passed the length filter. A dataset listing several institutions
        contributes one entry per institution. The colour scale is clamped to
        the 0-10 range.
    """
    pyo.init_notebook_mode()
    if codes is None:
        codes = country_codes

    # Keep rows whose place string is longer than 6 characters; null values
    # compare False and are dropped too.
    # NOTE(review): presumably 6 screens out empty/placeholder matches -- confirm.
    located = df[df['source_institution_places'].str.len() > 6]
    datasetcount = len(located.index)

    counts = _count_places_by_country(located['source_institution_places'])

    # Right merge keeps every country from `codes`, including those without
    # any dataset, so the whole map is drawn.
    merged = counts.merge(codes, how='right', on=['CODE'])
    merged['Country'] = merged['Country_y']
    merged = merged.drop(['Country_x', 'Country_y'], axis=1)
    merged = merged[merged.Country != 'Antarctica']
    merged = merged.fillna(0)
    # The merge introduces NaN for unmatched countries, which turns the counts
    # into floats; cast back so the hover shows "3" rather than "3.0".
    merged['Dataset Count'] = merged['Dataset Count'].astype(int)
    # plotly renders "<br>" as a line break in hover labels; "\n" does not.
    merged['hover_text'] = (
        'Country: ' + merged['Country']
        + '<br>Number of Datasets: ' + merged['Dataset Count'].astype(str)
    )

    if datasetcount:
        merged['Dataset Count Pct'] = merged['Dataset Count'] * 100.0 / datasetcount
    else:
        # No dataset passed the filter: report 0% instead of dividing by zero.
        merged['Dataset Count Pct'] = 0.0

    fig = go.Figure(
        data=go.Choropleth(
            locations=merged['CODE'],
            z=merged['Dataset Count Pct'],
            hovertext=merged['hover_text'],
            colorscale='Blues',
            autocolorscale=False,
            reversescale=False,
            marker_line_color='darkgray',
            marker_line_width=0.5,
            colorbar_title='Sourced % Datasets',
            zmin=0,
            zmax=10
        )
    )
    fig.update_layout(
        margin={"r":0,"t":0,"l":0,"b":0},
        title_text='UC Irvine ML Dataset Analysis',
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='mercator'
        ),
        annotations = [dict(
            x=0.55,
            y=0.1,
            xref='paper',
            yref='paper',
            text='Source: <a href="http://archive.ics.uci.edu/ml/datasets.php">\
                UCI Machine Learning Repository</a>',
            showarrow = False
        )]
    )
    return fig

# Country reference table; worldmap() reads this module-level name when it
# merges the per-country counts on 'CODE'.
country_codes = pd.read_csv("all_country_codes.csv")
# NOTE(review): no visible cell writes 'cleanest_data_augmented.csv'; it
# presumably holds the Phase 2 frame (with 'source_institution_places') saved
# by a step outside this view -- TODO confirm. This rebinding also replaces the
# in-memory `src_df` built in Phase 2.
src_df = pd.read_csv('cleanest_data_augmented.csv', encoding="latin-1")

# Build the choropleth figure and display it.
worldmap(src_df).show()