This notebook preprocesses the data and gets it ready to be clustered. This is done by exploring the data, getting rid of outliers, 
and converting the data to binary so that only the presence of a feature is checked. I chose to use a variance threshold for feature reduction, removing only 
the features whose values are all identical. If a feature is exactly the same (present or not present) for every sample of data, it 
contributes nothing but noise. I chose not to use Principal Component Analysis (PCA) because it assumes the data is continuous, not binary, so its 
assumptions would not hold for this data. Multiple Correspondence Analysis (MCA) would also work for dimensionality reduction, but would likely cause a loss 
of information.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [2]:
# The raw export is a CSV file even though its directory and file name say "parquet".
# NOTE(review): the indented echo of this statement in the cell output looks like the
# tail of a warning raised while loading (possibly a DtypeWarning for mixed-type
# columns — TODO confirm and, if so, pass explicit dtypes).
raw_csv_path = "../data/parquets/raw_parquet.csv"
df = pd.read_csv(raw_csv_path)

  df = pd.read_csv("../data/parquets/raw_parquet.csv")


In [3]:
# Use this to see the value counts of each column. The output of the built-in value_counts function in the pandas library is hard to read across many columns, so val_count makes it a bit easier.

def val_count(data):
    """Print the value counts of every column in ``data``, then the frame's shape.

    For each column the output is: the column's positional index, the result
    of ``value_counts()`` for that column, and a separator line. The shape of
    ``data`` is printed once at the very end.

    Args:
        data (pandas.DataFrame): the dataframe whose columns are summarised.
    """
    for position, column_name in enumerate(data.columns):
        print(position)
        print(data[column_name].value_counts())
        print("=========================="*5)
    print(data.shape)

In [4]:
# Inspect the raw frame: prints the value counts of every column (one block per
# column, each followed by a separator line) and finally the frame's shape.
val_count(df)

0
properties_images
[{"lng": -105.28238207112, "width": 1536, "id": 14974604412492, "lat": 38.406684324181, "height": 1536, "image_type": "photo", "annotated": false, "self": "https://strabospot.org/db/image/14974604412492"}]    12
[{"lng": -105.3038202134, "width": 1536, "id": 14968640404070, "lat": 38.56319813527, "height": 1536, "image_type": "photo", "annotated": false, "self": "https://strabospot.org/db/image/14968640404070"}]      12
[{"lng": -105.30221432457, "width": 1536, "id": 14968661387069, "lat": 38.563431780821, "height": 1536, "image_type": "photo", "annotated": false, "self": "https://strabospot.org/db/image/14968661387069"}]    11
[{"lng": -105.28587841439, "width": 1536, "id": 14975497178020, "lat": 38.407283755986, "height": 1536, "image_type": "photo", "annotated": false, "self": "https://strabospot.org/db/image/14975497178020"}]    11
[{"lng": -105.30259427624, "width": 1536, "id": 14967747708707, "lat": 38.564123204014, "height": 1536, "image_type": "photo", "anno

In [None]:
# Get rid of fully duplicated rows. The result is rebound to `df` instead of
# mutating the frame with inplace=True, which pandas discourages and which
# gives no performance benefit.
df = df.drop_duplicates()

# Remaining (rows, columns) after de-duplication.
df.shape

(1360962, 480)

In [8]:
def binary_simplification(df, verbose=True):
    """converts a pandas dataframe into binary based off of presence in data

    Every cell becomes 1 when it holds a value and 0 when it is missing
    (NaN/None) or an empty string. The input dataframe is not modified.

    Args:
        df (pandas.DataFrame): a pandas dataframe of the data that needs to be converted to binary
        verbose (bool): when True (the default) one "Converting <column> to binary"
            progress line is printed per column; pass False to silence them.

    Returns:
        pandas.DataFrame: a new dataframe with the same index and columns as ``df``
            whose values are the integers 0 and 1
    """
    binary_col_data = {}

    for col in df.columns:
        if verbose:
            print(f"Converting {col} to binary")
        # Treat empty strings like missing values, then flag whatever is left as present.
        binary_col_data[col] = df[col].replace('', np.nan).notna().astype(int)

    if not binary_col_data:
        # Nothing to convert (frame without columns): hand back an independent copy.
        return df.copy()

    # Every column is replaced by its binary counterpart, so the result is built
    # directly from the new columns; copying the raw frame first is unnecessary.
    return pd.DataFrame(binary_col_data, index=df.index)

In [9]:
# Replace the raw frame with its 0/1 presence encoding.
# NOTE: this cell is not idempotent. In an already-encoded frame every value
# (0 included) is non-null, so running the cell a second time would turn every
# cell into 1. Re-run the notebook from the data-loading cell instead.
df = binary_simplification(df)
# val_count(df)

Converting properties_images to binary
Converting properties_date to binary
Converting properties_viewed_timestamp to binary
Converting properties_notes to binary
Converting properties_orientation_data to binary
Converting properties_modified_timestamp to binary
Converting properties_symbology_circleColor to binary
Converting properties_name to binary
Converting properties_notesTimestamp to binary
Converting properties_time to binary
Converting properties_id to binary
Converting properties_self to binary
Converting geometry_type to binary
Converting geometry_coordinates to binary
Converting type to binary
Converting properties_samples to binary
Converting properties_altitude to binary
Converting properties_altitude_accuracy to binary
Converting properties_gps_accuracy to binary
Converting properties_lng to binary
Converting properties_image_basemap to binary
Converting properties_lat to binary
Converting properties__3d_structures to binary
Converting properties_symbology_lineColor to b

In [10]:
# Feature reduction with a variance threshold: drop the features whose value is
# identical in every sample, since a constant column cannot help separate clusters.
# threshold=0 removes only those zero-variance features. For a 0/1 feature that is 1
# in a fraction p of the samples the variance is p * (1 - p), which is the formula
# to use if a stricter threshold is ever wanted.
selector = VarianceThreshold(threshold=0)

# Only the fitted support mask is used below, so fit() is enough; fit_transform()
# would additionally build a transformed copy of the whole matrix that is never used.
selector.fit(df)

# Positions of the columns that survived the threshold.
cols_idxs = selector.get_support(indices=True)
df = df.iloc[:, cols_idxs]

print(df.shape)
# val_count(df)


(1360962, 456)


In [None]:
# Uncomment to write the processed frame to disk as CSV:
# df.to_csv("../data/parquets/processed_parquets.csv", index = False)
