In [62]:
"""
This notebook is for preprocessing the data and getting it ready to be clustered. This is done by exploring the data, getting rid of outliers,
and converting the data to binary so that only the presence of a feature is checked. I chose to use a variance threshold for some feature reduction, removing only
the features where all values were identical. If a feature is exactly the same (present or not present) for every sample of data, it
contributes nothing but noise. I chose not to use Principal Component Analysis (PCA) because it assumes the data is continuous, not binary, which could
cause issues when that assumption does not hold. Multiple Correspondence Analysis (MCA) would also work for dimensionality reduction, but would likely cause a loss
of information.
"""

'\nThis notebook is for preprocessing the data and getting it ready to be clustered. This is done by exploring the data, getting rid of outliers, \nand converting data to binary to only check for presence of a feature. I chose to use a variance threshhold for some feature reduction by only \ngetting rid of features where all values were identical. If a feature is just the exact same (present or not present) for every sample of data, it \ncontriubutes nothing but noise. I chose not to use Principal Componenet Analysis (PCA) because it assumes the data is continuouse, not binary, causing \npotential issues due to assumptions. Multiple Correspondence Analysis (MCA) would also work for dimensionality reduction, but would likely cause a loss \nin information.\n'

In [63]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [64]:
# Load the raw dataset from the CSV located one directory above the current working directory.
df = pd.read_csv("../strabospot_data.csv")

  df = pd.read_csv("../strabospot_data.csv")


In [65]:
# Use this to see the value counts of each column. The output of the built-in pandas value_counts function is hard to read, so val_count prints it one column at a time with a separator to make it a bit easier.

def val_count(data):
    """Print the value counts of every column, then the frame's shape.

    For each column this prints, in order: the column's zero-based position,
    the result of ``value_counts()`` for that column, and a separator line.
    After the last column the shape of ``data`` is printed.

    Args:
        data (pandas.DataFrame): frame whose columns are summarised.

    Returns:
        None: everything is written to standard output.
    """
    separator = "==========================" * 5
    for position, column_name in enumerate(data.columns):
        print(position)
        print(data[column_name].value_counts())
        print(separator)
    print(data.shape)

In [66]:
# Print the per-column value counts of the freshly loaded data to spot odd values before any cleaning.
val_count(df)

0
contacts drawing
b              15187
c              13368
d              10222
a-              9350
a               6884
b+              3371
c-              2427
b-              2350
c+              1440
f                859
0.300110724        1
742408             1
Name: count, dtype: int64
1
contacts drawing quality
c                      15109
a-                     11981
b                      11583
d                       9643
a                       6551
c-                      4585
b+                      3142
c+                      1627
b-                      1237
 38.56437533444736]        1
 38.49801201400001]        1
Name: count, dtype: int64
2
classification of spots
b                       17848
c                       16258
a                        7757
b+                       5814
a-                       5514
d                        3693
b-                       2795
c-                       2707
c+                       2511
d+                        561
 [-10

In [67]:
#getting rid of outliers and duplicates

# Columns from position 153 onward are dropped because the features were unnamed and VERY sparse.
# After that, exact duplicate rows are removed, followed by the two rows (index labels 46341 and
# 40941) that were identified as outliers.
df = (
    df.drop(columns=df.columns[153:].tolist())
    .drop_duplicates()
    .drop([46341, 40941])
)

# Value clean-ups, applied one at a time in this order: (column, value to replace, replacement).
value_fixes = [
    ("Complete", "TRUE", True),
    ("Complete", "FALSE", False),
    ("properties_trace_trace_feature", "TRUE", True),
    ("properties_inferences_outcrop_in_place", "5 - definitely in place", 5),
    ("properties_symbology_lineWidth", 2.0, "2"),
    ("properties_symbology_lineWidth", 4.0, "4"),
]
for column_name, old_value, new_value in value_fixes:
    df[column_name] = df[column_name].replace(old_value, new_value)

# Discard columns in which every value is missing.
df = df.dropna(axis=1, how="all")
df.shape

  df["properties_inferences_outcrop_in_place"] = df["properties_inferences_outcrop_in_place"].replace("5 - definitely in place", 5 )


(46852, 118)

In [68]:
# Dropped to prevent data leakage: these features were added at a later point in time,
# as indicated by their values, which are grades of the spots.

df = df.drop(
    columns=[
        "contacts drawing",
        "contacts drawing quality",
        "classification of spots",
        "correctness of spots",
        "completeness of map",
        "unit labels",
        "images",
        "overall impression",
        "Complete",
    ]
)

In [69]:
def binary_simplification(df, verbose=True):
    """Convert a pandas dataframe into a 0/1 presence matrix.

    Every cell becomes 1 when the original value is present and 0 when it is
    missing. A value is treated as missing when it is null (NaN/None) or the
    empty string ''. The input dataframe is not modified.

    Args:
        df (pandas.DataFrame): a pandas dataframe of the data that needs to be
            converted to binary
        verbose (bool): when True (the default), print one progress line per
            column; pass False to keep the output quiet

    Returns:
        pandas.DataFrame: a new dataframe with the same index and the same
            columns (in the same order) as ``df``, holding integer 0/1 flags
    """
    # Nothing to convert: hand back an independent copy, like any other call would.
    if len(df.columns) == 0:
        return df.copy()

    binary_col_data = {}
    for col in df.columns:
        if verbose:
            print(f"Converting {col} to binary")
        # Empty strings are mapped to NaN first so that they count as "not present".
        binary_col_data[col] = df[col].replace('', np.nan).notna().astype(int)

    # Every column is replaced, so the result is built directly from the new
    # columns instead of copying the input, dropping all of it and concatenating.
    return pd.DataFrame(binary_col_data, index=df.index)

In [70]:
# Replace every column with a 0/1 presence flag, then re-inspect the per-column value counts.
df = binary_simplification(df)
val_count(df)

Converting geometry_type to binary
Converting geometry_coordinates to binary
Converting properties_date to binary
Converting properties_trace_trace_feature to binary
Converting properties_trace_trace_type to binary
Converting properties_name to binary
Converting properties_time to binary
Converting properties_id to binary
Converting properties_modified_timestamp to binary
Converting properties_self to binary
Converting type to binary
Converting properties_images to binary
Converting properties_images_notes to binary
Converting properties_trace_contact_type to binary
Converting properties_trace_trace_notes to binary
Converting properties_trace_geologic_structure_type to binary
Converting properties_trace_shear_sense to binary
Converting properties_orientation_data to binary
Converting properties_rock_unit_unit_label_abbreviation to binary
Converting properties_rock_unit_map_unit_name to binary
Converting properties_notes to binary
Converting properties_image_basemap to binary
Converting

In [71]:
# uses variance threshold for feature reduction, removes features where all values are identical

# threshold=0 removes only zero-variance features, i.e. columns that hold the same value in
# 100% of the samples. For a 0/1 column the variance is p * (1 - p), where p is the fraction
# of ones, so it is zero only when the column is all zeros or all ones.
selector = VarianceThreshold(threshold=0)
# Only the fitted support mask is needed below, so fit() is enough; the transformed array
# that fit_transform() would return was never used.
selector.fit(df)

# Select the surviving columns by position so df stays a DataFrame with its column names.
cols_idxs = selector.get_support(indices=True)
df = df.iloc[:,cols_idxs]
val_count(df)


0
properties_date
1    46850
0        2
Name: count, dtype: int64
1
properties_trace_trace_feature
0    41863
1     4989
Name: count, dtype: int64
2
properties_trace_trace_type
0    41863
1     4989
Name: count, dtype: int64
3
properties_name
1    46623
0      229
Name: count, dtype: int64
4
properties_time
1    46850
0        2
Name: count, dtype: int64
5
properties_id
1    46850
0        2
Name: count, dtype: int64
6
properties_modified_timestamp
1    46850
0        2
Name: count, dtype: int64
7
properties_self
1    46850
0        2
Name: count, dtype: int64
8
type
1    46850
0        2
Name: count, dtype: int64
9
properties_images
0    41871
1     4981
Name: count, dtype: int64
10
properties_images_notes
0    46636
1      216
Name: count, dtype: int64
11
properties_trace_contact_type
0    44796
1     2056
Name: count, dtype: int64
12
properties_trace_trace_notes
0    46773
1       79
Name: count, dtype: int64
13
properties_trace_geologic_structure_type
0    45133
1     1719
Name: co