In [1]:
import os

# Switch the working directory to the "utils" folder that sits next to the
# directory the notebook was started from. Presumably this is what makes the
# later `from utils import ...` and the "../ComputedData/..." relative paths
# resolve -- verify against the repository layout.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
analyze_path = os.path.join(parent_dir, "utils")
os.chdir(analyze_path)

In [2]:
from collections import Counter

import pandas as pd

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tdam.cover import CubicalCover
from tdam.clustering import FailSafeClustering
from tdam.core_old import MapperAlgorithm

from TrafficTDApythonUtils.utils_v3 import *
from TrafficTDApythonUtils.plots import *

In [None]:
from utils import read_data, read_taiwan_specific

# Load the inputs through the project helpers imported above.
combined_data = read_data()

# With read_grid=True the helper returns a pair; the second element
# (`grid_filter`) later supplies the 'hotspot' column.
taiwan_specific = read_taiwan_specific(read_grid=True)
taiwan, grid_filter = taiwan_specific

In [None]:
# Feature table used as model input. The filtered file is the active choice;
# the previously used alternative was "../ComputedData/ForModel/all_features.csv".
features_csv_path = "../ComputedData/ForModel/all_features_filtered.csv"
all_features_df = pd.read_csv(features_csv_path)

## Start Mapper

In [None]:
pc = 9  # number of principal components kept for the lens

# Fit PCA a single time and reuse the fitted model for both the projection
# (`lens`, passed to the Mapper algorithm below) and the explained-variance
# report. Fitting two separate PCA objects did the decomposition twice and
# reported ratios from a model other than the one that produced `lens`.
pca = PCA(pc)
lens = pca.fit_transform(all_features_df)

# Per-component share of variance, followed by the total share retained.
ratios = pca.explained_variance_ratio_
print(ratios)
print(ratios.sum())

In [None]:
import pickle

In [None]:
# Parameter grid for the Mapper cover.
overlaps = [2, 5]           # in tenths: passed to the cover as overlap / 10
intervals = [7, 8, 9, 10]   # values tried for the cover's n_intervals
detailed_results = []
silhouette_for_intervals = []

# Create the output directory up front. Without this, a missing directory is
# only discovered when the first pickle is written, i.e. after a full Mapper
# run has already been computed and is then lost.
results_dir = "../ComputedData/ForMatrixFilter"
os.makedirs(results_dir, exist_ok=True)

for overlap in overlaps:
    for interval in intervals:
        print(f"Processing overlap {overlap}, interval {interval}")
        mapper_algo = MapperAlgorithm(
            cover=CubicalCover(
                n_intervals=interval,
                overlap_frac=overlap / 10
            ),
            clustering=FailSafeClustering(
                KMeans(
                    n_clusters=2,
                    random_state=42
                )
            ),
            n_jobs=14
        )

        mapper_info = mapper_algo.fit_transform(all_features_df.to_numpy(), lens)

        # The second element of the Mapper output is recorded as the
        # silhouette value for this configuration.
        silhouette_for_intervals.append(mapper_info[1])
        result = {
            "overlap": overlap,
            "interval": interval,
            "silhouette": mapper_info[1],
            "mapper_info": mapper_info
        }
        detailed_results.append(result)

        # Persist each configuration separately so the plotting cell can
        # reload it by (overlap, interval) without recomputing.
        with open(f"{results_dir}/o{overlap}i{interval}.pkl", 'wb') as file:
            pickle.dump(result, file)

# One row per (overlap, interval) configuration.
detailed_results_df = pd.DataFrame(detailed_results)

In [None]:
# Attach the categorical labels used to colour the Mapper graphs.
# NOTE(review): these columns are written into `all_features_df` in place, so
# the frame is no longer purely numeric afterwards. Re-running the PCA or
# Mapper cells without reloading the CSV will then see these string columns.
all_features_df['hotspot'] = grid_filter['hotspot']
all_features_df['hotspot'] = all_features_df['hotspot'].apply(lambda x: 'Hotspot' if 'Hotspot' in str(x) else 'Not Hotspot')
all_features_df['youbike'] = all_features_df['youbike_100m_count_mean'].apply(lambda x: 'Facility' if x>0 else 'No Facility')
all_features_df['hotspot_youbike'] = all_features_df['hotspot'] + '_' + all_features_df['youbike']

# Column whose values label and colour the nodes.
choose = 'hotspot_youbike'


def most_common_encoded_label(data):
    """Return the most frequent item in `data`.

    Raises IndexError when `data` is empty, because there is no most common
    element to select.
    """
    return Counter(data).most_common(1)[0][0]


# Make sure the figure directory exists before the first plot is saved
# (presumably `plot` writes to `save_path` -- creating the directory is harmless
# either way).
plots_dir = "../ComputedData/ForMatrixFilter/Plots"
os.makedirs(plots_dir, exist_ok=True)

for overlap in overlaps:
    for interval in intervals:
        # Reload the result saved by the Mapper cell. These pickles are produced
        # by this notebook itself; never point this at untrusted files, since
        # unpickling can execute arbitrary code.
        # The loaded dict gets its own name so that it does not overwrite the
        # `detailed_results_df` DataFrame built in the previous cell.
        with open(f"../ComputedData/ForMatrixFilter/o{overlap}i{interval}.pkl", "rb") as file:
            saved_result = pickle.load(file)

        mapper_plotter = MapperPlotter(saved_result['mapper_info'],
                                       all_features_df, seed=87, iterations=30, dim=3,
                                       range_lst=[-0.5, 0.5, 0.5, -0.5])

        # Call order is kept exactly as before; the plotter methods may depend
        # on state set by the preceding calls.
        mapper_plot = mapper_plotter.create_mapper_plot(choose, most_common_encoded_label, avg=False)
        full_info = mapper_plotter.extract_data()
        mapper_plotter.map_colors(choose, size=10, threshold=0)
        mapper_plotter.plot(choose, avg=False, set_label=True, size=1000, anchor=1.33,
                            save_path=f"{plots_dir}/o{overlap}i{interval}.png")