In [1]:
import json
import itertools
import math
import csv
from tqdm import tqdm
import os

### File finder

In [2]:
def find_the_way(path,file_format,con=""):
    """Collect paths of files under ``path`` whose names contain given substrings.

    Walks ``path`` recursively with ``os.walk`` and keeps every file whose
    name contains ``file_format`` and also contains ``con``. Both checks are
    plain substring tests on the file name only (not on the directory part),
    so ``file_format`` does not have to be the real extension of the file.

    Args:
        path: Directory at which the recursive walk starts.
        file_format: Substring the file name must contain (e.g. ``".csv"``).
        con: Additional substring the file name must contain. The default
            empty string is contained in every name, so it filters nothing.

    Returns:
        list: Matching paths, each built as ``os.path.join(root, name)``, in
        the order in which ``os.walk`` yields them.
    """
    matches = []
    for root, _subdirs, names in os.walk(path):
        matches.extend(
            os.path.join(root, name)
            for name in names
            if file_format in name and con in name
        )
    return matches




### Calculate WiFi distance features between fingerprints. For each pair of fingerprints, the WiFi values of the MAC addresses present in both are treated as two vectors, and several distance metrics between these vectors are computed. The resulting values are the extracted features for that pair.

In [3]:
import csv
import numpy as np
import scipy.spatial.distance
from tqdm import tqdm

## 1.4 Feature Extraction (Intersection Based) - Simplified
def feature_extraction_file(data, name, fps):
    """Compute pairwise features for every entry of ``data`` and write them to a CSV.

    For each entry, the fingerprints ``fps[entry[0]]`` and ``fps[entry[1]]``
    are passed to ``feature_extraction``; ``entry[2]`` is appended to the
    returned feature list as the last column (``"real"``). All rows are
    collected in memory and written in one go, preceded by a header row.

    Args:
        data: Iterable of indexable entries; index 0 and 1 are keys into
            ``fps`` and index 2 is the value stored in the ``"real"`` column.
        name: Path of the CSV file to create (an existing file is overwritten).
        fps: Mapping from fingerprint key to the fingerprint object expected
            by ``feature_extraction``.
    """
    header = [
        "correlation",
        "chebyshev",
        "intersecting_mac",
        "euclidean",
        "cosine",
        "jensenshannon",
        "jaccard",
        "canberra",
        "minkowski",
        "real",
    ]
    rows = [header]

    for entry in tqdm(data, position=0, leave=True):
        row = feature_extraction(fps[entry[0]], fps[entry[1]])
        row.append(entry[2])
        rows.append(row)

    with open(name, "w", newline='') as out_file:
        csv.writer(out_file).writerows(rows)

def _safe_distance(func, v1, v2, default_value=0):
    """Return ``func(v1, v2)``, or ``default_value`` when it fails or is NaN/inf."""
    try:
        result = func(v1, v2)
        if np.isnan(result) or np.isinf(result):
            return default_value
        return result
    except (ValueError, ZeroDivisionError, RuntimeWarning):
        return default_value


def feature_extraction(fp1, fp2, threshold=-70):
    """Compute distance features between two fingerprints over their shared MACs.

    Only MAC addresses present in both fingerprints are used: their values
    form two equally ordered vectors on which the distance metrics are
    evaluated. Metrics that fail or produce NaN/inf fall back to a default
    (1.0 for correlation, cosine and jaccard; 0 for the others).

    Args:
        fp1: Mapping from MAC address to a numeric WiFi value.
        fp2: Mapping from MAC address to a numeric WiFi value.
        threshold: Values strictly greater than this are encoded as 1 (else 0)
            when building the binary vectors for the Jaccard distance.
            Defaults to -70, the value that was previously hard-coded.

    Returns:
        list: Nine values in this order: correlation, chebyshev,
        intersecting_mac (number of shared MACs), euclidean, cosine,
        jensenshannon, jaccard, canberra, minkowski. When the fingerprints
        share no MAC, every entry (including intersecting_mac) is 0.
    """
    # Find the MAC addresses shared by both fingerprints.
    common_macs = set(fp1.keys()).intersection(fp2.keys())

    # With an empty intersection there is nothing to compare: report all zeros.
    if not common_macs:
        return [0, 0, 0, 0, 0, 0, 0, 0, 0]

    # Number of shared MACs.
    intersecting_mac = len(common_macs)

    # Build both vectors from the shared MACs only. Iterating the same,
    # unmodified set twice keeps the two vectors aligned element by element.
    f1 = [fp1[mac] for mac in common_macs]
    f2 = [fp2[mac] for mac in common_macs]

    f1_arr = np.array(f1, dtype=float)
    f2_arr = np.array(f2, dtype=float)

    correlation = _safe_distance(scipy.spatial.distance.correlation, f1_arr, f2_arr, 1.0)
    chebyshev = _safe_distance(scipy.spatial.distance.chebyshev, f1_arr, f2_arr, 0)
    euclidean = _safe_distance(scipy.spatial.distance.euclidean, f1_arr, f2_arr, 0)

    # Cosine distance is undefined only for a zero-norm vector. A constant
    # but non-zero vector has a perfectly valid cosine distance, so the guard
    # must test for all-zero vectors rather than for zero standard deviation.
    f1_is_zero = not np.any(f1_arr)
    f2_is_zero = not np.any(f2_arr)
    if f1_is_zero and f2_is_zero:
        cosine = 0  # both vectors are all-zero: treat as identical
    elif f1_is_zero or f2_is_zero:
        cosine = 1  # exactly one all-zero vector: treat as maximally distant
    else:
        cosine = _safe_distance(scipy.spatial.distance.cosine, f1_arr, f2_arr, 1.0)

    jensenshannon = _safe_distance(scipy.spatial.distance.jensenshannon, f1_arr, f2_arr, 0)
    canberra = _safe_distance(scipy.spatial.distance.canberra, f1_arr, f2_arr, 0)
    # scipy's minkowski defaults to p=2, so this column equals euclidean; it is
    # kept so the output layout stays unchanged.
    minkowski = _safe_distance(scipy.spatial.distance.minkowski, f1_arr, f2_arr, 0)

    # Binary vectors for the Jaccard distance: 1 where the value exceeds the threshold.
    f1_binary = [1 if x > threshold else 0 for x in f1]
    f2_binary = [1 if x > threshold else 0 for x in f2]

    # Handle all-zero binary vectors explicitly instead of relying on scipy.
    f1_has_one = any(f1_binary)
    f2_has_one = any(f2_binary)
    if not f1_has_one and not f2_has_one:
        jaccard = 0  # both all-zero: treat as identical
    elif not f1_has_one or not f2_has_one:
        jaccard = 1  # exactly one all-zero: maximally distant
    else:
        jaccard = _safe_distance(scipy.spatial.distance.jaccard, f1_binary, f2_binary, 1.0)

    output_data = [correlation,
                   chebyshev,
                   intersecting_mac,
                   euclidean,
                   cosine,
                   jensenshannon,
                   jaccard,
                   canberra,
                   minkowski]

    # Replace any remaining NaN (the only value for which x != x) with 0.
    return [0 if x != x else x for x in output_data]

In [4]:
## 1.3 Loading the data
import csv
import json
import os
from tqdm import tqdm

In [None]:

# Input CSV files to process. Each must provide the columns 'id1', 'id2'
# and 'estimated_distance', which are read below.
prelist = [
    # './task2_train_elevations.csv',
    './task2_train_estimated_wifi_distances.csv',
    # './task2_train_steps.csv',
]

# The fingerprints do not depend on the input CSV, so load them once.
with open("task2_train_fingerprints.json") as f:
    fps_train = json.load(f)

for p in prelist:

    print(p)

    # newline='' lets the csv module handle line endings itself.
    with open(p, newline='') as f:
        train_data = []
        train_h = csv.DictReader(f)
        for pair in tqdm(train_h):
            train_data.append([pair['id1'],pair['id2'],float(pair['estimated_distance'])])
    print("Train Data loaded!!")

    # Derive the output path. Inputs named "...@.csv" keep the original
    # "@.csv" -> "-distance.csv" mapping. For any other name str.replace would
    # return the input path unchanged and the features would overwrite the
    # input file, so fall back to inserting "-distance" before the extension.
    out_path = p.replace("@.csv","-distance.csv")
    if out_path == p:
        out_path = os.path.splitext(p)[0] + "-distance.csv"

    feature_extraction_file(train_data,out_path,fps_train)
    print("\n\n")
    print("*"*100)

./task2_train_estimated_wifi_distances.csv


1944671it [00:11, 163855.10it/s]


Train Data loaded!!


  1%|▌                                                                       | 13590/1944671 [00:06<15:45, 2041.86it/s]