# Mobilite Verisi Özellik Çıkarımı

Bu not defteri, `final_mobility_analysis_results.csv` dosyasındaki işlenmiş mobilite verilerinden cihaz (device_aid) bazında özellikler çıkarır.
Amaç, her bir cihazın davranışlarını ve tercihlerini yansıtan bir özellik seti oluşturmaktır.
Bu özellikler daha sonra persona oluşturma gibi analizlerde kullanılabilir.

## 1. Ortam Kurulumu ve Kütüphanelerin İçe Aktarılması

In [None]:
import os

import numpy as np
import pandas as pd
from google.colab import drive

## 2. Google Drive Bağlantısı ve Dosya Yolları

In [None]:
# Adjust these to your own Google Drive layout if needed; the base path is
# usually '/content/drive/MyDrive/'.
BASE_DRIVE_PATH = '/content/drive/MyDrive/'
INPUT_CSV_NAME = 'final_mobility_analysis_results.csv'
OUTPUT_FEATURES_CSV_NAME = 'device_features_v2.csv'

# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Full paths of the input CSV and of the feature CSV written at the end.
input_file_path, output_file_path = (
    os.path.join(BASE_DRIVE_PATH, csv_name)
    for csv_name in (INPUT_CSV_NAME, OUTPUT_FEATURES_CSV_NAME)
)

## 3. Veri Yükleme ve Ön Hazırlık

In [None]:
def ensure_timestamp_dt(df):
    """Make sure ``df`` has a datetime ``timestamp_dt`` column (in place).

    Candidate source columns are tried in priority order: ``timestamp_dt``,
    ``readable_time``, then ``timestamp`` (parsed with ``unit='s'``). The
    first candidate present in ``df`` is used. A candidate that already has
    a datetime dtype is reused as-is instead of being skipped, so an
    already-parsed column is never treated as "missing".

    Args:
        df: DataFrame to update.

    Returns:
        True if ``timestamp_dt`` is available afterwards, False if none of
        the candidate columns exists.
    """
    timestamp_sources = (
        ('timestamp_dt', {}),
        ('readable_time', {}),
        ('timestamp', {'unit': 's'}),
    )
    for source_col, to_datetime_kwargs in timestamp_sources:
        if source_col not in df.columns:
            continue
        if pd.api.types.is_datetime64_any_dtype(df[source_col]):
            df['timestamp_dt'] = df[source_col]
        else:
            df['timestamp_dt'] = pd.to_datetime(df[source_col], **to_datetime_kwargs)
        return True
    return False


# Load the processed mobility results; on any failure report it and leave
# df_results as None so the later cells skip their work.
try:
    df_results = pd.read_csv(input_file_path)
    print(f"'{INPUT_CSV_NAME}' başarıyla yüklendi. Boyut: {df_results.shape}")
except FileNotFoundError:
    print(f"HATA: Giriş dosyası bulunamadı: {input_file_path}")
    df_results = None
except Exception as e:
    print(f"HATA: Veri yüklenirken bir sorun oluştu: {e}")
    df_results = None

if df_results is not None and not ensure_timestamp_dt(df_results):
    print("HATA: Geçerli bir zaman damgası sütunu bulunamadı.")
    df_results = None

if df_results is not None:
    print("\nZaman damgası sütunu ('timestamp_dt') başarıyla datetime formatına getirildi.")
    # Keep the column names as they were before cleaning.
    original_df_results_cols = df_results.columns.tolist()
    # Strip every character that is not a letter, digit or underscore.
    df_results.columns = df_results.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)
    print("Sütun adları temizlendi.")

    # Polygon short name -> column holding the polygon's name. The matching
    # membership flag column is 'in_<short name>'. The 'lvl1'..'lvl4'
    # entries were removed from this map.
    polygon_info_map = {
        'airport': 'airport_NAME',
        'poi': 'poi_NAME',
        'clubs': 'clubs_Network',
        'hotels': 'hotels_MusteriTabelaAdi',
        'luxury_houses': 'luxuryhouses_POIAd',
        'turkey_sites': 'turkeysites_KonutSiteleri',
        'p_schools': 'pschools_name',
    }

    # Keep only polygons whose 'in_' column exists; the name column is
    # optional and recorded as None when it is missing.
    valid_polygon_info = {}
    for p_short, name_col in polygon_info_map.items():
        in_col = f'in_{p_short}'
        if in_col not in df_results.columns:
            continue
        if name_col and name_col in df_results.columns:
            valid_polygon_info[p_short] = {'in_col': in_col, 'name_col': name_col}
        else:
            valid_polygon_info[p_short] = {'in_col': in_col, 'name_col': None}
            if name_col:
                print(f"Uyarı: '{p_short}' için isim sütunu '{name_col}' bulunamadı, ancak '{in_col}' var.")

    polygon_cols_present = [info['in_col'] for info in valid_polygon_info.values()]
    print(f"\nAnalizde kullanılacak 'in_' poligon sütunları: {polygon_cols_present}")

## 4. Özellik Çıkarımı

In [None]:
# Hour-of-day buckets used by every time-of-day feature. Bins are
# left-closed: [0, 6) gece, [6, 12) sabah, [12, 18) ogle, [18, 24) aksam.
time_bins = [0, 6, 12, 18, 24]
time_labels = ['gece', 'sabah', 'ogle', 'aksam']
# Number of most-pinged named locations kept per device.
N_TOP_LOCATIONS = 3


def _counts_for_devices(counts, key, device_index):
    """Return column ``key`` of ``counts`` aligned to ``device_index``.

    Devices absent from ``counts`` (or a missing ``key`` column) yield 0
    rather than NaN, so every count feature is a complete integer column.
    """
    if key in counts.columns:
        return counts[key].reindex(device_index, fill_value=0).astype(int)
    return pd.Series(0, index=device_index, dtype=int)


def _ratio(numerator, denominator):
    """Divide element-wise; zero or missing denominators give 0."""
    return (numerator / denominator.replace(0, np.nan)).fillna(0)


def _dominant_night_names(df_results, df_features, valid_polygon_info):
    """Find each device's most frequent night-time location name.

    Only pings inside the device's dominant night polygon type are
    considered. Ties are resolved in favour of the name that sorts first,
    because ``groupby`` sorts its keys and ``idxmax`` returns the first
    maximum.

    Returns:
        Tuple ``(names, pings)`` of Series indexed like ``df_features``:
        the dominant name (NaN when unavailable) and the number of night
        pings at that name (0 when unavailable).
    """
    device_index = df_features.index
    night_rows = df_results[df_results['time_of_day'] == 'gece']
    per_type = []
    for p_short, info in valid_polygon_info.items():
        name_col = info['name_col']
        if name_col is None:
            continue
        has_type = (df_features['dominant_gece_location_type'] == p_short).to_numpy()
        devices_of_type = device_index[has_type]
        named = night_rows[
            (night_rows[info['in_col']] == 1)
            & night_rows[name_col].notna()
            & night_rows['device_aid'].isin(devices_of_type)
        ]
        if named.empty:
            continue
        name_counts = named.groupby(['device_aid', name_col]).size().reset_index(name='night_pings')
        best_rows = name_counts.loc[name_counts.groupby('device_aid')['night_pings'].idxmax()]
        per_type.append(best_rows.rename(columns={name_col: 'dominant_name'}).set_index('device_aid'))

    if not per_type:
        return (
            pd.Series(np.nan, index=device_index),
            pd.Series(0, index=device_index, dtype=int),
        )
    # A device has exactly one dominant type, so the concatenated index is unique.
    dominant = pd.concat(per_type)
    return (
        dominant['dominant_name'].reindex(device_index),
        dominant['night_pings'].reindex(device_index, fill_value=0).astype(int),
    )


def extract_device_features(df_results, valid_polygon_info):
    """Build one row of behavioural features per ``device_aid``.

    Args:
        df_results: Ping-level DataFrame with at least ``device_aid`` and a
            datetime ``timestamp_dt`` column, plus the ``in_*`` / name
            columns referenced by ``valid_polygon_info`` and optionally a
            ``neighborhood`` column.
        valid_polygon_info: Mapping of polygon short name to a dict with
            keys ``in_col`` (membership flag column) and ``name_col``
            (name column, or None).

    Returns:
        DataFrame indexed by the unique ``device_aid`` values.

    Side effects:
        ``df_results`` is modified in place: boolean / text ``in_*``
        columns are converted to 0/1 and the helper columns ``hour``,
        ``time_of_day``, ``day_of_week`` and ``is_weekend`` are added.
    """
    df_features = pd.DataFrame(index=df_results['device_aid'].unique())
    device_index = df_features.index
    print(f"Özellik DataFrame'i {len(df_features.index)} benzersiz device_aid için oluşturuldu.")

    # --- I. Basic activity features ---
    print("\nI. Temel Aktivite Özellikleri çıkarılıyor...")
    df_features['total_pings'] = df_results.groupby('device_aid').size()
    ping_days = df_results['timestamp_dt'].dt.normalize()
    df_features['unique_days_active'] = ping_days.groupby(df_results['device_aid']).nunique()
    ts_range = df_results.groupby('device_aid')['timestamp_dt'].agg(['min', 'max'])
    # Whole days between first and last ping; 0 when timestamps are missing.
    df_features['activity_span_days'] = (ts_range['max'] - ts_range['min']).dt.days.fillna(0).astype(int)

    # --- II. Per-polygon ping volume ---
    print("\nII. Lokasyon Bazlı Yoğunluk Özellikleri çıkarılıyor...")
    for p_short, info in valid_polygon_info.items():
        in_col = info['in_col']
        in_values = df_results[in_col]
        if pd.api.types.is_object_dtype(in_values) or pd.api.types.is_string_dtype(in_values):
            # Text / mixed flags: anything that is not a recognised truth value counts as 0.
            df_results[in_col] = in_values.map({'True': 1, 'False': 0, True: 1, False: 0}).fillna(0)
        elif pd.api.types.is_bool_dtype(in_values):
            df_results[in_col] = in_values.astype(int)

        df_features[f'pings_in_{p_short}'] = df_results.groupby('device_aid')[in_col].sum().astype(int)
        df_features[f'ratio_in_{p_short}'] = _ratio(df_features[f'pings_in_{p_short}'], df_features['total_pings'])

    # --- III. Location diversity ---
    print("\nIII. Lokasyon Çeşitliliği Özellikleri çıkarılıyor...")
    ping_cols_for_diversity = [f'pings_in_{p_short}' for p_short in valid_polygon_info]
    df_features['num_distinct_polygon_types_visited'] = df_features[ping_cols_for_diversity].gt(0).sum(axis=1)

    for p_short, info in valid_polygon_info.items():
        name_col = info['name_col']
        feature_col_name = f'num_distinct_{p_short}'
        if name_col and name_col in df_results.columns:
            distinct_counts = df_results[df_results[info['in_col']] == 1].groupby('device_aid')[name_col].nunique()
            # Explicit assignment instead of a chained in-place fillna, which
            # does not update the frame under pandas Copy-on-Write.
            df_features[feature_col_name] = distinct_counts.reindex(device_index, fill_value=0).astype(int)
        else:
            df_features[feature_col_name] = 0

    # --- IV. Time-of-day activity ---
    print("\nIV. Zaman Dilimi Bazlı Genel Aktivite Özellikleri çıkarılıyor...")
    df_results['hour'] = df_results['timestamp_dt'].dt.hour
    df_results['time_of_day'] = pd.cut(df_results['hour'], bins=time_bins, labels=time_labels, right=False, include_lowest=True)

    tod_counts = df_results.groupby(['device_aid', 'time_of_day'], observed=False).size().unstack(fill_value=0)
    for label in time_labels:
        df_features[f'pings_{label}'] = _counts_for_devices(tod_counts, label, device_index)
        df_features[f'ratio_{label}_pings'] = _ratio(df_features[f'pings_{label}'], df_features['total_pings'])

    # --- V. Time-of-day x polygon features ---
    print("\nV. Zaman Dilimi ve Lokasyon Bazlı Kombine Özellikler çıkarılıyor...")
    for p_short, info in valid_polygon_info.items():
        in_col = info['in_col']
        p_tod_counts = df_results[df_results[in_col] == 1].groupby(['device_aid', 'time_of_day'], observed=False).size().unstack(fill_value=0)
        for t_label in time_labels:
            col_name_p_t = f'pings_{p_short}_{t_label}'
            df_features[col_name_p_t] = _counts_for_devices(p_tod_counts, t_label, device_index)
            df_features[f'ratio_{p_short}_{t_label}_to_total_in_{p_short}'] = \
                _ratio(df_features[col_name_p_t], df_features[f'pings_in_{p_short}'])
            df_features[f'ratio_{p_short}_{t_label}_to_total_device_pings'] = \
                _ratio(df_features[col_name_p_t], df_features['total_pings'])

    # --- VI. Potential residence indicator (night pings) ---
    print("\nVI. Potansiyel İkametgah Göstergesi (Gece Sinyalleri) çıkarılıyor...")
    # Night pings per polygon type were already counted in section V.
    df_night_pings_by_type = pd.DataFrame(
        {p_short: df_features[f'pings_{p_short}_gece'] for p_short in valid_polygon_info}
    )

    if not df_night_pings_by_type.empty:
        no_night_polygon_pings = df_night_pings_by_type.sum(axis=1) == 0
        # idxmax picks the first column for all-zero rows, so blank those out.
        df_features['dominant_gece_location_type'] = df_night_pings_by_type.idxmax(axis=1).mask(no_night_polygon_pings)
        df_features['dominant_gece_location_ping_count'] = df_night_pings_by_type.max(axis=1).astype(int)
        df_features['ratio_dominant_gece_loc_pings_to_total_gece'] = \
            _ratio(df_features['dominant_gece_location_ping_count'], df_features['pings_gece'])
    else:
        df_features['dominant_gece_location_type'] = np.nan
        df_features['dominant_gece_location_ping_count'] = 0
        df_features['ratio_dominant_gece_loc_pings_to_total_gece'] = 0

    dominant_names, pings_at_dom_name = _dominant_night_names(df_results, df_features, valid_polygon_info)
    df_features['dominant_gece_location_name'] = dominant_names
    df_features['is_gece_location_consistent_type'] = df_features['ratio_dominant_gece_loc_pings_to_total_gece']
    df_features['is_gece_location_consistent_name'] = _ratio(pings_at_dom_name, df_features['pings_gece'])

    # --- VII. Weekday / weekend activity ---
    print("\nVII. Hafta İçi/Hafta Sonu Aktivite Özellikleri çıkarılıyor...")
    df_results['day_of_week'] = df_results['timestamp_dt'].dt.dayofweek
    df_results['is_weekend'] = df_results['day_of_week'] >= 5
    weekend_counts = df_results.groupby(['device_aid', 'is_weekend'], observed=False).size().unstack(fill_value=0)
    df_features['pings_weekday'] = _counts_for_devices(weekend_counts, False, device_index)
    df_features['pings_weekend'] = _counts_for_devices(weekend_counts, True, device_index)
    df_features['ratio_weekend_pings'] = _ratio(df_features['pings_weekend'], df_features['total_pings'])
    for p_short, info in valid_polygon_info.items():
        in_col = info['in_col']
        p_weekend_counts = df_results[df_results[in_col] == 1].groupby(['device_aid', 'is_weekend'], observed=False).size().unstack(fill_value=0)
        col_name_p_we = f'pings_{p_short}_weekend'
        df_features[f'pings_{p_short}_weekday'] = _counts_for_devices(p_weekend_counts, False, device_index)
        df_features[col_name_p_we] = _counts_for_devices(p_weekend_counts, True, device_index)
        df_features[f'ratio_{p_short}_weekend_to_total_{p_short}'] = \
            _ratio(df_features[col_name_p_we], df_features[f'pings_in_{p_short}'])

    # --- VIII. Top visited named locations ---
    print("\nVIII. Top Ziyaret Edilen Spesifik Mekanlar özellikleri çıkarılıyor...")
    all_locations_data = []
    for p_short, info in valid_polygon_info.items():
        in_col = info['in_col']
        name_col = info['name_col']
        if name_col and name_col in df_results.columns:
            named_rows = df_results.loc[
                (df_results[in_col] == 1) & df_results[name_col].notna(),
                ['device_aid', name_col],
            ]
            if not named_rows.empty:
                all_locations_data.append(
                    named_rows.rename(columns={name_col: 'location_name'}).assign(location_type_derived=p_short)
                )

    # The column-cleaning regex leaves the name 'neighborhood' unchanged, so
    # the column is either present under that exact name or not at all.
    cleaned_neighborhood_col = 'neighborhood' if 'neighborhood' in df_results.columns else None
    if cleaned_neighborhood_col is None:
        print("Uyarı: 'neighborhood' sütunu bulunamadı.")
    has_neighborhood = (
        cleaned_neighborhood_col is not None
        and df_results[cleaned_neighborhood_col].notna().any()
    )

    if has_neighborhood:
        neighborhood_rows = df_results.loc[
            df_results[cleaned_neighborhood_col].notna(),
            ['device_aid', cleaned_neighborhood_col],
        ]
        all_locations_data.append(
            neighborhood_rows.rename(columns={cleaned_neighborhood_col: 'location_name'}).assign(location_type_derived='neighborhood')
        )
    else:
        print(f"Uyarı: Mahalle sütunu ('{cleaned_neighborhood_col}') bulunamadı veya tamamı boş. Top mekanlara dahil edilmeyecek.")

    # Create the columns up front so they exist (and keep this order) even
    # when no named location is available; ping counts default to 0.
    for i in range(1, N_TOP_LOCATIONS + 1):
        df_features[f'top_{i}_loc_name'] = np.nan
        df_features[f'top_{i}_loc_type'] = np.nan
        df_features[f'top_{i}_loc_pings'] = 0

    if all_locations_data:
        df_all_named_locations = pd.concat(all_locations_data, ignore_index=True)
        location_ping_counts = df_all_named_locations.groupby(['device_aid', 'location_name', 'location_type_derived'], observed=False).size().reset_index(name='ping_count')
        # Rank every location within its device by ping count; the stable
        # sort keeps tied locations in their grouped order.
        ranked = location_ping_counts.sort_values('ping_count', ascending=False, kind='stable')
        ranked = ranked.assign(visit_rank=ranked.groupby('device_aid').cumcount() + 1)
        for i in range(1, N_TOP_LOCATIONS + 1):
            rank_i = ranked[ranked['visit_rank'] == i].set_index('device_aid')
            df_features[f'top_{i}_loc_name'] = rank_i['location_name'].reindex(device_index)
            df_features[f'top_{i}_loc_type'] = rank_i['location_type_derived'].reindex(device_index)
            df_features[f'top_{i}_loc_pings'] = rank_i['ping_count'].reindex(device_index, fill_value=0).astype(int)
    else:
        print("Uyarı: En çok ziyaret edilen mekan özellikleri için isimlendirilmiş lokasyon verisi bulunamadı.")

    # --- IX. Top neighborhood time-of-day analysis ---
    print("\nIX. Top Neighborhood Time-of-Day Analysis özellikleri çıkarılıyor...")
    top_neighborhood_name_col_feat = 'top_neighborhood_name'
    top_neighborhood_pings_col_feat = 'top_neighborhood_pings_total'

    df_features[top_neighborhood_name_col_feat] = np.nan
    df_features[top_neighborhood_pings_col_feat] = 0
    for t_label in time_labels:
        df_features[f'top_neighborhood_pings_{t_label}'] = 0
        df_features[f'ratio_top_neighborhood_{t_label}_to_total_top_neighborhood'] = 0

    if has_neighborhood:
        neighborhood_ping_counts_for_top = df_results.groupby(['device_aid', cleaned_neighborhood_col], observed=False).size().reset_index(name='pings_in_neighborhood')
        if not neighborhood_ping_counts_for_top.empty:
            # Row with the highest ping count for each device.
            idx_top_hood = neighborhood_ping_counts_for_top.groupby('device_aid')['pings_in_neighborhood'].idxmax()
            top_neighborhood_info_df = neighborhood_ping_counts_for_top.loc[idx_top_hood].set_index('device_aid')

            df_features[top_neighborhood_name_col_feat] = top_neighborhood_info_df[cleaned_neighborhood_col].reindex(device_index)
            df_features[top_neighborhood_pings_col_feat] = top_neighborhood_info_df['pings_in_neighborhood'].reindex(device_index, fill_value=0).astype(int)

            # Look up each ping's device top neighborhood directly. The
            # feature index is unnamed, so a merge on 'device_aid' against
            # its reset_index() has no such key column.
            top_hood_of_ping = df_results['device_aid'].map(df_features[top_neighborhood_name_col_feat])
            in_top_hood = df_results[cleaned_neighborhood_col] == top_hood_of_ping
            df_pings_in_top_hood = df_results.loc[in_top_hood, ['device_aid', 'time_of_day']]

            if not df_pings_in_top_hood.empty:
                top_hood_tod_counts = df_pings_in_top_hood.groupby(['device_aid', 'time_of_day'], observed=False).size().unstack(fill_value=0)
                for t_label in time_labels:
                    col_name = f'top_neighborhood_pings_{t_label}'
                    df_features[col_name] = _counts_for_devices(top_hood_tod_counts, t_label, device_index)
                    df_features[f'ratio_top_neighborhood_{t_label}_to_total_top_neighborhood'] = \
                        _ratio(df_features[col_name], df_features[top_neighborhood_pings_col_feat])
            else:
                print("Uyarı: Hiçbir cihaz için en sık ziyaret edilen mahallede zaman dilimi bazlı ping bulunamadı.")
        else:
            print("Uyarı: Mahalle ping sayıları hesaplanamadı (neighborhood_ping_counts_for_top boş).")
    else:
        print(f"Uyarı: Mahalle sütunu ('{cleaned_neighborhood_col}') bulunamadığı veya boş olduğu için en sık mahalle zaman özellikleri hesaplanamadı.")

    return df_features


# Run the extraction only when the loading cell produced usable data; the
# globals() check also covers the case where that cell was never run.
if (
    'df_results' in globals()
    and df_results is not None
    and 'device_aid' in df_results.columns
    and 'timestamp_dt' in df_results.columns
):
    df_features = extract_device_features(df_results, valid_polygon_info)
    print("\nÖzellik çıkarımı tamamlandı.")
else:
    print("Giriş verisi (df_results) yüklenemediği veya gerekli sütunlar eksik olduğu için özellik çıkarımı yapılamadı.")

## 5. Çıkarılan Özellikleri İnceleme ve Kaydetme

In [None]:
# Nothing to show or save when the extraction cell did not run, or when it
# produced an empty frame.
if 'df_features' not in locals() or df_features.empty:
    print("\nÖzellik DataFrame'i (df_features) oluşturulamadı veya boş.")
else:
    # Preview the first rows with every column visible, then restore the
    # display option to its default.
    print("\nÇıkarılan Özellikler DataFrame'i (df_features) - İlk 5 satır:")
    pd.set_option('display.max_columns', None)
    print(df_features.head())
    pd.reset_option('display.max_columns')

    print(f"\nÖzellik DataFrame Boyutu: {df_features.shape}")
    print("\nÖzellik Sütunları:")
    # One column name per line.
    print('\n'.join(map(str, df_features.columns)))

    # Write the features with the index column included; report any
    # failure instead of stopping the notebook.
    try:
        df_features.to_csv(output_file_path, index=True)
        print(f"\nÖzellikler başarıyla '{output_file_path}' dosyasına kaydedildi.")
    except Exception as e:
        print(f"HATA: Özellikler kaydedilirken bir sorun oluştu: {e}")