<a href="https://colab.research.google.com/github/kyochanpy/Google_Smartphone_Decimeter_Challenge/blob/main/PP/remove_device_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from glob import glob

In [2]:
def get_groundtruth(path: Path) -> pd.DataFrame:
    """Load every training ``ground_truth.csv`` under ``path`` into one frame.

    Files are discovered with the pattern ``<path>/train/*/*/ground_truth.csv``
    and read in sorted path order so the row order is reproducible.

    Parameters
    ----------
    path : Path or str
        Root directory that contains the ``train`` folder.

    Returns
    -------
    pd.DataFrame
        All ground-truth rows with a fresh 0..n-1 index. The columns
        ``latDeg``, ``lngDeg`` and ``heightAboveWgs84EllipsoidM`` are replaced
        by ``t_``-prefixed copies appended after the remaining columns.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern (previously this surfaced as an
        opaque ``KeyError`` from the column selection).
    """
    pattern = str(Path(path) / 'train/*/*/ground_truth.csv')
    csv_paths = sorted(glob(pattern))
    if not csv_paths:
        raise FileNotFoundError(f'no ground truth files match {pattern}')

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    output_df = pd.concat(
        [pd.read_csv(csv_path) for csv_path in csv_paths],
        ignore_index=True,
    )

    _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
    # assign() appends the prefixed copies at the end, keeping the same column
    # order as before: remaining columns first, then the t_* columns.
    output_df = output_df.assign(
        **{'t_' + col: output_df[col] for col in _columns}
    ).drop(columns=_columns)
    return output_df

In [3]:
def calc_haversine(lat1, lon1, lat2, lon2, radius=6_367_000):
    """Great-circle distance between two points using the haversine formula.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted as long as the four coordinate arguments broadcast together.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit. Defaults to
        6_367_000, the value this function previously hard-coded.

    Returns
    -------
    float or array-like
        Distance(s) along the sphere surface, in the unit of ``radius``.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    hav = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push the term marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN; clamp it to the valid domain.
    hav = np.clip(hav, 0.0, 1.0)
    return 2 * radius * np.arcsin(hav ** 0.5)

In [4]:
# Root folder holding the input CSVs (an absolute '/content/drive' path, so the
# cell only runs where that location exists).
base = Path('/content/drive/MyDrive/GSDC')
# Baseline location CSVs for the train and test splits.
train_base = pd.read_csv(base / 'baseline_locations_train.csv')
test_base = pd.read_csv(base / 'baseline_locations_test.csv')

# Merge the ground truth (t_-prefixed columns from get_groundtruth) onto the
# training baseline. merge() defaults to an inner join, so baseline rows
# without a matching ground-truth row are dropped.
train_base = train_base.merge(
    get_groundtruth(base),
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch']
)
# Sample submission file; not referenced by the other cells visible here.
sub = pd.read_csv(base / 'sample_submission.csv')

In [6]:
def _worst_device_by_collection(reference_df):
    """Map each multi-phone collection to the phone with the largest mean error."""
    # calc_haversine is element-wise, so whole columns can be passed at once
    # instead of applying it row by row on a slice of the frame.
    errors = calc_haversine(
        reference_df['latDeg'], reference_df['lngDeg'],
        reference_df['t_latDeg'], reference_df['t_lngDeg'],
    )
    mean_errors = errors.groupby(
        [reference_df['collectionName'], reference_df['phoneName']],
        sort=False,
    ).mean()

    worst = {}
    for collection, per_phone in mean_errors.groupby(level=0, sort=False):
        # A collection recorded by a single phone has nothing to fall back on.
        if len(per_phone) < 2 or per_phone.isna().all():
            continue
        # The index is (collectionName, phoneName); keep only the phone name.
        worst[collection] = per_phone.idxmax()[1]
    return worst


def _interpolate_over_device(coords, millis, is_removed):
    """Re-estimate the flagged rows of a time-sorted track by interpolating in time."""
    masked = coords.copy()
    masked[is_removed] = np.nan
    track = pd.DataFrame(masked, index=millis).interpolate(
        method='index', limit_area='inside'
    )
    filled = track.to_numpy(copy=True)
    # limit_area='inside' leaves leading/trailing gaps empty; those rows keep
    # their original coordinates (decided on the latitude column).
    unfilled = np.isnan(filled[:, 0])
    filled[unfilled] = coords[unfilled]
    return filled


def remove_device(input_df, reference_df=None):
    """Replace the least accurate phone's positions with interpolated ones.

    For every collection in ``reference_df`` recorded by more than one phone,
    the phone with the largest mean haversine error against the ground truth
    is selected. In ``input_df``, the ``latDeg``/``lngDeg`` values of that
    phone are discarded and re-estimated by interpolating over
    ``millisSinceGpsEpoch`` between the other phones of the same collection.
    Rows that cannot be interpolated (before the first or after the last
    remaining fix) keep their original values.

    The device is looked up by collection name. Collections of ``input_df``
    that have no entry in ``reference_df``, that contain a single phone, or
    that do not contain the selected phone are returned unchanged.

    Parameters
    ----------
    input_df : pd.DataFrame
        Needs the columns ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``. It is not modified.
    reference_df : pd.DataFrame, optional
        Frame with the same key columns plus ``t_latDeg``/``t_lngDeg`` used to
        rank the phones. Defaults to the notebook-level ``train_base``.

    Returns
    -------
    pd.DataFrame
        Copy of ``input_df`` with the same index and row order and updated
        ``latDeg``/``lngDeg`` values.
    """
    if reference_df is None:
        # Backward compatible default: the frame built in the loading cell.
        reference_df = train_base
    worst_by_collection = _worst_device_by_collection(reference_df)

    output_df = input_df.copy()
    coord_cols = [output_df.columns.get_loc(col) for col in ('latDeg', 'lngDeg')]
    millis_all = output_df['millisSinceGpsEpoch'].to_numpy()
    phones_all = output_df['phoneName'].to_numpy()

    groups = output_df.groupby('collectionName').indices
    for collection, positions in groups.items():
        device = worst_by_collection.get(collection)
        if device is None:
            continue
        is_removed = phones_all[positions] == device
        # Skip when the device is absent or is the only phone in the collection.
        if not is_removed.any() or is_removed.all():
            continue

        # Interpolation runs along time, so order the rows of this collection
        # by timestamp and write the result back to the same row positions.
        order = np.argsort(millis_all[positions], kind='stable')
        positions = positions[order]
        coords = output_df.iloc[positions, coord_cols].to_numpy(dtype=float)
        output_df.iloc[positions, coord_cols] = _interpolate_over_device(
            coords, millis_all[positions], is_removed[order]
        )

    return output_df

In [7]:
# Run the device-removal step on the test baseline; as the last expression of
# the cell, the returned DataFrame is what gets displayed as the cell output.
remove_device(test_base)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416554,-122.082085,-30.69,2020-05-15-US-MTV-1_Pixel4
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416589,-122.082072,-31.76,2020-05-15-US-MTV-1_Pixel4
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416571,-122.082071,-31.65,2020-05-15-US-MTV-1_Pixel4
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416577,-122.082073,-31.52,2020-05-15-US-MTV-1_Pixel4
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416626,-122.082041,-28.95,2020-05-15-US-MTV-1_Pixel4
...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763185000,37.334539,-121.899383,-8.39,2021-04-29-US-SJC-3_SamsungS20Ultra
91482,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763186000,37.334545,-121.899380,-7.36,2021-04-29-US-SJC-3_SamsungS20Ultra
91483,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763187000,37.334551,-121.899371,-4.08,2021-04-29-US-SJC-3_SamsungS20Ultra
91484,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763188000,37.334540,-121.899371,-5.70,2021-04-29-US-SJC-3_SamsungS20Ultra
