In [10]:
%%time
import dask
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon, Point
import json
import datetime
from time import strftime, localtime
# Build the dask dataframe from the access-point CSV and discard the two
# POSITION.GRID columns in the same expression.
df = dd.read_csv(
    '/Users/yusufkemahli/task_1_datas/first_task.access_points_wout_tenant.csv'
).drop(columns=['POSITION.GRID[0]', 'POSITION.GRID[1]'])

CPU times: user 1.79 ms, sys: 52 μs, total: 1.85 ms
Wall time: 1.82 ms


In [4]:
def get_points(my_path):
    """Read a JSON file and turn its first coordinate list into shapely Points.

    The file is expected to hold a list whose first element exposes
    ``['geometry']['coordinates'][0]`` as a sequence of coordinate pairs.

    Args:
        my_path: Path of the JSON file to read.

    Returns:
        A list of ``Point`` objects, one per coordinate pair, in file order.
    """
    with open(my_path) as handle:
        payload = json.load(handle)
    ring = payload[0]['geometry']['coordinates'][0]
    points = []
    for pair in ring:
        # The stored pair is swapped: element 1 becomes x, element 0 becomes y.
        points.append(Point(pair[1], pair[0]))
    return points

In [3]:
def find_max_min_array(my_path):
    """Compute the axis-aligned extremes of the first coordinate list in a JSON file.

    The file is expected to hold a list whose first element exposes
    ``['geometry']['coordinates'][0]`` as a sequence of two-element pairs.

    Args:
        my_path: Path of the JSON file to read.

    Returns:
        ``[max_x, min_x, max_y, min_y]`` where X is the second element of each
        stored pair and Y is the first.
    """
    with open(my_path) as handle:
        payload = json.load(handle)
    ring = payload[0]['geometry']['coordinates'][0]
    # Stored pairs are interpreted as (Y, X).
    frame = pd.DataFrame(ring, columns=['Y', 'X'])
    xs, ys = frame['X'], frame['Y']
    return [xs.max(), xs.min(), ys.max(), ys.min()]

In [4]:
def first_filter(my_data_frame,my_path):
    """Keep only rows lying strictly inside the bounding box of a polygon file.

    This is a cheap pre-filter: the box limits come from
    ``find_max_min_array(my_path)`` and all four comparisons are strict, so
    rows sitting exactly on a box edge are dropped.

    Args:
        my_data_frame: Frame with ``POSITION.X`` and ``POSITION.Y`` columns.
        my_path: Path of the JSON file describing the polygon.

    Returns:
        The rows of ``my_data_frame`` whose position falls inside the box.
    """
    max_x, min_x, max_y, min_y = find_max_min_array(my_path)
    xs = my_data_frame['POSITION.X']
    ys = my_data_frame['POSITION.Y']
    inside_box = (xs < max_x) & (xs > min_x) & (ys < max_y) & (ys > min_y)
    return my_data_frame.loc[inside_box]

In [5]:
%%time
def filter_within_polygon(my_data_frame, my_path):
    """Return the rows whose position lies within the polygon stored in ``my_path``.

    Rows are first narrowed with ``first_filter`` (bounding box), then each
    remaining row is tested against the polygon built from ``get_points``.

    Args:
        my_data_frame: Dask dataframe with ``POSITION.X`` / ``POSITION.Y`` columns.
        my_path: Path of the JSON file describing the polygon.

    Returns:
        The filtered dataframe, without the temporary ``within_polygon`` column.
    """
    area = Polygon(get_points(my_path))

    def _is_inside(row):
        # Per-row point-in-polygon test against the polygon built above.
        return Point(row['POSITION.X'], row['POSITION.Y']).within(area)

    candidates = first_filter(my_data_frame, my_path)
    flags = candidates.apply(_is_inside, axis=1, meta=(None, 'bool'))
    flagged = candidates.assign(within_polygon=flags)
    return flagged[flagged['within_polygon']].drop(columns='within_polygon')
    
# Run the same polygon filter over `df` once per area file; results are
# unpacked in the order fids, shop, east, west.
filtered_fids_df, filtered_shop_df, filtered_east_df, filtered_west_df = (
    filter_within_polygon(df, polygon_path)
    for polygon_path in (
        '/Users/yusufkemahli/task_1_datas/wifi.fids_coordinates.json',
        '/Users/yusufkemahli/task_1_datas/wifi.shop_area_coordinates.json',
        '/Users/yusufkemahli/task_1_datas/wifi.shop_area_east_coordinates.json',
        '/Users/yusufkemahli/task_1_datas/wifi.shop_area_west_coordinates.json',
    )
)


CPU times: user 13.3 ms, sys: 2.27 ms, total: 15.6 ms
Wall time: 16 ms


In [6]:
def convert_date_time(df):
    """Add three formatted string columns derived from ``WINDOW_START``.

    ``WINDOW_START`` is passed to ``time.localtime``, so it is treated as
    seconds since the epoch and rendered in the timezone of the machine that
    executes the computation.

    Each new column is produced by mapping over the single ``WINDOW_START``
    column instead of applying a function over every whole row, which avoids
    building a row object per record for a one-column transformation.

    Args:
        df: Dask dataframe containing a ``WINDOW_START`` column.

    Returns:
        A new dataframe with these columns appended, in this order:
        ``WINDOW_START_DATETIME`` (``%Y-%m-%d %H:%M:%S``),
        ``WINDOW_START_DATE`` (``%Y-%m-%d``) and
        ``WINDOW_START_TIME`` (``%H:%M:%S``).
    """
    def _formatted(pattern, column_name):
        # One element-wise pass over the timestamp column for the given format.
        return df['WINDOW_START'].map(
            lambda stamp: strftime(pattern, localtime(stamp)),
            meta=(column_name, 'object'),
        )

    return df.assign(
        WINDOW_START_DATETIME=_formatted('%Y-%m-%d %H:%M:%S', 'WINDOW_START_DATETIME'),
        WINDOW_START_DATE=_formatted('%Y-%m-%d', 'WINDOW_START_DATE'),
        WINDOW_START_TIME=_formatted('%H:%M:%S', 'WINDOW_START_TIME'),
    )


# Attach the formatted date/time columns to each of the four filtered frames,
# rebinding every name to its converted version.
filtered_fids_df, filtered_shop_df, filtered_east_df, filtered_west_df = map(
    convert_date_time,
    (filtered_fids_df, filtered_shop_df, filtered_east_df, filtered_west_df),
)

CPU times: user 6.86 ms, sys: 806 μs, total: 7.66 ms
Wall time: 7.05 ms


In [7]:
%%time
def group_df(my_filtered_df, my_column_name, rows_per_minute=2):
    """Count rows per (CLIMAC, WINDOW_START_DATE) and convert the count to minutes.

    The number of rows in each group is divided by ``rows_per_minute`` and
    stored under ``my_column_name``.

    Args:
        my_filtered_df: Frame with ``CLIMAC`` and ``WINDOW_START_DATE`` columns.
        my_column_name: Name of the output column holding the converted count.
        rows_per_minute: How many rows correspond to one minute. Defaults to 2,
            the value that used to be hard-coded.
            NOTE(review): presumably each row covers a 30-second window --
            confirm against the data source.

    Returns:
        A frame with columns ``CLIMAC`` (string), ``WINDOW_START_DATE``
        (object) and ``my_column_name`` (float64).

    Raises:
        ValueError: If ``rows_per_minute`` is not strictly positive.
    """
    if rows_per_minute <= 0:
        raise ValueError("rows_per_minute must be positive")

    grouped_df = my_filtered_df.groupby(['CLIMAC', 'WINDOW_START_DATE']).size().reset_index()
    # The size column comes back unnamed; give all three columns explicit names.
    grouped_df.columns = ['CLIMAC', 'WINDOW_START_DATE', 'MEMBER_COUNT']
    grouped_df[my_column_name] = grouped_df['MEMBER_COUNT'] / rows_per_minute
    grouped_df = grouped_df.drop(columns='MEMBER_COUNT')
    return grouped_df.astype(
        {'CLIMAC': 'string', 'WINDOW_START_DATE': 'object', my_column_name: 'float64'}
    )

# Aggregate each filtered frame into per-(CLIMAC, date) minutes, giving every
# area its own output column name.
grouped_fids_df, grouped_shop_df, grouped_east_df, grouped_west_df = (
    group_df(area_frame, minutes_column)
    for area_frame, minutes_column in (
        (filtered_fids_df, 'FIDS_TOTAL_TIME_IN_MINUTE'),
        (filtered_shop_df, 'SHA_TOTAL_TIME_IN_MINUTE'),
        (filtered_east_df, 'SHA_EAST_TOTAL_TIME_IN_MINUTE'),
        (filtered_west_df, 'SHA_WEST_TOTAL_TIME_IN_MINUTE'),
    )
)



CPU times: user 18.2 ms, sys: 1.36 ms, total: 19.5 ms
Wall time: 19.2 ms


In [8]:
%%time
# Labels for the general duration buckets and for the finer-grained buckets.
categories = ["0-15", "15-30", "30-60", "60-90", "90-120", "120+"]
special_categories = ['0-5', '5-10', '10-15', '15-20', '20+']


def _first_label_within(my_float, upper_bounds, labels):
    """Return the label of the first bucket whose inclusive upper bound fits.

    ``upper_bounds`` lines up with the leading entries of ``labels``; the last
    label is the open-ended bucket and is returned when no bound matches
    (this includes NaN, for which every comparison is False).
    """
    for bound, label in zip(upper_bounds, labels):
        if my_float <= bound:
            return label
    return labels[-1]


def selection(my_float):
    """Map a value to one of ``categories`` using inclusive upper bounds.

    Bounds are 15, 30, 60, 90 and 120; anything above 120 gets the last label.
    """
    return _first_label_within(my_float, (15, 30, 60, 90, 120), categories)


def special_selection(my_float):
    """Map a value to one of ``special_categories`` using inclusive upper bounds.

    Bounds are 5, 10, 15 and 20; anything above 20 gets the last label.
    """
    return _first_label_within(my_float, (5, 10, 15, 20), special_categories)

def categorization(my_df,my_column_name,my_selection_column_name):
    """Add a bucket-label column computed by ``selection`` from another column.

    The labels are produced by mapping ``selection`` over the single source
    column instead of applying a function over every whole row, which avoids
    building a row object per record for a one-column lookup.

    Args:
        my_df: Dask dataframe to extend.
        my_column_name: Name of the new label column.
        my_selection_column_name: Name of the numeric column to classify.

    Returns:
        A new dataframe with ``my_column_name`` added (object dtype).
    """
    labels = my_df[my_selection_column_name].map(
        selection, meta=(my_column_name, 'object')
    )
    return my_df.assign(**{my_column_name: labels})
    
def special_categorization(my_df):
    """Add ``FIDS_SPECIAL_CATEGORY`` by bucketing ``FIDS_TOTAL_TIME_IN_MINUTE``.

    The labels come from ``special_selection`` mapped over the single source
    column instead of a function applied over every whole row, which avoids
    building a row object per record for a one-column lookup.

    Args:
        my_df: Dask dataframe containing ``FIDS_TOTAL_TIME_IN_MINUTE``.

    Returns:
        A new dataframe with ``FIDS_SPECIAL_CATEGORY`` added (object dtype).
    """
    labels = my_df['FIDS_TOTAL_TIME_IN_MINUTE'].map(
        special_selection, meta=('FIDS_SPECIAL_CATEGORY', 'object')
    )
    return my_df.assign(FIDS_SPECIAL_CATEGORY=labels)


# The FIDS frame gets both the general and the finer-grained label column.
grouped_fids_df = special_categorization(
    categorization(grouped_fids_df, 'FIDS_CATEGORY', 'FIDS_TOTAL_TIME_IN_MINUTE')
)

# The three shop-area frames only get the general label column.
grouped_shop_df, grouped_east_df, grouped_west_df = (
    categorization(area_frame, label_column, minutes_column)
    for area_frame, label_column, minutes_column in (
        (grouped_shop_df, 'SHA_CATEGORY', 'SHA_TOTAL_TIME_IN_MINUTE'),
        (grouped_east_df, 'SHA_EAST_CATEGORY', 'SHA_EAST_TOTAL_TIME_IN_MINUTE'),
        (grouped_west_df, 'SHA_WEST_CATEGORY', 'SHA_WEST_TOTAL_TIME_IN_MINUTE'),
    )
)

CPU times: user 2.93 ms, sys: 262 μs, total: 3.19 ms
Wall time: 3.04 ms


In [48]:
# Inner join: keep only (CLIMAC, date) pairs present in both the FIDS and shop frames.
two_merged_df = grouped_fids_df.merge(
    grouped_shop_df, on=['CLIMAC', 'WINDOW_START_DATE'], how='inner'
)

# Left joins: east/west columns are attached where available; rows are never dropped.
three_merged_df = two_merged_df.merge(
    grouped_east_df, on=['CLIMAC', 'WINDOW_START_DATE'], how='left'
)

four_merged_df = three_merged_df.merge(
    grouped_west_df, on=['CLIMAC', 'WINDOW_START_DATE'], how='left'
)

CPU times: user 3.91 ms, sys: 653 μs, total: 4.56 ms
Wall time: 4.43 ms


In [None]:
# Evaluate the merged dask dataframe and write the result to
# 'final_dataframe.csv' (relative to the working directory) without the index.
four_merged_df.compute().to_csv('final_dataframe.csv',index=False)

In [11]:
# Show the column dtypes of the merged frame as a schema check.
print(four_merged_df.dtypes)

CLIMAC                           string[python]
WINDOW_START_DATE                        object
FIDS_TOTAL_TIME_IN_MINUTE               float64
FIDS_CATEGORY                            object
FIDS_SPECIAL_CATEGORY                    object
SHA_TOTAL_TIME_IN_MINUTE                float64
SHA_CATEGORY                             object
SHA_EAST_TOTAL_TIME_IN_MINUTE           float64
SHA_EAST_CATEGORY                        object
SHA_WEST_TOTAL_TIME_IN_MINUTE           float64
SHA_WEST_CATEGORY                        object
dtype: object
