In [4]:
import pandas as pd
import matplotlib.pyplot as plt

In [139]:
df_k1000 = pd.read_csv('../data/dataset/5.df_k1000_clustering.csv', index_col=0)
df_k2000 = pd.read_csv('../data/dataset/5.df_k2000_clustering.csv', index_col=0)
df_k3000 = pd.read_csv('../data/dataset/5.df_k3000_clustering.csv', index_col=0)
df_k4000 = pd.read_csv('../data/dataset/5.df_k4000_clustering.csv', index_col=0)
df_k5000 = pd.read_csv('../data/dataset/5.df_k5000_clustering.csv', index_col=0)

In [140]:
def concat_frames(frames):
  for i in range(len(frames)):
    f = frames[i]
    f['cluster'] = f['cluster'].astype('int') + 1000*i
    frames[i] = f
    
  pd.concat(frames, axis=0)
  
concat_frames([df_k1000, df_k2000])

In [161]:
candidates_full_cols = pd.concat([df_k1000, df_k2000, df_k3000, df_k4000, df_k5000], axis=0)

18547

### TODO
-  groupby, sum
- 데이터 형식 바꾸기. cluster: subset, cell_id: element, count_cust: cost

In [144]:
candidates = candidates_full_cols.copy().groupby(['cluster', 'cell_id']).sum().reset_index()

In [147]:
candidates = candidates.copy()[['cluster', 'cell_id', 'count_cust']]

In [113]:
from collections import defaultdict

universe = set([])
subsets = defaultdict(set)
demands = defaultdict(int)

for _, r in candidates.iterrows():
  cluster, cell_id, count_cust = r
  
  cell_id = str(int(cell_id))
  cluster = str(int(cluster))
  universe.add(cell_id)
  subsets[cluster].add(cell_id)
  demands[cell_id] = count_cust

In [114]:
def greedy_set_cover(universe, subsets, costs):
    elements = set(e for s in subsets.keys() for e in subsets[s])
    # elements don't cover universe -> invalid input for set cover
    if elements != universe:
        return None

    # track elements of universe covered
    covered = set()
    cover_sets = []

    while covered != universe:
        min_cost_elem_ratio = float("inf")
        min_set = None
        # find set with minimum cost:elements_added ratio
        for s, elements in subsets.items():
            new_elements = len(elements - covered)
            # set may have same elements as already covered -> new_elements = 0
            # check to avoid division by 0 error
            if new_elements != 0:
                cost_elem_ratio = costs[s] / new_elements
                if cost_elem_ratio < min_cost_elem_ratio:
                    min_cost_elem_ratio = cost_elem_ratio
                    min_set = s
        cover_sets.append(min_set)
        # union
        covered |= subsets[min_set]
    return cover_sets

In [76]:
def calculate_subset_demand(subsets, demands):
      total_demands = {}
      for subset, elements in subsets.items():
        cost = 0
        for e in elements:
          cost += demands[str(e)]
        
        total_demands[subset] = cost
        
      total_demands_recip = {}
      
      for k, v in total_demands.items():
        total_demands_recip[k] = 1/v
        
      return [total_demands, total_demands_recip]

In [149]:
total_demands, total_demands_recip = calculate_subset_demand(subsets, demands)

In [164]:
cover = greedy_set_cover(universe, subsets, total_demands_recip)

KeyboardInterrupt: 

In [165]:
len(cover)

905

In [151]:
cover_df = pd.DataFrame(cover)
cover_df.columns = ['cluster']
candidates_full_cols['cluster'] = candidates_full_cols['cluster'].astype('str')

In [157]:
clusters_df = pd.merge(cover_df, candidates_full_cols, how='left', on='cluster')
clusters_df

Unnamed: 0,cluster,cell_id,resident_table,base_dt,dow,ccw_cd,ccw_nm,adng_cd,adng_nm,cell_xcrd,cell_ycrd,gender,age,app_web,time_zone,count_cust
0,714,83943045,0,20220620,1,4146,용인시,41463550,서농동,127.075294,37.229260,MALE,6,EV라운지,6,7.439
1,714,83943045,0,20220620,1,4146,용인시,41463550,서농동,127.075294,37.229260,MALE,7,하이차저,5,3.802
2,714,83943045,0,20220613,1,4146,용인시,41463550,서농동,127.075294,37.229260,MALE,9,EV라운지,2,7.393
3,714,83943045,0,20220607,2,4146,용인시,41463550,서농동,127.075294,37.229260,MALE,10,EV라운지,5,10.836
4,714,83943045,0,20220625,6,4146,용인시,41463550,서농동,127.075294,37.229260,FEMALE,11,EV라운지,4,13.982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876667,1654,84087237,1,20220610,5,4146,용인시,41465530,신봉동,127.079865,37.319866,MALE,9,EV라운지,4,4.320
876668,1654,84087237,1,20220626,7,4146,용인시,41465530,신봉동,127.079865,37.319866,MALE,7,EV라운지,4,3.764
876669,1654,84087236,1,20220618,6,4146,용인시,41465530,신봉동,127.079865,37.319416,MALE,6,EV라운지,4,3.764
876670,1654,84087236,1,20220630,4,4146,용인시,41465530,신봉동,127.079865,37.319416,MALE,8,EV라운지,4,4.320


### Coverage

- 수요가 발생한 노드의 수: `len(df_k1000['cell_id'].unique())` = 18547
- 커버 가능한 노드의 수: `len(clusters_df['cell_id'].unique())` = 17976

커버리지 = 수요노드 / 커버노드 = 17976 / 18547 = 96.92%


In [167]:
demand_count = len(clusters_df['cell_id'].unique())
covered_count = len(df_k1000['cell_id'].unique())
print(covered_count) #커버 가능한 노드의 수
print(demand_count) #수요가 발생한 노드의 수

print(demand_count / covered_count * 100)


18547
17976
96.92133498679031


## Cluster의 중심 좌표 구하기

In [187]:
# cluster로 그룹화 후 개별 데이터프레임으로 쪼개기

grouped_clusters_df = clusters_df.groupby('cluster')
groups = [grouped_clusters_df.get_group(g).reset_index() for g in grouped_clusters_df.groups]

In [188]:
groups[1].head()

Unnamed: 0,index,cluster,cell_id,resident_table,base_dt,dow,ccw_cd,ccw_nm,adng_cd,adng_nm,cell_xcrd,cell_ycrd,gender,age,app_web,time_zone,count_cust
0,630008,1,87734799,0,20220629,3,4146,용인시,41461510,중앙동,127.20889,37.22561,FEMALE,12,EV라운지,2,4.1
1,630009,1,87734799,0,20220626,7,4146,용인시,41461510,중앙동,127.20889,37.22561,FEMALE,12,EV라운지,3,4.1
2,630010,1,87734799,0,20220601,3,4146,용인시,41461510,중앙동,127.20889,37.22561,FEMALE,12,EV라운지,5,8.2
3,630011,1,87734799,0,20220605,7,4146,용인시,41461510,중앙동,127.20889,37.22561,FEMALE,12,EV라운지,3,4.1
4,630012,1,87734799,0,20220619,7,4146,용인시,41461510,중앙동,127.20889,37.22561,FEMALE,12,EV라운지,5,8.2


In [198]:
for g in groups:
  area = 0
  cx = 0
  cy = 0
  
  for i, r in g.head().iterrows():
    j = (i + 1) % g.shape[0]
    
    p1 = g.iloc[i]
    p2 = g.iloc[j]
    
    x1 = p1['cell_xcrd']
    x2 = p2['cell_xcrd']
    y1 = p1['cell_ycrd']
    y2 = p2['cell_ycrd']
    
    area += x1 * y2
    area -= y1 * x2
    
    cx += ((x1 + x2) * ((x1 * y2) - (x2 * y1)))
    cy += ((y1 + y2) * ((x1 * y2) - (x2 * y1)))
  
  # print(cx, cy)
  area /= 2
  area = abs(area)

  cx = cx / (6.0 * area);
  cy = cy / (6.0 * area);
  
  print(cx, cy)

  cx = cx / (6.0 * area);
  cy = cy / (6.0 * area);


nan nan
nan nan
nan nan
nan nan
nan nan
-84.82747666666667 -24.778812333333335
nan nan
nan nan
nan nan
nan nan
84.87047833333334 24.77408733333333
nan nan
nan nan
nan nan
nan nan
nan nan
84.83821333333333 24.82616
nan nan
nan nan
nan nan
84.80231650490818 24.815082165431072
84.80922 24.840968
-84.88473109371569 -24.779271476487793
nan nan
nan nan
-84.81763466666666 -24.852254
nan nan
nan nan
-84.82418238388169 -24.78118294174996
-84.75211359897013 -24.834048741692943
-84.76514 -24.740792666666664
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
-84.7834980547968 -24.882946172860304
nan nan
nan nan
nan nan
nan nan
nan nan
84.78075724776255 24.821382786050837
nan nan
nan nan
nan nan
nan nan
84.8835851993564 24.840521872181924
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
nan nan
-84.87779833333333 -24.775150333333336
nan nan
nan nan
84.81841778681932 24.8932662

```
for (i = 0; i < points; i++)
{
	j = (i + 1) % points;

	pt1 = pPoly->GetPointAt(i);
	pt2 = pPoly->GetPointAt(j);

	x1 = pt1.x;
	x2 = pt2.x;
	y1 = pt1.y;
	y2 = pt2.y;

	area += x1 * y2;
	area -= y1 * x2;

	centerx += ((x1 + x2) * ((x1 * y2) - (x2 * y1)));
	centery += ((y1 + y2) * ((x1 * y2) - (x2 * y1)));
}

area /= 2.0;
area = fabs(area);

centerx = centerx / (6.0 * area);
centery = centery / (6.0 * area);
```