# Import Packages

In [67]:
import pulp, json
import networkx as nx
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt

# Read Both Voice and SMS Records

In [75]:
# Load the raw communication records from the JSON export.
# The context manager closes the file handle as soon as parsing finishes.
with open("input/json-intra-main-group-of-8W05D6H20.JSON.json") as fp:
    data = json.load(fp)

# Build the record table and parse the "Time" column into timestamps.
# Parsing is kept element-wise so every value is converted independently.
df = pd.DataFrame(data['content_in_role'])
df["Time"] = df["Time"].apply(pd.to_datetime)

# Text Data

In [76]:
# Select the SMS rows, keeping only the columns used downstream.
text_columns = ["nr", "Time", "ID_From", "ID_Received", "Type"]
is_sms = df["Type"] == "SMS"
df_text = df.loc[is_sms, text_columns]
print(f"Before: # of Text: {len(df_text)}")

# Two messages count as duplicates when timestamp, sender and receiver all
# match; only the first occurrence of each is kept.
duplicated_idx = df_text.duplicated(subset=["Time", "ID_From", "ID_Received"])
df_text = df_text.loc[~duplicated_idx]
print(f"After: # of Text: {len(df_text)}")
df_text.head(1)

Before: # of Text: 604
After: # of Text: 562


Unnamed: 0,nr,Time,ID_From,ID_Received,Type
6,7,2014-07-06 12:29:00,MEQWEGGDW,MEGGGXPKG,SMS


# Voice Data

In [79]:
# Handle the "Voice" records separately from the "SMS" records.
# .copy() makes df_voice an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
df_voice = df[df["Type"] == "Voice"][["nr", "Time", "ID_From", "ID_Received", "Duration_in_second", "Type"]].copy()
df_voice["End_Time"] = df_voice["Time"] + pd.to_timedelta(df_voice["Duration_in_second"], unit='s')
df_voice["overlap_id"] = pd.NA

# Graph whose nodes are call ids ("nr"); calls that overlap in time get connected.
G = nx.Graph()

# sort=False visits the (ID_From, ID_Received) pairs in order of first appearance.
for _, sub_voice in df_voice.groupby(["ID_From", "ID_Received"], sort=False):
    intervals = pd.IntervalIndex.from_arrays(
        sub_voice["Time"], sub_voice["End_Time"], closed='both'
    )

    # Group calls with overlapping time intervals within the same
    # (ID_From, ID_Received) pair.
    for interval in intervals:
        nr = sub_voice.loc[intervals.overlaps(interval), "nr"].to_list()
        # Register every matched call as a node, then chain consecutive calls
        # so the whole overlapping set lands in one connected component.
        G.add_nodes_from(nr)
        G.add_edges_from(zip(nr, nr[1:]))

# Label each connected component with its own overlap_id, then keep only the
# call with the latest End_Time in each overlapped group.
for i, component in enumerate(nx.connected_components(G)):
    df_voice.loc[df_voice["nr"].isin(component), "overlap_id"] = i
df_voice = df_voice.sort_values('End_Time').groupby(["overlap_id"]).tail(1)
print(f"# of Voice: {len(df_voice)}")
df_voice.head(1)

# of Voice: 3561


Unnamed: 0,nr,Time,ID_From,ID_Received,Duration_in_second,Type,End_Time,overlap_id
98,99,2014-07-01 15:25:00,MEQWEGGDW,MEGGGXPKG,228.0,Voice,2014-07-01 15:28:48,37


# Export Data

In [81]:
# Stack the voice and text records into one table and write it to CSV,
# leaving the DataFrame index out of the file.
df = pd.concat(objs=[df_voice, df_text])
df.to_csv(path_or_buf="input/input.csv", index=False)