In [20]:
from enum import Enum

import requests
import json
from tqdm import tqdm
import pandas as pd

In [9]:
TOKEN = "xoxb-659631020560-2531853286371-Hk9oLTKC9rAzq6zzDaBZD4W3"


def parse_json(dic, prefixes=[]):
    new_dic = {}
    for key, value in dic.items():
        if type(value) == list:
            value = {str(i): v for i, v in enumerate(value)}
            
        if type(value) == dict:
            new_dic.update(parse_json(value, prefixes + [key]))
        else:
            new_dic[f"{'.'.join(prefixes + [key])}"] = value
    return new_dic


def write_to_jsonl(file_path, lis: list):
    with open(file_path, 'w', encoding='utf8') as f:
        for dic in lis:
            json.dump(dic, f, ensure_ascii=False)
            f.write('\n')


class RequestType(Enum):
    CHANNELS = 1
    HISTORY = 2
    

def request_slack_api(request_type: RequestType, 
                      payload: dict = {},
                     ):
    if request_type == RequestType.CHANNELS:
        url = "https://slack.com/api/conversations.list"
        payload["limit"] = "1000"
    elif request_type == RequestType.HISTORY:
        url = "https://slack.com/api/conversations.history"
        
    header = {
        "Authorization": "Bearer {}".format(TOKEN)
    }

    res = requests.get(url, headers=header, params=payload)
    return res.json()


def retrieve_slack_api(request_type: RequestType, 
                       payload: dict = {},
                      ):
    res = []
    res.append(request_slack_api(request_type, payload))
    if not res[-1]['ok']:
        print(res)
        return []
#         raise RuntimeError
    has_more = res[-1]['has_more']
    while has_more:
        payload['cursor'] = res[-1]['response_metadata']['next_cursor']
        res.append(request_slack_api(request_type, payload))
        has_more = res[-1]['has_more']
        
    return res


def retrieve_channel_history(channel_id):
    messages = []
    payload = {
        "channel" : channel_id,
        "limit": 200,
    }
    
    res_list = retrieve_slack_api(RequestType.HISTORY, payload)
    [messages.extend(res['messages']) for res in res_list if res]
    return messages


def parse_channels(res: dict):
    flatten_df = pd.concat([pd.Series(parse_json(dic)) for dic in res['channels']], axis=1).T
    return flatten_df


def parse_messages(messages: list):        
    return pd.concat([pd.Series(parse_json(m)) for m in messages], axis=1).T

## conversations.list

In [21]:
res = request_slack_api(RequestType.CHANNELS)
channel_df = parse_channels(res)
channel_df = channel_df.applymap(lambda x: x.replace('\n', '') if type(x) == str else x)
channel_df.to_csv('../export/channel_flatten.csv', index=False)
channel_df

Unnamed: 0,id,name,is_channel,is_group,is_im,created,is_archived,is_general,unlinked,name_normalized,...,topic.creator,topic.last_set,purpose.value,purpose.creator,purpose.last_set,num_members,previous_names.0,previous_names.1,previous_names.2,previous_names.3
0,CK26510JW,general,True,False,False,1560071290,False,True,0,general,...,UKDJ78A4V,1560071290,このチャンネルはワークスペース全体のコミュニケーションと社内アナウンス用です。全メンバーがこ...,UKDJ78A4V,1560071290,39,,,,
1,CMF34A3AP,gunosy,True,False,False,1566965117,True,False,0,gunosy,...,,0,,,0,0,,,,
2,CMS1S0TD2,room-tetsu,True,False,False,1566962437,False,False,0,room-tetsu,...,UKDJ78A4V,1572318831,今日やることを投稿していく〜〜,UKDJ78A4V,1566962438,33,tetsu-room,todo,,
3,CNMAQRN91,2_tech,True,False,False,1570011716,False,False,0,2_tech,...,,0,技術的なことをシェアする,UKDJ78A4V,1570011716,35,20_tech,2_tech,tech,
4,CNMB58BM1,room-sam,True,False,False,1570012645,False,False,0,room-sam,...,,0,,,0,10,sam-room,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,C02AVNKNFQD,room-hong,True,False,False,1628744605,False,False,0,room-hong,...,,0,,,0,10,,,,
102,C02B4PBPQA0,room-kiyo,True,False,False,1628737238,False,False,0,room-kiyo,...,,0,,,0,11,,,,
103,C02DW7XSRRA,room-yamada,True,False,False,1631071765,False,False,0,room-yamada,...,,0,,,0,10,,,,
104,C02E57NJU6N,room-mazzini,True,False,False,1631441343,False,False,0,room-mazzini,...,,0,,,0,11,room_mazzini,,,


In [22]:
write_to_jsonl('../export/channels.jsonl', res['channels'])

## conversations.history

In [12]:
channel_id = 'C01UN98V1F0'
messages = retrieve_channel_history(channel_id)
message_df = parse_messages(messages)
message_df

Unnamed: 0,type,subtype,ts,user,text,inviter,client_msg_id,team,blocks.0.type,blocks.0.block_id,...,attachments.0.channel_id,attachments.0.channel_name,attachments.0.is_msg_unfurl,attachments.0.mrkdwn_in.0,topic,reactions.3.name,reactions.3.users.0,reactions.3.count,reactions.1.users.1,reactions.1.users.2
0,message,channel_join,1632558873.077300,U02FMR38EAX,<@U02FMR38EAX>さんがチャンネルに参加しました,U01HQVDCC7K,,,,,...,,,,,,,,,,
1,message,,1632541899.075100,U01HQVDCC7K,python使ってmyslackchannelのメッセージを全部抜くスレッド,,dd989b1a-9aa7-4244-8fa0-7591e9ed0fea,TKDJK0LGG,rich_text,pyU,...,,,,,,,,,,
2,message,,1632530581.072200,UKDJ78A4V,データソースの置き場所を複数（S3、GCS）試してみるとかはありな気がする,,5e5b89e6-7613-4183-8c83-e24b7c1761f6,TKDJK0LGG,rich_text,hB3=O,...,,,,,,,,,,
3,message,,1632530453.071000,U01HQVDCC7K,競馬AIもっかい作るのもあり。データ基盤の練習にもなるし。,,4A239DBA-2A57-4265-9DDC-E3E893AC66A1,TKDJK0LGG,rich_text,Mz=,...,,,,,,,,,,
4,message,,1632524902.068800,U01HQVDCC7K,あとやっぱり自分のヘルスケアデータのデータ基盤作成やりたいな。やろう。node js の勉強...,,90F52864-B582-4DF3-96A3-145A404574BC,TKDJK0LGG,rich_text,r3/1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,message,channel_topic,1626932345.007000,U01HQVDCC7K,"チャンネルのトピックを設定 : MLDL1, Software/Sensor4, UIUX2...",,,,,,...,,,,,"MLDL1, Software/Sensor4, UIUX2, PM/PdM1, その他2ぐ...",,,,,
100,message,,1626932281.006600,U01HQVDCC7K,"やっとouraのデータを可視化できた。\nvue, promise, chartjs, fi...",,,,rich_text,K3C,...,,,,,,soreiine,USUM12RHN,1,,
101,message,,1626858993.002100,U01HHBP3QRH,もしかして自分のチームくる？,,c25960d8-f0c3-40d5-bb26-0cde15a85258,TKDJK0LGG,rich_text,JPTk,...,,,,,,,,,,
102,message,channel_join,1626858983.001800,U01HHBP3QRH,<@U01HHBP3QRH>さんがチャンネルに参加しました,,,,,,...,,,,,,,,,,


In [13]:
import json

history_df = []
all_messages = []
for channel_id in tqdm(channel_df['id'].tolist(), total=len(channel_df)):
    messages = retrieve_channel_history(channel_id)
    if not messages:
        continue

    for message in messages:
        message['channel_id'] = channel_id

    all_messages.extend(messages)
    message_df = parse_messages(messages)
    history_df.append(message_df)

history_df = pd.concat(history_df).reset_index(drop=True)
history_df.applymap(lambda x: x.replace('\n', '') if type(x) == str else x).to_csv('../export/history_flatten.csv', index=False)
history_df.shape

  2%| | 2/106 [00:00<00:38,  2.7

[{'ok': False, 'error': 'not_in_channel'}]


  8%| | 8/106 [00:08<01:19,  1.2

[{'ok': False, 'error': 'not_in_channel'}]


  8%| | 9/106 [00:08<01:05,  1.4

[{'ok': False, 'error': 'not_in_channel'}]


  9%| | 10/106 [00:08<00:55,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 10%| | 11/106 [00:09<00:48,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 14%|▏| 15/106 [00:12<01:03,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 16%|▏| 17/106 [00:13<00:47,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 17%|▏| 18/106 [00:13<00:42,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 18%|▏| 19/106 [00:13<00:38,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 19%|▏| 20/106 [00:14<00:35,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 23%|▏| 24/106 [00:15<00:31,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 25%|▎| 27/106 [00:16<00:30,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 27%|▎| 29/106 [00:26<02:45,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 28%|▎| 30/106 [00:26<02:02,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 29%|▎| 31/106 [00:26<01:32,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 30%|▎| 32/106 [00:27<01:11,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 31%|▎| 33/106 [00:27<00:57,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 34%|▎| 36/106 [00:29<00:42,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 35%|▎| 37/106 [00:29<00:36,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 36%|▎| 38/106 [00:29<00:32,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 37%|▎| 39/106 [00:30<00:30,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 40%|▍| 42/106 [00:32<00:34,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 41%|▍| 43/106 [00:32<00:30,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 42%|▍| 45/106 [00:33<00:31,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 44%|▍| 47/106 [00:34<00:25,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 46%|▍| 49/106 [00:35<00:23,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 47%|▍| 50/106 [00:35<00:22,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 49%|▍| 52/106 [00:36<00:21,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 50%|▌| 53/106 [00:36<00:20,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 53%|▌| 56/106 [00:38<00:27,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 56%|▌| 59/106 [00:39<00:20,  2.

[{'ok': False, 'error': 'not_in_channel'}]


 58%|▌| 61/106 [00:41<00:26,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 58%|▌| 62/106 [00:41<00:23,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 60%|▌| 64/106 [00:46<00:55,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 61%|▌| 65/106 [00:47<00:42,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 73%|▋| 77/106 [00:56<00:15,  1.

[{'ok': False, 'error': 'not_in_channel'}]


 77%|▊| 82/106 [00:58<00:09,  2.

[{'ok': False, 'error': 'not_in_channel'}]


100%|█| 106/106 [01:09<00:00,  1


(8710, 1260)

In [14]:
write_to_jsonl('../export/history.jsonl', all_messages)

## conversations.replies

In [23]:
def retrieve_channel_replies(history_df, channel_id):
    ts_list = history_df[history_df['channel_id'] == channel_id]['ts'].tolist()
    
    messages = []
    for ts in tqdm(ts_list):
        payload = {
            "channel" : channel_id,
            "ts": ts,
            "limit": 1000,
        }

        res_list = retrieve_slack_api(RequestType.HISTORY, payload)
        [messages.extend(res['messages']) for res in res_list if res]
    return messages

In [26]:
reply_df = []
all_replies = []
for channel_id in tqdm(channel_df['id'].tolist(), total=len(channel_df)):
    replies = retrieve_channel_replies(history_df, channel_id)
    if not replies:
        continue

    for reply in replies:
        reply['channel_id'] = channel_id

    all_replies.extend(replies)

reply_df = pd.concat([parse_messages(reply) for reply in all_replies]).reset_index(drop=True)
reply_df.applymap(lambda x: x.replace('\n', '') if type(x) == str else x).to_csv('../export/reply_flatten.csv', index=False)
reply_df.shape

  0%|   | 0/106 [00:00<?, ?it/s]
  0%|     | 0/2 [00:00<?, ?it/s][A
 50%|▌| 1/2 [00:00<00:00,  2.71i[A
100%|█| 2/2 [00:00<00:00,  2.68i[A
  1%| | 1/106 [00:00<01:18,  1.3
0it [00:00, ?it/s][A

  0%|   | 0/600 [00:00<?, ?it/s][A
  0%| | 1/600 [00:01<11:43,  1.1[A
  0%| | 2/600 [00:02<11:58,  1.2[A
  0%| | 3/600 [00:03<12:02,  1.2[A
  1%| | 4/600 [00:04<12:09,  1.2[A
  1%| | 5/600 [00:06<12:03,  1.2[A
  1%| | 6/600 [00:07<12:00,  1.2[A
  1%| | 7/600 [00:08<11:58,  1.2[A
  1%| | 8/600 [00:09<12:41,  1.2[A
  2%| | 9/600 [00:11<12:30,  1.2[A
  2%| | 10/600 [00:12<12:22,  1.[A
  2%| | 11/600 [00:13<12:16,  1.[A
  2%| | 12/600 [00:14<12:20,  1.[A
  2%| | 13/600 [00:16<12:14,  1.[A
  2%| | 14/600 [00:17<12:22,  1.[A
  2%| | 15/600 [00:18<12:18,  1.[A
  3%| | 16/600 [00:20<12:52,  1.[A
  3%| | 17/600 [00:21<13:43,  1.[A
  3%| | 18/600 [00:23<13:46,  1.[A
  3%| | 19/600 [00:24<13:00,  1.[A
  3%| | 20/600 [00:25<12:47,  1.[A
  4%| | 21/600 [00:26<12:32,  1.[A
  4%| | 22/

 37%|▎| 222/600 [04:41<07:52,  1[A
 37%|▎| 223/600 [04:43<07:49,  1[A
 37%|▎| 224/600 [04:44<07:52,  1[A
 38%|▍| 225/600 [04:45<07:50,  1[A
 38%|▍| 226/600 [04:46<07:48,  1[A
 38%|▍| 227/600 [04:48<07:45,  1[A
 38%|▍| 228/600 [04:49<07:43,  1[A
 38%|▍| 229/600 [04:50<07:37,  1[A
 38%|▍| 230/600 [04:51<07:32,  1[A
 38%|▍| 231/600 [04:52<07:33,  1[A
 39%|▍| 232/600 [04:54<07:33,  1[A
 39%|▍| 233/600 [04:55<07:33,  1[A
 39%|▍| 234/600 [04:56<07:31,  1[A
 39%|▍| 235/600 [04:57<07:30,  1[A
 39%|▍| 236/600 [04:59<07:25,  1[A
 40%|▍| 237/600 [05:00<07:26,  1[A
 40%|▍| 238/600 [05:01<07:25,  1[A
 40%|▍| 239/600 [05:02<07:24,  1[A
 40%|▍| 240/600 [05:04<07:28,  1[A
 40%|▍| 241/600 [05:05<07:20,  1[A
 40%|▍| 242/600 [05:06<07:22,  1[A
 40%|▍| 243/600 [05:07<07:29,  1[A
 41%|▍| 244/600 [05:09<07:28,  1[A
 41%|▍| 245/600 [05:10<07:28,  1[A
 41%|▍| 246/600 [05:11<07:21,  1[A
 41%|▍| 247/600 [05:12<07:23,  1[A
 41%|▍| 248/600 [05:14<07:18,  1[A
 42%|▍| 249/600 [05:17<10:45

 75%|▋| 449/600 [09:37<03:10,  1[A
 75%|▊| 450/600 [09:39<03:08,  1[A
 75%|▊| 451/600 [09:41<03:41,  1[A
 75%|▊| 452/600 [09:42<03:30,  1[A
 76%|▊| 453/600 [09:43<03:21,  1[A
 76%|▊| 454/600 [09:44<03:14,  1[A
 76%|▊| 455/600 [09:46<03:09,  1[A
 76%|▊| 456/600 [09:47<03:05,  1[A
 76%|▊| 457/600 [09:48<03:02,  1[A
 76%|▊| 458/600 [09:49<02:58,  1[A
 76%|▊| 459/600 [09:51<02:58,  1[A
 77%|▊| 460/600 [09:52<02:59,  1[A
 77%|▊| 461/600 [09:53<02:56,  1[A
 77%|▊| 462/600 [09:55<02:56,  1[A
 77%|▊| 463/600 [09:56<02:52,  1[A
 77%|▊| 464/600 [09:57<02:54,  1[A
 78%|▊| 465/600 [09:58<02:54,  1[A
 78%|▊| 466/600 [10:00<02:50,  1[A
 78%|▊| 467/600 [10:01<02:47,  1[A
 78%|▊| 468/600 [10:02<02:46,  1[A
 78%|▊| 469/600 [10:03<02:46,  1[A
 78%|▊| 470/600 [10:05<02:44,  1[A
 78%|▊| 471/600 [10:06<02:41,  1[A
 79%|▊| 472/600 [10:07<02:46,  1[A
 79%|▊| 473/600 [10:08<02:40,  1[A
 79%|▊| 474/600 [10:10<02:38,  1[A
 79%|▊| 475/600 [10:11<02:34,  1[A
 79%|▊| 476/600 [10:12<02:34

 36%|▎| 74/205 [01:17<02:14,  1.[A
 37%|▎| 75/205 [01:18<02:12,  1.[A
 37%|▎| 76/205 [01:19<02:11,  1.[A
 38%|▍| 77/205 [01:20<02:09,  1.[A
 38%|▍| 78/205 [01:21<02:10,  1.[A
 39%|▍| 79/205 [01:22<02:09,  1.[A
 39%|▍| 80/205 [01:23<02:07,  1.[A
 40%|▍| 81/205 [01:24<02:06,  1.[A
 40%|▍| 82/205 [01:25<02:06,  1.[A
 40%|▍| 83/205 [01:26<02:05,  1.[A
 41%|▍| 84/205 [01:27<02:03,  1.[A
 41%|▍| 85/205 [01:28<02:02,  1.[A
 42%|▍| 86/205 [01:29<02:01,  1.[A
 42%|▍| 87/205 [01:30<01:59,  1.[A
 43%|▍| 88/205 [01:31<02:00,  1.[A
 43%|▍| 89/205 [01:32<02:06,  1.[A
 44%|▍| 90/205 [01:33<02:09,  1.[A
 44%|▍| 91/205 [01:35<02:08,  1.[A
 45%|▍| 92/205 [01:36<02:03,  1.[A
 45%|▍| 93/205 [01:37<01:59,  1.[A
 46%|▍| 94/205 [01:38<01:56,  1.[A
 46%|▍| 95/205 [01:39<01:54,  1.[A
 47%|▍| 96/205 [01:40<01:54,  1.[A
 47%|▍| 97/205 [01:41<01:52,  1.[A
 48%|▍| 98/205 [01:42<01:52,  1.[A
 48%|▍| 99/205 [01:43<01:50,  1.[A
 49%|▍| 100/205 [01:44<01:48,  1[A
 49%|▍| 101/205 [01:45<01:46

KeyboardInterrupt: 

In [17]:
reply_df = parse_messages(all_replies)
reply_df.applymap(lambda x: x.replace('\n', '') if type(x) == str else x).to_csv('../export/reply_flatten.csv', index=False)
reply_df.shape

(4, 8)

In [18]:
reply_df.applymap(lambda x: x.replace('\n', '') if type(x) == str else x).to_csv('../export/reply_flatten.csv', index=False)

In [27]:
write_to_jsonl('../export/reply.jsonl', all_replies)