In [1]:
import numpy as np
import re
import pandas as pd
import pickle
from collections import Counter

In [2]:
# Load the two annotation tables: the 16-type labels (A_df) and the
# 5-dimension labels (B_df). The preprocessing cells further down read the
# same two files again before transforming them.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
A_df = pd.read_pickle("data/16types.pickle")
B_df = pd.read_pickle("data/5dimensions.pickle")

In [3]:
# Preview the table loaded from data/16types.pickle.
A_df

Unnamed: 0,say_id,reply_id,group_id,name,body,16types_a,16types_b
0,1,\N,1234568,まこぴす,よろしくお願いします！,挨拶,挨拶
1,31,\N,1234568,哲,よろしくお願いします,挨拶,挨拶
2,70,\N,1234568,仙波,名前なのが恥ずかしいです…_x000D_\nよろしくお願いします！,挨拶,挨拶
3,119,\N,1234568,まこぴす,早速課題やっちゃいましょう！,転換,転換
4,163,\N,1234568,仙波,やっちゃいましょう_x000D_\nmoodleはゴミです！,了承,提案
5,194,\N,1234568,まこぴす,使いにくいです(笑),提案,提案
6,302,\N,1234568,哲,同意です,了承,了承
7,309,\N,1234568,仙波,以前インタラクティブアート受講していたのですが、その時に課題が不具合で出せなくなっていた時期...,提案,提案
8,385,\N,1234568,まこぴす,学習環境として必要最低限の機能は備えていると思うが、操作性の面ではPCの最低限の知識があるこ...,提案,提案
9,426,\N,1234568,まこぴす,自分の意見です！(笑),提案,提案


In [4]:
# Preview the table loaded from data/5dimensions.pickle.
B_df

Unnamed: 0,say_id,reply_id,group_id,name,body,argument_a,argument_b,epistemic_a,epistemic_b,social_a,social_b,coordination_a,coordination_b
0,1,\N,1,まこぴす,よろしくお願いします！,Non-argumentative moves,Non-argumentative moves,Off task,Off Task,,,,
1,31,\N,1,哲,よろしくお願いします,Non-argumentative moves,Non-argumentative moves,Off task,Off Task,,,,
2,70,\N,1,仙波,名前なのが恥ずかしいです…\n\nよろしくお願いします！,Non-argumentative moves,Non-argumentative moves,Off task,Off Task,,,,
3,119,\N,1,まこぴす,早速課題やっちゃいましょう！,Non-argumentative moves,Non-argumentative moves,Off task,Off Task,,,Proceedings,Proceedings
4,163,\N,1,仙波,やっちゃいましょう\n\nmoodleはゴミです！,Simple claim,Simple Claim,On task,On Task,Externalization,,,
5,194,\N,1,まこぴす,使いにくいです(笑),Simple claim,Simple Claim,On task,On Task,Externalization,,,
6,302,\N,1,哲,同意です,Simple claim,Simple Claim,On task,On Task,Quick consensus building,Quick consensus building,,
7,309,\N,1,仙波,以前インタラクティブアート受講していたのですが、その時に課題が不具合で出せなくなっていた時期...,Grounded claim,Grounded Claim,On task,On Task,Externalization,Externalization,,
8,385,\N,1,まこぴす,学習環境として必要最低限の機能は備えていると思うが、操作性の面ではPCの最低限の知識があるこ...,Grounded and Qualified claim,Grounded and Qualified claim,On task,On Task,Externalization,Externalization,,
9,426,\N,1,まこぴす,自分の意見です！(笑),Non-argumentative moves,Simple Claim,Off task,On Task,,Externalization,,


# A_dfのラベルを番号に変換する + マージするための簡単な前処理


* ノイズ (Noise) => 0
* 了承 (Agreement) => 1
* 提案 (Proposal) => 2
* 質問 (Question) => 3
* 報告 (Report) => 4
* 挨拶 (Greeting) => 5
* 回答 (Reply) => 6
* メタ (Outside comments) => 7
* 確認 (Confirmation) => 8
* 感謝 (Gratitude) => 9
* 愚痴 (Complaint) => 10
* 依頼 (Request) => 11
* 訂正 (Correction) => 12
* 不同意 (Disagreement) => 13
* 転換 (Switchover) => 14
* ジョーク (Joke) => 15

In [5]:
# Start again from the file on disk so this cell can be re-run on its own.
A_df = pd.read_pickle("data/16types.pickle")
col_A = ['16types_a', '16types_b']

# Label distribution of both annotation columns before encoding.
for c in col_A:
    print(c, ":", Counter(A_df[c]), '\n')

# Label -> integer code; the code of a label is its position in this list.
labels_A = [
    'ノイズ',
    '了承',
    '提案',
    '質問',
    '報告',
    '挨拶',
    '回答',
    'メタ',
    '確認',
    '感謝',
    '愚痴',
    '依頼',
    '訂正',
    '不同意',
    '転換',
    'ジョーク',
]
dict_A = {label: code for code, label in enumerate(labels_A)}

# Swap every label for its code, restricted to the two annotation columns.
A_df = A_df.replace({c: dict_A for c in col_A})

# Label distribution again, now expressed in codes.
for c in col_A:
    print(c, ":", Counter(A_df[c]), '\n')

# Rewrite the '\N' marker in reply_id as '-1'.
d_reply = {
    '\\N': '-1',
}
A_df = A_df.replace({'reply_id': d_reply})
print(A_df.values[0])

# Normalise the id columns to strings. Going through int() first also drops
# any whitespace surrounding the digits.
for id_col in ('say_id', 'reply_id', 'group_id'):
    A_df[id_col] = A_df[id_col].apply(lambda raw: str(int(raw)))

16types_a : Counter({'了承': 2361, '提案': 1844, '回答': 1250, '質問': 1222, '報告': 1188, '挨拶': 1158, '確認': 544, '感謝': 492, 'メタ': 376, '転換': 304, 'ジョーク': 215, '依頼': 165, '愚痴': 122, 'ノイズ': 119, '訂正': 97, '不同意': 56}) 

16types_b : Counter({'了承': 2682, '提案': 1920, '報告': 1212, '挨拶': 1197, '質問': 1177, '回答': 1081, 'メタ': 675, '確認': 405, '転換': 321, 'ジョーク': 219, '依頼': 191, '感謝': 179, '訂正': 107, '不同意': 90, '愚痴': 33, 'ノイズ': 24}) 

16types_a : Counter({1: 2361, 2: 1844, 6: 1250, 3: 1222, 4: 1188, 5: 1158, 8: 544, 9: 492, 7: 376, 14: 304, 15: 215, 11: 165, 10: 122, 0: 119, 12: 97, 13: 56}) 

16types_b : Counter({1: 2682, 2: 1920, 4: 1212, 5: 1197, 3: 1177, 6: 1081, 7: 675, 8: 405, 14: 321, 15: 219, 11: 191, 9: 179, 12: 107, 13: 90, 10: 33, 0: 24}) 

['1' '-1' '1234568' 'まこぴす' 'よろしくお願いします！' 5 5]


# B_dfのラベルを番号に変換する + マージするための簡単な前処理

- argument
    - nan => 20
    - non-argumentative moves => 21
    - simple claim => 22
    - grounded claim => 23
    - qualified claim => 24
    - grounded and qualified claim => 25
- epistemic
    - nan => 30
    - off task => 31
    - on task => 32
    - no sense => 33
- social
    - nan => 40
    - externalization => 41
    - quick consensus building => 42
    - elicitation => 43
    - integration-oriented consensus building => 44
    - conflict-oriented consensus building => 45
    - summary => 46
- coordination
    - nan => 50
    - technical coordination => 51
    - proceedings => 52
    - task division => 53
    - time management => 54
    - quote => 55
    
    
    
### !!!!!!socialのスペルミス　＝＞　integra[[[r]]]ion-oriented consensus building

In [6]:
def _show_label_counts(frame, columns):
    """Print a Counter of the values found in each of `columns`, between two rule lines."""
    print("---------------------------------------------------------")
    for column in columns:
        print(column, ":", Counter(frame[column]), '\n')
    print("---------------------------------------------------------")


# Start again from the file on disk so this cell can be re-run on its own.
B_df = pd.read_pickle("data/5dimensions.pickle")
col_B = [
    'argument_a', 'argument_b',
    'epistemic_a', 'epistemic_b',
    'social_a', 'social_b',
    'coordination_a', 'coordination_b',
]

# Replace every missing value in the frame with the placeholder string "NAN"
# (it becomes 'nan' in the label columns once they are lower-cased below).
B_df = B_df.fillna("NAN")

_show_label_counts(B_df, col_B)

# Lower-case the labels so differently capitalised variants collapse together.
for c in col_B:
    B_df[c] = B_df[c].str.lower()

# Correct a misspelled label in the two social columns.
dict_mis = {
    'integrarion-oriented consensus building':
    'integration-oriented consensus building',
}
B_df = B_df.replace({'social_a': dict_mis, 'social_b': dict_mis})

_show_label_counts(B_df, col_B)

# Label -> integer code, one table per dimension. Each dimension has its own
# block of consecutive codes, and 'nan' takes the first code of the block.
dict_B_arg = {
    label: code
    for code, label in enumerate([
        'nan',
        'non-argumentative moves',
        'simple claim',
        'grounded claim',
        'qualified claim',
        'grounded and qualified claim',
    ], start=20)
}
dict_B_epi = {
    label: code
    for code, label in enumerate([
        'nan',
        'off task',
        'on task',
        'no sense',
    ], start=30)
}
dict_B_soc = {
    label: code
    for code, label in enumerate([
        'nan',
        'externalization',
        'quick consensus building',
        'elicitation',
        'integration-oriented consensus building',
        'conflict-oriented consensus building',
        'summary',
    ], start=40)
}
dict_B_coo = {
    label: code
    for code, label in enumerate([
        'nan',
        'technical coordination',
        'proceedings',
        'task division',
        'time management',
        'quote',
    ], start=50)
}

# One table per entry of col_B: the _a and _b column of a dimension share a table.
dict_temp = [
    table
    for table in (dict_B_arg, dict_B_epi, dict_B_soc, dict_B_coo)
    for _ in range(2)
]

# Swap every label for its code, column by column.
for c, d in zip(col_B, dict_temp):
    B_df = B_df.replace({c: d})

_show_label_counts(B_df, col_B)

# Rewrite the '\N' marker in reply_id as '-1'.
d_reply = {
    '\\N': '-1',
}
B_df = B_df.replace({'reply_id': d_reply})

# Normalise the id columns to strings. Going through int() first also drops
# any whitespace surrounding the digits, so no separate strip step is needed.
for id_col in ('say_id', 'reply_id', 'group_id'):
    B_df[id_col] = B_df[id_col].apply(lambda raw: str(int(raw)))

print(B_df.values[0])



---------------------------------------------------------
argument_a : Counter({'Non-argumentative moves': 4504, 'non-argumentative moves': 1300, 'Simple claim': 1230, 'Simple Claim': 1229, 'simple claim': 745, 'NAN': 299, 'Grounded Claim': 230, 'Grounded claim': 180, 'grounded claim': 99, 'Qualified Claim': 65, 'Qualified claim': 37, 'Grounded and Qualified claim': 21, 'qualified claim': 18, 'grounded and qualified claim': 2}) 

argument_b : Counter({'Non-argumentative moves': 5567, 'Simple Claim': 3501, 'NAN': 434, 'Grounded Claim': 381, 'Qualified Claim': 61, 'Grounded and Qualified claim': 14, 'Grounded claim': 1}) 

epistemic_a : Counter({'Off Task': 2071, 'Off task': 1998, 'On Task': 1764, 'On task': 1683, 'off task': 1175, 'on task': 968, 'No Sense': 161, 'no sense': 106, 'No sense': 32, 'NAN': 1}) 

epistemic_b : Counter({'Off Task': 5223, 'On Task': 4409, 'No Sense': 323, 'NAN': 4}) 

social_a : Counter({'NAN': 5710, 'Externalization': 1468, 'Quick consensus building': 948, 'e

# 16typesと5dimensionsのマージ

In [7]:
# Sanity check before merging: for each of the first 10 rows of A_df, count how
# many rows of B_df agree with it on columns 0, 1 and 3.

print(A_df.shape)
print(B_df.shape)

match_counts = [
    sum(
        1
        for row_b in B_df.values
        if row_a[0] == row_b[0] and row_a[1] == row_b[1] and row_a[3] == row_b[3]
    )
    for row_a in A_df.values[:10]
]

# Tally the sampled rows by number of partners found: none, exactly one, several.
sum_0 = sum(1 for n_matches in match_counts if n_matches == 0)
sum_1 = sum(1 for n_matches in match_counts if n_matches == 1)
sum_2 = sum(1 for n_matches in match_counts if n_matches >= 2)

print("A[x]はBに存在しない => ",sum_0)
print("A[x]はBに１つだけ同じの要素が存在する => ",sum_1)
print("A[x]がBに複数同じの要素が存在する => ",sum_2)


(11513, 7)
(9959, 13)
A[x]はBに存在しない =>  0
A[x]はBに１つだけ同じの要素が存在する =>  10
A[x]がBに複数同じの要素が存在する =>  0


# マージ作業

In [8]:
# Attach the B_df label columns to every row of A_df.
#
# A row of A_df is paired with a row of B_df when columns 0, 1 and 3 agree; the
# other leading columns of B_df are not compared. An A row with several
# partners yields one output row per partner, and an A row with no partner is
# kept and given the default codes below.
#
# This is a single left join. It replaces a Python loop over every (A row,
# B row) pair that also re-copied the whole result array for each appended row.
C_col = [
    'say_id', 'reply_id', 'group_id', 'name', 'body', '16types_a', '16types_b',
    'argument_a', 'argument_b', 'epistemic_a', 'epistemic_b', 'social_a',
    'social_b', 'coordination_a', 'coordination_b'
]
merge_keys = ['say_id', 'reply_id', 'name']  # columns 0, 1 and 3 of both frames
label_cols_B = C_col[7:]

# Codes given to the B label columns of an A row that has no partner in B.
null_codes = [20, 20, 30, 30, 40, 40, 50, 50]

# Select by position so the pairing does not depend on the incoming column
# names. Assigning the names raises if a frame does not have the expected
# number of columns.
left = A_df.copy()
left.columns = C_col[:7]
right = B_df.iloc[:, [0, 1, 3] + list(range(5, B_df.shape[1]))].copy()
right.columns = merge_keys + label_cols_B
# Object dtype stops the missing values created by the join from turning
# integer codes into floats.
right[label_cols_B] = right[label_cols_B].astype(object)

merged = left.merge(right, on=merge_keys, how='left', indicator=True)

# Rows that exist only in A get the default codes.
unmatched = merged['_merge'] == 'left_only'
for label_col, code in zip(label_cols_B, null_codes):
    merged.loc[unmatched, label_col] = code

# Every column is stored as object dtype so the saved table keeps the same
# dtypes as before.
C_df = merged.drop(columns='_merge')[C_col].astype(object)

# Array view of the merged table, kept under its previous name.
C = C_df.to_numpy()

In [9]:
# Preview the merged table.
C_df

Unnamed: 0,say_id,reply_id,group_id,name,body,16types_a,16types_b,argument_a,argument_b,epistemic_a,epistemic_b,social_a,social_b,coordination_a,coordination_b
0,1,-1,1234568,まこぴす,よろしくお願いします！,5,5,21,21,31,31,40,40,50,50
1,31,-1,1234568,哲,よろしくお願いします,5,5,21,21,31,31,40,40,50,50
2,70,-1,1234568,仙波,名前なのが恥ずかしいです…_x000D_\nよろしくお願いします！,5,5,21,21,31,31,40,40,50,50
3,119,-1,1234568,まこぴす,早速課題やっちゃいましょう！,14,14,21,21,31,31,40,40,52,52
4,163,-1,1234568,仙波,やっちゃいましょう_x000D_\nmoodleはゴミです！,1,2,22,22,32,32,41,40,50,50
5,194,-1,1234568,まこぴす,使いにくいです(笑),2,2,22,22,32,32,41,40,50,50
6,302,-1,1234568,哲,同意です,1,1,22,22,32,32,42,42,50,50
7,309,-1,1234568,仙波,以前インタラクティブアート受講していたのですが、その時に課題が不具合で出せなくなっていた時期...,2,2,23,23,32,32,41,41,50,50
8,385,-1,1234568,まこぴす,学習環境として必要最低限の機能は備えていると思うが、操作性の面ではPCの最低限の知識があるこ...,2,2,25,25,32,32,41,41,50,50
9,426,-1,1234568,まこぴす,自分の意見です！(笑),2,2,21,22,31,32,40,41,50,50


In [10]:
# Save the merged table as an Excel workbook and as a pickle.
#
# The xlsxwriter option 'strings_to_urls': False is passed through
# `engine_kwargs`, and the workbook is written when the `with` block exits.
# The `options=` argument and `ExcelWriter.save()` are not available in
# pandas 2.x; `engine_kwargs` needs pandas 1.3 or newer.
with pd.ExcelWriter(
        "data/all.xlsx",
        engine='xlsxwriter',
        engine_kwargs={'options': {'strings_to_urls': False}},
) as excel_writer:
    C_df.to_excel(excel_writer, sheet_name="sheet1")
C_df.to_pickle("data/all.pickle")