In [3]:
# -*- coding: utf-8 -*-
%matplotlib inline
import numpy as np
import pandas as pd
from collections import Counter
import re
import pickle
import MeCab
mc = MeCab.Tagger("-Ochasen")

In [4]:
dirpath = "data/excel/"

filename = "A_W_171116.xlsx"
A_W_171116_excel = pd.ExcelFile(dirpath + filename)
sheet_name = A_W_171116_excel.sheet_names[0]
A_W_171116_df = A_W_171116_excel.parse(sheet_name)

filename = "A_W_171128.xlsx"
A_W_171128_excel = pd.ExcelFile(dirpath + filename)
sheet_name = A_W_171128_excel.sheet_names[0]
A_W_171128_df = A_W_171128_excel.parse(sheet_name)

filename = "171116.xlsx"
S_H_171116_excel = pd.ExcelFile(dirpath + filename)
sheet_name = S_H_171116_excel.sheet_names[0]
S_H_171116_df = S_H_171116_excel.parse(sheet_name)

filename = "2015斉藤.xlsx"
S_2015_excel = pd.ExcelFile(dirpath + filename)
sheet_name = S_2015_excel.sheet_names[0]
S_2015_df = S_2015_excel.parse(sheet_name)

In [5]:
print(A_W_171116_df.shape)
print(A_W_171128_df.shape)
print(S_H_171116_df.shape)
print(S_2015_df.shape)

(4943, 36)
(4945, 36)
(5017, 30)
(5017, 12)


## 必要なデータ
- id
    - A_W_YYMMDDのNo.(0列目)
    - SH_171116のid(0列目)
    - 斉藤2015の発言番号(0列目)
- reply
    - A_W_YYMMDDの返信元(1列目)
    - SH_171116のreply(3列目)
    - 斉藤2015の返信元(1列目)
- group
    - A_W_YYMMDDのgid(6列目)
    - SH_171116のgid(5列目)
    - 斉藤2015のグループ番号(6列目)
- who
    - A_W_YYMMDDのニックネーム(8列目)
    - SH_171116のcname(6列目)
    - 斉藤2015のニックネーム(8列目)
- body
    - A_W_YYMMDDの発言内容(11列目)
    - SH_171116のbody(8列目)
    - 斉藤2015の発言内容(11列目)
- argumentation_a
    - A_W_YYMMDDのArgumentation(A)(22列目)
    - SH_171116のArgumentation(H)(17列目)
- argumentation_b
    - A_W_YYMMDDのWatanabe.2(23列目)
    - SH_171116のArgumentation(S)(18列目)
    
    
    
## 各dfで必要な列
- A_W_YYMMDD[0, 1, 6, 8, 11, 22, 23]
- SH_171116[0, 3, 5, 6, 8, 17, 18]
- 斉藤2015[0, 1, 6, 8, 11]

In [6]:
A_W_YYMMDD_use = [0, 1, 6, 8, 11, 22, 23]
S_H_171116_use = [0, 3, 5, 6, 8, 17, 18]
S_2015_use = [0, 1, 6, 8, 11]

In [7]:
# A_W_YYMMDDの列名の統合
A_W_171116_df.columns = A_W_171128_df.columns

delete_columns_list = []
for i, column in enumerate(A_W_171116_df):
    if not i in A_W_YYMMDD_use:
        delete_columns_list.append(column)
        
A_W_171116_df = A_W_171116_df.drop(delete_columns_list, axis=1)
A_W_171128_df = A_W_171128_df.drop(delete_columns_list, axis=1)

delete_columns_list = []
for i, column in enumerate(S_H_171116_df):
    if not i in S_H_171116_use:
        delete_columns_list.append(column)
S_H_171116_df = S_H_171116_df.drop(delete_columns_list, axis=1)

delete_columns_list = []
for i, column in enumerate(S_2015_df):
    if not i in S_2015_use:
        delete_columns_list.append(column)
S_2015_df = S_2015_df.drop(delete_columns_list, axis=1)

In [8]:
print(not False in S_H_171116_df['id'].values == S_2015_df['発言番号'].values)
print(not False in S_H_171116_df['cname'].values == S_2015_df['ニックネーム'].values)
print(not False in S_H_171116_df['body'].values == S_2015_df['発言内容'].values)

True
True
True


In [9]:
S_H_171116_df["reply"] = S_2015_df['返信元']
S_H_171116_df["gid"] = S_2015_df['グループ番号']

In [10]:
concat_border = 299
A_W_171116_df = A_W_171116_df[0:concat_border]
A_W_171128_df = A_W_171128_df[concat_border:]
A_W_df = pd.concat([A_W_171116_df, A_W_171128_df])

In [11]:
print(A_W_df.shape)
print(S_H_171116_df.shape)

(4945, 7)
(5017, 7)


In [12]:
# 要素含まれるNanの数を確認
A_W_df.isnull().sum()

No.                   3
返信元                   3
gid                   3
ニックネーム                3
発言内容                  3
Argumentation(A)    113
Watanabe.2          212
dtype: int64

In [13]:
# A_W_dfの人間が付けたラベル列のNanを取り除く
A_W_df = A_W_df.dropna(subset=['Argumentation(A)', 'Watanabe.2'])
# 要素にNanが含まれる列を再確認
A_W_df.isnull().any()

No.                 False
返信元                 False
gid                 False
ニックネーム              False
発言内容                False
Argumentation(A)    False
Watanabe.2          False
dtype: bool

In [14]:
# 要素に含まれるNanの数を確認
S_H_171116_df.isnull().sum()

id                    0
reply                 0
gid                   0
cname                 0
body                  0
Argumentation(H)    189
Argumentation(S)    225
dtype: int64

In [15]:
# S_H_171116_dfの人間が付けたラベル列のNanを取り除く
S_H_171116_df = S_H_171116_df.dropna(subset=['Argumentation(H)', 'Argumentation(S)'])
# 要素にNanが含まれる列を再確認
S_H_171116_df.isnull().any()

id                  False
reply               False
gid                 False
cname               False
body                False
Argumentation(H)    False
Argumentation(S)    False
dtype: bool

In [16]:
A_W_df = A_W_df.drop_duplicates(subset=['ニックネーム', '発言内容'])
S_H_171116_df = S_H_171116_df.drop_duplicates(subset=['cname', 'body'])

Counter(A_W_df["Argumentation(A)"])+Counter(A_W_df["Watanabe.2"])

Counter({'Non-argumentative moves': 3851,
         'Simple claim': 337,
         'Grounded claim': 57,
         'Grounded and Qualified claim': 20,
         'Qualified claim': 14,
         'simple claim': 696,
         'grounded claim': 96,
         'non-argumentative moves': 1174,
         'qualified claim': 17,
         'grounded and qualified claim': 2,
         'Grounded Claim': 349,
         'Simple Claim': 2134,
         'Qualified Claim': 79})

In [17]:
# 表記ゆれの修正と確認
A_W_df['Argumentation(A)'] = A_W_df['Argumentation(A)'].str.lower()
counter1 = Counter(A_W_df['Argumentation(A)'])
A_W_df['Watanabe.2'] = A_W_df['Watanabe.2'].str.lower()
counter2 = Counter(A_W_df['Watanabe.2'])
counter1 + counter2

Counter({'non-argumentative moves': 5025,
         'simple claim': 3167,
         'grounded claim': 502,
         'grounded and qualified claim': 22,
         'qualified claim': 110})

In [18]:
Counter(S_H_171116_df["Argumentation(S)"])+Counter(S_H_171116_df["Argumentation(H)"])

Counter({'Non-argumentative moves': 5579,
         'Simple Claim': 2472,
         'Grounded Claim': 236,
         'Grounded and Qualified claim': 15,
         'Qualified Claim': 47,
         'Simple claim': 855,
         'Grounded claim': 114,
         'Qualified claim': 22})

In [19]:
# 表記ゆれの確認
S_H_171116_df['Argumentation(S)'] = S_H_171116_df['Argumentation(S)'].str.lower()
counter1 = Counter(S_H_171116_df['Argumentation(S)'])
S_H_171116_df['Argumentation(H)'] = S_H_171116_df['Argumentation(H)'].str.lower()
counter2 = Counter(S_H_171116_df['Argumentation(H)'])
counter1 + counter2

Counter({'non-argumentative moves': 5579,
         'simple claim': 3327,
         'grounded claim': 350,
         'grounded and qualified claim': 15,
         'qualified claim': 69})

In [20]:
A_W_match_df = A_W_df[A_W_df['Argumentation(A)'] == A_W_df['Watanabe.2']].reset_index(drop=True)
A_W_match_df.head()

Unnamed: 0,No.,返信元,gid,ニックネーム,発言内容,Argumentation(A),Watanabe.2
0,1.0,\N,1.0,まこぴす,よろしくお願いします！,non-argumentative moves,non-argumentative moves
1,31.0,\N,1.0,哲,よろしくお願いします,non-argumentative moves,non-argumentative moves
2,70.0,\N,1.0,仙波,名前なのが恥ずかしいです…\n\nよろしくお願いします！,non-argumentative moves,non-argumentative moves
3,119.0,\N,1.0,まこぴす,早速課題やっちゃいましょう！,non-argumentative moves,non-argumentative moves
4,163.0,\N,1.0,仙波,やっちゃいましょう\n\nmoodleはゴミです！,simple claim,simple claim


In [21]:
S_H_match_df = S_H_171116_df[S_H_171116_df['Argumentation(H)'] == S_H_171116_df['Argumentation(S)']].reset_index(drop=True)
S_H_match_df.head()

Unnamed: 0,id,reply,gid,cname,body,Argumentation(H),Argumentation(S)
0,1593,\N,1,世界のわたべ,よろしくお願いします。,non-argumentative moves,non-argumentative moves
1,1598,\N,1,ざきさん,よろしくです,non-argumentative moves,non-argumentative moves
2,1606,\N,1,あ,よろしくです,non-argumentative moves,non-argumentative moves
3,1659,\N,1,ざきさん,みなさんファイルをアップしましたか？,non-argumentative moves,non-argumentative moves
4,1683,\N,1,あ,今アップしました！,non-argumentative moves,non-argumentative moves


In [22]:
A_W_match_df.isnull().sum()

No.                 0
返信元                 0
gid                 0
ニックネーム              0
発言内容                0
Argumentation(A)    0
Watanabe.2          0
dtype: int64

In [23]:
S_H_match_df.isnull().sum()

id                  0
reply               0
gid                 0
cname               0
body                0
Argumentation(H)    0
Argumentation(S)    0
dtype: int64

In [24]:
header = ['id', 'reply', 'group_id', 'cname', 'body', 'Argumentation_A', 'Argumentation_B']
A_W_df.columns = header
S_H_171116_df.columns = header
A_W_match_df.columns = header
S_H_match_df.columns = header

In [25]:
All_df = pd.concat([A_W_df, S_H_171116_df], ignore_index=True)
print(All_df.shape)

(9083, 7)


In [26]:
match_df = pd.concat([A_W_match_df, S_H_match_df], ignore_index=True)
print(match_df.shape)

(7765, 7)


In [27]:
# データの形式の定義
data_format = ['id', 'reply', 'group_id', 'cname', 'body', 'label']
# 名前からデータのindexに変換する辞書
data_index = {column : i for i, column in enumerate(data_format)}
print(data_index)

{'id': 0, 'reply': 1, 'group_id': 2, 'cname': 3, 'body': 4, 'label': 5}


In [28]:
id = 0
reply = 1
group_id = 2
cname = 3
body = 4
label = 5

df_index = {column: i for i, column in enumerate(All_df)}
print(df_index)

{'id': 0, 'reply': 1, 'group_id': 2, 'cname': 3, 'body': 4, 'Argumentation_A': 5, 'Argumentation_B': 6}


In [29]:
df_use = [0, 1, 2, 3, 4]
# All_df_index['ラベル名']を変更して取り出すラベルを分ける
# label_position = df_index['Argumentation_A']
# label_position = df_index['Argumentation_B']
label_position = df_index['Argumentation_B']
df_use.append(label_position)
print(df_use)

[0, 1, 2, 3, 4, 5]


In [30]:
delete_columns_list = []
for i, column in enumerate(All_df):
#for i, column in enumerate(match_df):
    if not i in df_use:
        delete_columns_list.append(column)
input_df = All_df.drop(delete_columns_list, axis=1)
#input_df = match_df.drop(delete_columns_list, axis=1)
# input_df = A_W_consulted_df.drop(delete_columns_list, axis=1)
input_df.columns = data_format

## 必要のない列を取り除いて、入力用のdfを作成

In [31]:
input_df

Unnamed: 0,id,reply,group_id,cname,body,label
0,1.0,\N,1.0,まこぴす,よろしくお願いします！,non-argumentative moves
1,31.0,\N,1.0,哲,よろしくお願いします,non-argumentative moves
2,70.0,\N,1.0,仙波,名前なのが恥ずかしいです…\n\nよろしくお願いします！,non-argumentative moves
3,119.0,\N,1.0,まこぴす,早速課題やっちゃいましょう！,non-argumentative moves
4,163.0,\N,1.0,仙波,やっちゃいましょう\n\nmoodleはゴミです！,simple claim
5,194.0,\N,1.0,まこぴす,使いにくいです(笑),simple claim
6,302.0,\N,1.0,哲,同意です,simple claim
7,309.0,\N,1.0,仙波,以前インタラクティブアート受講していたのですが、その時に課題が不具合で出せなくなっていた時期...,grounded claim
8,385.0,\N,1.0,まこぴす,学習環境として必要最低限の機能は備えていると思うが、操作性の面ではPCの最低限の知識があるこ...,grounded and qualified claim
9,426.0,\N,1.0,まこぴす,自分の意見です！(笑),non-argumentative moves


In [32]:
counter = Counter(input_df['label'])
print(counter)

Counter({'non-argumentative moves': 5380, 'simple claim': 3081, 'grounded claim': 481, 'qualified claim': 118, 'grounded and qualified claim': 23})


# 出現頻度が少ないラベルを確認

In [33]:
input_data = input_df.values

In [34]:
def preprocess(xs, trim_citation=False):
    zs = [""] * len(xs)
    '''全角空白'''
    for i, l in enumerate(xs):
        xs[i] = re.sub('　', ' ', str(l))
    '''多重引用'''
    for i, l in enumerate(xs):
        xs[i] = re.sub('＆ｇｔ；', '＞', l)
    '''引用部分は除外'''
    count_refer = 0
    for i, l in enumerate(xs):
        sen = ""
        have_refer = 1
        for x in l.split('\n'):
            if len(x) == 0:
                continue
            elif x[0].encode('utf-8') == '＞'.encode('utf-8'):
                have_refer = 1
                x = x[1:]
                if len(x) > 0 and x[0].encode('utf-8') != '＞'.encode('utf-8') and \
                        x[0:2].encode('utf-8') != ' ＞'.encode('utf-8') and x != ' ':
                    if trim_citation:
                        zs[i] += x + '\n'
                    else:
                        sen = sen + "＞ " + x + '\n'
                continue
            else:
                sen = sen + x + '\n'
        xs[i] = sen.strip()
        count_refer += have_refer
    '''顔文字'''
    pat = re.compile('（[^ぁ-んァ-ン一-龠]+?）', re.U)
    for i, l in enumerate(xs):
        xs[i] = re.sub(pat, ' KAOMOJI ', l)
    '''日本語及び記号{？！ー〜。、} のみ有効'''
    pat = re.compile('[^＞ Ａ-Ｚａ-ｚ０-９ぁ-んァ-ン一-龠？！ー〜。、\n]+?', re.U)
    for i, l in enumerate(xs):
        xs[i] = re.sub(pat, '', l)
    '''記号繰り返し'''
    pat_1 = re.compile('[ｗ]{1,}', re.U)
    pat_2 = re.compile('[？]{1,}', re.U)
    pat_3 = re.compile('[！]{1,}', re.U)
    pat_4 = re.compile('[～]{1,}', re.U)
    pat_5 = re.compile('[ー]{1,}', re.U)
    pat_6 = re.compile('[。]{1,}', re.U)

    for i, l in enumerate(xs):
        l = re.sub(pat_1, 'ｗ', l)
        l = re.sub(pat_2, '？', l)
        l = re.sub(pat_3, '！', l)
        l = re.sub(pat_4, '～', l)
        l = re.sub(pat_5, 'ー', l)
        l = re.sub(pat_6, '。', l)
        xs[i] = l

    if trim_citation:
        return zs
    else:
        return None

In [35]:
def make_xs(orig, trim_citation=False, use_seq2seq=False):

    tmp = np.copy(orig)
    xs = tmp[:, body]
    zs = preprocess(xs, trim_citation=(trim_citation or use_seq2seq))

    if use_seq2seq:
        tmp_prev = np.roll(tmp, 1, axis=0)
        tmp_prev[0, 0] = ""
        ts = tmp[:, [1, 2]]
        xs_prev = tmp_prev[:, body]

        for i in range(len(tmp)):
            if tmp[i, group_id] != tmp_prev[i, group_id]:  # group が異なる
                xs_prev[i] = ""
        preprocess(xs_prev, trim_citation=True)

        for i in range(len(xs)):
            replay_to = tmp[i, 2]
            if zs[i] != "":
                xs_prev[i] = zs[i]
            elif replay_to != -1:
                a = xs[:i][tmp[:i, 1] == replay_to]
                if len(a) != 0:
                    xs_prev[i] = a[-1]

        return xs, xs_prev
    else:
        return xs

In [36]:
def get_doc(xs):
    doc = []
    for ll in xs:
        sen = []
        for l in ll.split('\n'):
            for w in mc.parse(l).split('\n'):
                mx = w.split('\t')
                if len(mx[0]) > 0:
                    sen.append(mx[0])
        doc.append(sen)
    return doc

In [37]:
xs, xs_pre = make_xs(input_data, trim_citation=True, use_seq2seq=True)
doc = get_doc(xs)
doc_pre = get_doc(xs_pre)

In [38]:
for i in doc[:10]:
    print(" ".join(i))

よろしく お願い し ます ！ EOS
よろしく お願い し ます EOS
名前 な の が 恥ずかしい です EOS よろしく お願い し ます ！ EOS
早速 課題 やっ ちゃ い ましょ う ！ EOS
やっ ちゃ い ましょ う EOS は ゴミ です ！ EOS
使い にくい です 笑 EOS
同意 です EOS
以前 インタラクティブ アート 受講 し て い た の です が 、 その 時 に 課題 が 不具合 で 出せ なく なっ て い た 時期 が あり まし て EOS 本当に もろい です EOS
学習 環境 として 必要 最低限 の 機能 は 備え て いる と 思う が 、 操作 性 の 面 で は の 最低限 の 知識 が ある こと が 前提 な ので 誰 も が 利用 できる と は 言え ない 。 また から だ と 見 やすい レイアウト も スマフォ から だ と 見 にくい と 感じ た 。 EOS
自分 の 意見 です ！ 笑 EOS


In [39]:
wd_set = Counter([x for sen in doc + doc_pre for x in sen])
wd_ary = np.array(list(wd_set.keys()))
wd_cnt = np.array(list(wd_set.values()))
wd_ary = wd_ary[np.argsort(wd_cnt)[::-1]]
wd_cnt.sort()
wd_cnt = wd_cnt[::-1]
wd_to_id = {wd: i for i, wd in enumerate(wd_ary)}
lb_to_id = {lb: i for i, lb in enumerate({x for x in input_data[:, label]})}
id_to_wd = {wd_to_id[wd]: wd for wd in wd_to_id.keys()}
id_to_lb = {lb_to_id[lb]: lb for lb in lb_to_id.keys()}

In [40]:
for i in range(10):
    print(list(wd_ary)[i], list(wd_cnt)[i])
    #print(list(wd_set.keys())[i], list(wd_set.values())[i])

EOS 28238
の 11219
て 7808
か 7505
です 7473
が 7359
に 7304
ます 7119
は 7084
、 7077


In [41]:
KIND = len(list(lb_to_id.keys()))
CUT_OFF = 2

print("words kinds:",
      len(wd_cnt), "words>=" + str(CUT_OFF) + ":", np.sum(wd_cnt >= CUT_OFF))
print("all words num:", np.sum(wd_cnt))
print("all words num:", np.sum(wd_cnt[wd_cnt >= CUT_OFF]))

other_id = np.sum(wd_cnt >= CUT_OFF)
wd_to_id.update({wd: other_id for wd in wd_ary[wd_cnt < CUT_OFF]})
print(other_id)
id_to_wd[other_id] = '[X]'

print(max(wd_to_id.values()))

with open('data/Argumentation_B_id_to_wd.pickle', mode='wb') as f:
    pickle.dump(id_to_wd, f)
with open('data/Argumentation_B_wd_to_id.pickle', mode='wb') as f:
    pickle.dump(wd_to_id, f)
with open('data/Argumentation_B_wd_set.pickle', mode='wb') as f:
    pickle.dump(wd_set, f)
with open('data/Argumentation_B_id_to_lb.pickle', mode='wb') as f:
    pickle.dump(id_to_lb, f)
pd.DataFrame(
    np.array(list(id_to_lb.items())).T, index=["ラベルid", "ラベルname"])

words kinds: 5602 words>=2: 5544
all words num: 338449
all words num: 338391
5544
5544


Unnamed: 0,0,1,2,3,4
ラベルid,0,1,2,3,4
ラベルname,grounded and qualified claim,non-argumentative moves,grounded claim,simple claim,qualified claim


In [42]:
of = open("data/Argumentation_B_edu_data.txt", "w")
for i, (sen, lb) in enumerate(zip(np.array(doc), input_data[:, label])):
    sen_str = "".join([str(wd_to_id[wd]) + " " for wd in sen])
    print_str = str(lb_to_id[lb]) + " " + str(lb_to_id[lb]) + " " + sen_str
    print(print_str, file=of)
of.close()

of = open("data/Argumentation_B_edu_data_pre.txt", "w")
for i, sen in enumerate(np.array(doc_pre)):
    sen_str = "".join([str(wd_to_id[wd]) + " " for wd in sen])
    print(sen_str, file=of)
of.close()

print(max([len(sen) for sen in doc]))

print(KIND)

313
5


# train