# Data setup

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import layers
# tf.__version__
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load the student submissions and the teacher reference answers.
student, teacher = (
    pd.read_csv(csv_path)
    for csv_path in ('./student_sentence_data.csv', './teacher_sentence_data.csv')
)

In [3]:
# Preview the frame loaded from student_sentence_data.csv.
student

Unnamed: 0,sentence_id,sentence,answer,label
0,20df21b159a5e1d0ee484c936f1597ef,Suppose you need to measure the temperature in...,너는 농작물 밭에서 온도를 측정할 필요가 있다고 가정해라,1
1,1b90bfe67b06a515eec82e88e1bd7d84,There are different kinds of knowledge.,지식에는 다른 종류가 있다,1
2,2673ce8234cc65bf5d5e81743172700b,Why?,왜일까,1
3,98fdeabb8c02fc9ed1ce18a72fb7e84b,"You can be perfect, but you need to change the...","너는 완벽할 수 있지만, 그것을 생각하는 방식을 변화할 필요가 있다",1
4,60cd307ed0ad93d58635cb0d9d58e1e6,He was a violinist and composer known for his ...,그는 유니크한 퍼포먼스 방법으로 알려져있는 바이올리니스트이자 작곡가였다.,1
...,...,...,...,...
1729,a2cb57d228407136bd5bd7d1258af2fe,"Or, from the perspective of the character, Who...",또는 등장인물의 관점애서 난 ㄴㄱ?,1
1730,51dc29f0fc5109927bdee5fa23afb09f,Who is this person?,얘 ㄴㄱ?,1
1731,9cd6dbaf1d9d205d0cb5d9f2263a710c,If there's a single secret to storytelling the...,만약 이야기하기에 한가지ㅜ비밀이 있다면 난 그게ㅜ이거라 믿는다,1
1732,4fb845c67d91bcb3178498fc6fe1fedc,Diego,디에고,1


In [4]:
import warnings
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer

warnings.filterwarnings('ignore')

In [5]:
# Keep only digits and Hangul syllables; every other character becomes a space.
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the answers uncleaned.
student['answer'] = student['answer'].str.replace('[^0-9가-힣]', ' ', regex=True)
teacher['answer'] = teacher['answer'].str.replace('[^0-9가-힣]', ' ', regex=True)

# Soynlp tokenizer

In [6]:
import soynlp
from soynlp.noun import LRNounExtractor_v2
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

import warnings
warnings.filterwarnings('ignore')

In [7]:
import collections

def corpus_init():
    """Create ``./corpus_target.txt`` or truncate it to an empty file.

    The file is opened in write mode with UTF-8 encoding inside a ``with`` block,
    so the handle is closed even if the write raises.
    """
    with open("./corpus_target.txt", 'w', encoding='utf8') as f:
        f.write('')

def corpus_save(target):
    """Append ``target`` to ``./corpus_target.txt``, one entry per line.

    Args:
        target: Iterable of strings; the entries are joined with newlines
            before being written (UTF-8).

    No newline is written after the last entry, so a second call continues on
    the same line as the previous call's final entry. The file is handled in a
    ``with`` block so it is closed even if the write raises.
    """
    save = '\n'.join(target)
    with open("./corpus_target.txt", 'a', encoding='utf8') as f:
        f.write(save)
    
    
def stop_save(target):
    """Append the stop words in ``target`` to ``./stopwords.txt``, one per line.

    Args:
        target: Iterable of strings; the entries are joined with newlines
            before being written (UTF-8).

    The file is opened in append mode and no trailing newline is written, so
    repeated calls accumulate entries and the first word of a later call lands
    on the same line as the last word of the previous one. The file is handled
    in a ``with`` block so it is closed even if the write raises.
    """
    save = '\n'.join(target)
    with open("./stopwords.txt", 'a', encoding='utf8') as f:
        f.write(save)
    
def return_tokenizer():
    """Build an ``LTokenizer`` from noun scores learned on the saved corpus.

    Reads ``./corpus_target.txt`` as a ``DoublespaceLineCorpus`` (sentence
    iteration enabled), runs ``LRNounExtractor_v2.train_extract`` on it, and uses
    the ``score`` attribute of each extracted noun as that word's tokenizer score.

    Returns:
        LTokenizer: Tokenizer initialised with the extracted noun scores.
    """
    sentences = DoublespaceLineCorpus("./corpus_target.txt", iter_sent=True)
    extractor = LRNounExtractor_v2(verbose=True)
    extracted = extractor.train_extract(sentences)

    noun_scores = {}
    for noun, stats in extracted.items():
        noun_scores[noun] = stats.score

    return LTokenizer(scores=noun_scores)

In [8]:
# Corpus lines: teacher answers followed by student answers, with everything
# except digits and Hangul syllables replaced by a space.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default.
target = list(teacher['answer'].str.replace('[^0-9가-힣]', ' ', regex=True))
target.extend(list(student['answer'].str.replace('[^0-9가-힣]', ' ', regex=True)))

In [9]:
# Start from an empty corpus file, then write the cleaned answers to it.
corpus_init()
corpus_save(target)

In [10]:
# Train the noun extractor on the saved corpus file and build the tokenizer from its scores.
tokenizer = return_tokenizer()

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 13631 from 5338 sents. mem=0.315 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=35740, mem=0.339 Gb
[Noun Extractor] batch prediction was completed for 5368 words
[Noun Extractor] checked compounds. discovered 321 compounds
[Noun Extractor] postprocessing detaching_features : 2365 -> 2311
[Noun Extractor] postprocessing ignore_features : 2311 -> 2272
[Noun Extractor] postprocessing ignore_NJ : 2272 -> 2256
[Noun Extractor] 2256 nouns (321 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=0.345 Gb                    
[Noun Extractor] 62.14 % eojeols are covered


In [11]:
# Sanity check: tokenize the first student answer.
tokenizer(student['answer'][0])

['너는', '농작물', '밭에서', '온도', '를', '측정', '할', '필요가', '있다고', '가정', '해라']

In [12]:
# Tokenize every student answer, then every teacher answer, into a single list
# of token lists (student rows first).
clean_train_answer = [
    tokenizer(sentence)
    for answers in (student['answer'], teacher['answer'])
    for sentence in answers
]

In [13]:
# Initial stop-word set consulted by `cleaner`.
stop_words = {'은', '는', '이', '가', '하', '아', '것', '들', '의', '있',
              '되', '수', '보', '주', '등', '한', '를', '을', '에'}


def cleaner(target_sentence):
    """Return the tokens of ``target_sentence`` that are not stop words.

    Args:
        target_sentence: Iterable of token strings.

    Returns:
        list: The remaining tokens, in their original order.

    The module-level ``stop_words`` is looked up on every call, so rebinding it
    later changes what subsequent calls filter out.
    """
    return [token for token in target_sentence if token not in stop_words]

In [14]:
import collections

# Frequency of every token across all tokenized answers.
tp = collections.Counter(
    token for tokens in clean_train_answer for token in tokens
)

# Tokens that occur exactly once in the whole corpus.
only_word = [word for word, freq in tp.items() if freq == 1]

In [30]:
# Tabulate token frequencies, most frequent first, and preview the top ten.
ct_word = pd.DataFrame({
    'word': list(tp.keys()),
    'count': list(tp.values()),
}).sort_values(by='count', ascending=False)

ct_word.head(10)

Unnamed: 0,word,count
46,의,1338
22,을,1223
63,은,1120
112,이,992
48,에,883
4,를,764
57,는,664
15,가,507
125,한,494
24,하는,398


In [16]:
# Extended stop-word list; the set built from it rebinds the global `stop_words`
# that `cleaner` reads at call time.
stop = ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한','를','을','에','적','할','인가가',
        '해서','된','도','인','로','함']
# stop.extend(only_word)
stop_words = set(stop)
# Persist the stop words. stop_save opens the file in append mode, so re-running
# this cell writes the words again.
stop_save(stop_words)

In [17]:
# Sanity check: tokenize the first student answer and drop its stop words.
test = tokenizer(student['answer'][0])
cleaner(test)

['너는', '농작물', '밭에서', '온도', '측정', '필요가', '있다고', '가정', '해라']

In [18]:
# Tokenize and stop-word-filter every student answer.
# Note: this rebinds clean_train_answer to the student answers only; teacher
# answers are not included in this list.
clean_train_answer = [cleaner(tokenizer(i)) for i in student['answer']]
clean_train_answer

[['너는', '농작물', '밭에서', '온도', '측정', '필요가', '있다고', '가정', '해라'],
 ['지식', '에는', '다른', '종류', '있다'],
 ['왜일까'],
 ['너는', '완벽', '있지만', '그것', '생각', '하는', '방식', '변화', '필요가', '있다'],
 ['그는', '유니크한', '퍼포먼스', '방법', '으로', '알려져', '있는', '바이올리니스트이자', '작곡', '가였다'],
 ['누가', '그들', '비난', '할수있나'],
 ['고생', '물학자들은', '지구', '에서', '생명', '역사', '집중', '한다'],
 ['랍스터', '치킨', '만큼', '저렴', '하면', '우리', '그것', '덜', '즐길', '지도', '모른다'],
 ['복원', '청소하고', '피해', '입은', '서식지', '다시', '만드는', '걸', '포함', '한다'],
 ['우리',
  '부엌',
  '으로',
  '향하는',
  '옆문으로',
  '들어왔을때',
  '나는',
  '무엇',
  '잘못',
  '되었다는것을',
  '즉시',
  '알았다'],
 ['여기', '아름', '다운', '무지', '개', '보세요'],
 ['나는', '무언가가', '벽을', '따라서', '느리게', '움직', '이는것을', '들었다'],
 ['하나의', '문화', '다른', '문화', '보다', '나은지를', '어떻게', '결정', '할지', '알기', '어렵다'],
 ['1824년', '페루', '스페인으로부터', '자유', '얻었다'],
 ['문화', '또한', '역할', '한다'],
 ['그것', '더', '쉽고', '편하다'],
 ['사람들', '영웅', '사랑', '한다'],
 ['그는', '주위', '세상', '모든', '방향', '으로', '움직', '이는', '이상', '기분', '느꼈다'],
 ['그것', '완벽', '하다'],
 ['그의', '아버지', '그가', '교회', '목사가', '되기를', '

# Translator

In [19]:
# from googletrans import Translator
# translator = Translator()

# sents = list(student['answer'].str.replace('[^0-9가-힣ㄱ-ㅎㅏ-ㅣ]',' '))
# ans = []
# for sent in sents:
#     try:
#         tar = translator.translate(sent, dest="en") 
#         ans.append(tar)
#         print(tar)
#     except:
#         ans.append('')
# ans

In [20]:
# ans = [i.text for i in ans]
# pd.DataFrame(ans).to_csv('./student_trans.csv')

# TF-IDF part 1: student answers vs. teacher answers (Korean)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fit corpus: teacher answers followed by student answers, restricted to digits
# and Hangul syllables. regex=True is explicit because pandas >= 2.0 treats the
# pattern as a literal string by default.
target = list(teacher['answer'].str.replace('[^0-9가-힣]', ' ', regex=True))
target.extend(list(student['answer'].str.replace('[^0-9가-힣]', ' ', regex=True)))

# One row per student answer, next to the teacher answer sharing its index.
answer1 = pd.DataFrame()
answer1['st_answer'] = student['answer']
answer1['te_answer'] = teacher['answer']
answer1['label'] = student['label']

# Use the custom tokenizer and the stop-word list.
# For n = 1..3, fit an n-gram TF-IDF model on the combined corpus and store the
# cosine similarity between each student answer and the teacher answer in the
# same row as column '<n>gram'.
for j in range(1, 4):
    tfidf = TfidfVectorizer(
        tokenizer=tokenizer,
        # Recent scikit-learn releases validate this parameter and reject a set,
        # so the stop words are handed over as a list.
        stop_words=list(stop_words),
        ngram_range=(j, j),
    ).fit(target)
    st_vector = tfidf.transform(student['answer'])
    te_vector = tfidf.transform(teacher['answer'])
    # cosine_similarity returns a 1x1 array for a single pair of rows; pick the
    # element explicitly rather than calling float() on a 2-D array, which NumPy
    # has deprecated.
    scores = [
        float(cosine_similarity(st_vector[i], te_vector[i])[0, 0])
        for i in range(len(answer1))
    ]
    answer1[f'{j}gram'] = scores

In [22]:
# Student/teacher answer pairs with their 1- to 3-gram TF-IDF cosine similarities.
answer1

Unnamed: 0,st_answer,te_answer,label,1gram,2gram,3gram
0,너는 농작물 밭에서 온도를 측정할 필요가 있다고 가정해라,당신이 농작물 밭에서 온도를 측정할 필요하다고 가정해보자,1,0.599107,0.361024,0.266093
1,지식에는 다른 종류가 있다,다른 종류의 지식이 있다,1,0.849616,0.235235,0.000000
2,왜일까,왜 그럴까,1,0.000000,0.000000,0.000000
3,너는 완벽할 수 있지만 그것을 생각하는 방식을 변화할 필요가 있다,여러분은 완벽할 수 있다 하지만 여러분은 그것 완벽함 에 대해 여러분이 생각하는 ...,1,0.495286,0.201255,0.084286
4,그는 유니크한 퍼포먼스 방법으로 알려져있는 바이올리니스트이자 작곡가였다,그는 그의 독특한 연주 방법으로 알려진 바이올리니스트이자 작곡가였다,1,0.538268,0.298015,0.111124
...,...,...,...,...,...,...
1729,또는 등장인물의 관점애서 난,또는 그 등장인물의 관점에서 나는 누구인가,1,0.482320,0.179716,0.000000
1730,얘,이 사람은 누구인가,1,0.000000,0.000000,0.000000
1731,만약 이야기하기에 한가지 비밀이 있다면 난 그게 이거라 믿는다,만약 이야기하는 일에 한 가지 비밀이 있다면 그렇다면 나는 그것이 이것이라고 ...,1,0.442056,0.180508,0.000000
1732,디에고,,1,0.000000,0.000000,0.000000


# TF-IDF part 2: translated student answers vs. original English sentences

In [23]:
# Translated student answers (column '0' of student_trans.csv) paired with the
# original English sentence in the same row. Only ASCII letters and digits are
# kept; regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default.
trans = pd.read_csv('./student_trans.csv')
answer2 = pd.DataFrame()
answer2['trans_answer'] = trans['0'].str.replace('[^0-9a-zA-Z]', ' ', regex=True)
answer2['question'] = student['sentence'].str.replace('[^0-9a-zA-Z]', ' ', regex=True)
answer2['label'] = student['label']


# Fit corpus: all questions followed by all translated answers.
target = list(answer2['question'])
target.extend(list(answer2['trans_answer']))

# For n = 1..4, fit an n-gram TF-IDF model and store the row-wise cosine
# similarity between translated answer and question as 'trans_<n>gram'.
for j in range(1, 5):
    tfidf = TfidfVectorizer(ngram_range=(j, j)).fit(target)
    st_vector = tfidf.transform(answer2['trans_answer'])
    te_vector = tfidf.transform(answer2['question'])
    # cosine_similarity returns a 1x1 array for a single pair of rows; pick the
    # element explicitly rather than calling float() on a 2-D array, which NumPy
    # has deprecated.
    scores = [
        float(cosine_similarity(st_vector[i], te_vector[i])[0, 0])
        for i in range(len(answer2))
    ]
    answer2[f'trans_{j}gram'] = scores

In [24]:
# Translated answers and questions with their 1- to 4-gram TF-IDF cosine similarities.
answer2

Unnamed: 0,trans_answer,question,label,trans_1gram,trans_2gram,trans_3gram,trans_4gram
0,Suppose you need to measure the temperature in...,Suppose you need to measure the temperature in...,1,0.995607,0.850657,0.668330,0.646124
1,There are different kinds of knowledge,There are different kinds of knowledge,1,1.000000,1.000000,1.000000,1.000000
2,Why is it,Why,1,0.812342,0.000000,0.000000,0.000000
3,You can be perfect but you need to change the...,You can be perfect but you need to change the...,1,1.000000,1.000000,1.000000,1.000000
4,He was a violinist and composer known as a uni...,He was a violinist and composer known for his ...,1,0.938912,0.713160,0.552313,0.363697
...,...,...,...,...,...,...,...
1729,Or is it because of the perspective of the cha...,Or from the perspective of the character Who...,1,0.691174,0.474348,0.379573,0.288073
1730,His one,Who is this person,1,0.000000,0.000000,0.000000,0.000000
1731,If I have a secret I believe that if I have a...,If there s a single secret to storytelling the...,1,0.554813,0.000000,0.000000,0.000000
1732,Diego,Diego,1,1.000000,0.000000,0.000000,0.000000


# RandomForest Classifier

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature matrix: the 1- to 3-gram similarities from both TF-IDF passes.
# answer1 has no '4gram' column and 'trans_4gram' is not used as a feature.
train_data = pd.DataFrame()
for column in ('1gram', '2gram', '3gram'):
    train_data[column] = answer1[column]
for column in ('trans_1gram', 'trans_2gram', 'trans_3gram'):
    train_data[column] = answer2[column]

train_label = answer2['label']


# Fixed 85/15 split; every forest below is trained and scored on this same split.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_label, test_size=0.15, random_state=10
)

from sklearn.ensemble import RandomForestClassifier

# Collect test accuracy for 100 forests that differ only in their random_state.
# The data split does not change, so the spread reflects forest-seed variation
# only, not resampling of the data.
bootstrap = []
for i in range(100):  # random draws: one forest seed per iteration
    rf = RandomForestClassifier(random_state=i)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but the documented order keeps the call correct if the metric is swapped.
    acc = accuracy_score(y_test, pred)
    print(acc)
    bootstrap.append(acc)

0.7662835249042146
0.7662835249042146
0.7662835249042146
0.7701149425287356
0.7662835249042146
0.7662835249042146
0.7739463601532567
0.7586206896551724
0.7701149425287356
0.7624521072796935
0.7624521072796935
0.7739463601532567
0.7547892720306514
0.7662835249042146
0.7816091954022989
0.7662835249042146
0.7509578544061303
0.7624521072796935
0.7509578544061303
0.7547892720306514
0.7624521072796935
0.7624521072796935
0.7662835249042146
0.7624521072796935
0.7739463601532567
0.7586206896551724
0.7662835249042146
0.7624521072796935
0.7547892720306514
0.7586206896551724
0.7586206896551724
0.7739463601532567
0.7586206896551724
0.7586206896551724
0.7509578544061303
0.7931034482758621
0.7662835249042146
0.7816091954022989
0.7662835249042146
0.7624521072796935
0.7662835249042146
0.7547892720306514
0.7662835249042146
0.7777777777777778
0.7586206896551724
0.7624521072796935
0.7701149425287356
0.7662835249042146
0.7777777777777778
0.7816091954022989
0.7624521072796935
0.7624521072796935
0.7662835249

In [32]:
# Summary statistics of the accuracy values collected in `bootstrap`.
CI = pd.Series(bootstrap)
CI.describe()

count    100.000000
mean       0.767050
std        0.009742
min        0.743295
25%        0.762452
50%        0.766284
75%        0.773946
max        0.793103
dtype: float64

In [33]:
import scipy.stats

def conf_interval(data, confidence=0.95):
    """Return the sample mean with a Student-t confidence interval around it.

    Args:
        data: Sequence of numbers (anything ``np.array`` accepts).
        confidence: Two-sided confidence level, 0.95 by default.

    Returns:
        tuple: ``(mean, lower, upper)`` where the half-width is the standard
        error of the mean times the t quantile at ``(1 + confidence) / 2`` with
        ``len(data) - 1`` degrees of freedom.
    """
    samples = np.array(data) * 1.0
    dof = len(samples) - 1
    center = np.mean(samples)
    std_err = scipy.stats.sem(samples)
    t_quantile = scipy.stats.t.ppf((1 + confidence) / 2., dof)
    half_width = std_err * t_quantile
    return center, center - half_width, center + half_width

In [34]:
# Mean accuracy with its 95% t-interval (mean, lower, upper).
conf_interval(CI,confidence=0.95)

(0.7670498084291186, 0.7651168565698998, 0.7689827602883375)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
bootstrap1 = []
# Hyper-parameter grid. max_depth starts at 1: RandomForestClassifier only
# accepts None or a positive integer, so a depth of 0 could only produce failed
# fits for every n_estimators value and fold.
parms = {'n_estimators': list(range(50, 150, 10)),
         'max_depth': list(range(1, 10))}
rf_clf = RandomForestClassifier(random_state=12, n_jobs=-1, verbose=True)  # n_jobs ?
# 6-fold cross-validated search over the grid, selecting by accuracy.
grid_cv = GridSearchCV(rf_clf, param_grid=parms, cv=6, n_jobs=-1, scoring='accuracy')
grid_cv.fit(X_train, y_train)

# print(grid_cv.best_estimator_)
# print(grid_cv.best_params_)
# print(round(grid_cv.best_score_,4))
# Score the refit best estimator on the held-out split; accuracy_score takes
# (y_true, y_pred).
pred = grid_cv.predict(X_test)
acc = accuracy_score(y_test, pred)
print(acc)

0.7471264367816092


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 140 out of 140 | elapsed:    0.0s finished
