In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import Counter
import matplotlib.pyplot as plt
import pickle
from sklearn import metrics


In [2]:
np.set_printoptions(threshold=np.nan)

# labelの詳細
with open('data/Epistemic_FA_id_to_lb.pickle', mode='rb') as f:
    id_to_code = pickle.load(f)
    
    
old_input = np.asarray([
    " ".join(i.strip().split()[2:])
    for i in open("data/Epistemic_FA_edu_data.txt").readlines()
])
old_labels = np.asarray(
    [
        l.strip().split()[:2]
        for l in open("data/Epistemic_FA_edu_data.txt").readlines()
    ],
    dtype=np.int64)


new_input = np.asarray([
    " ".join(i.strip().split()[2:])
    for i in open("data/New_edu_data.txt").readlines()
])
new_labels = np.asarray(
    [
        l.strip().split()[:2]
        for l in open("data/New_edu_data.txt").readlines()
    ],
    dtype=np.int64)


train = np.arange(len(old_input)) % 10 != 0

print(id_to_code)
print("old_input_train:", len(old_input[train]))
print("old_labels_train:", len(old_labels[train]))

print("old_input_test:", len(old_input[~train]))
print("old_labels_test:", len(old_labels[~train]))

print("new_input:", len(new_input))
print("new_labels:", len(new_labels))

print("new_input[0]:", new_input[0])
print("new_labels[1]:", new_labels[1])

{0: 'on task', 1: 'off task'}
old_input_train: 7614
old_labels_train: 7614
old_input_test: 846
old_labels_test: 846
new_input: 2743
new_labels: 2743
new_input[0]: 49 89 0
new_labels[1]: [0 0]


In [3]:
xxxx = old_labels[:,:1]
print(xxxx.shape)
print(Counter(np.reshape(xxxx,(-1))))

(8460, 1)
Counter({1: 4543, 0: 3917})


# Hashingのために、以前のデータとnew_inputを結合する
# all_input = [ old_input_train, old_input_test, new_input ]

In [4]:
all_input = np.hstack((old_input[train],old_input[~train],new_input)) 
a = len(old_input[train])
b = len(old_input[~train])
c = len(new_input)
print("old_input_train size:",a)
print("old_input_test size:",b)
print("new_input size:",c)

old_input_train size: 7614
old_input_test size: 846
new_input size: 2743


# 以前のデータの結果

In [5]:
c_vectorizer = CountVectorizer()
h_vectorizer = HashingVectorizer(non_negative=True, ngram_range=(1, 2), norm=u'l2')
dat1 = h_vectorizer.fit_transform(all_input.tolist())

train_feature = dat1[0:a]
train_label = old_labels[train][:,0]
test_feature = dat1[a:a+b]
test_label = old_labels[~train][:,0]
class_names = list(id_to_code.values())

clf = svm.SVC(kernel='linear',probability=True)
clf.fit(train_feature,train_label)
test_pred = clf.predict(test_feature)
print("test_pred:",Counter(test_pred), " test_true:",Counter(test_label))
print("test acc =",metrics.accuracy_score(test_label, test_pred))
print(id_to_code)
print("class:", class_names)
print(confusion_matrix(test_pred,test_label))
print(classification_report(test_pred,test_label, target_names=class_names))



test_pred: Counter({1: 459, 0: 387})  test_true: Counter({1: 456, 0: 390})
test acc = 0.9089834515366431
{0: 'on task', 1: 'off task'}
class: ['on task', 'off task']
[[350  37]
 [ 40 419]]
             precision    recall  f1-score   support

    on task       0.90      0.90      0.90       387
   off task       0.92      0.91      0.92       459

avg / total       0.91      0.91      0.91       846



# 新しいデータの予測結果

In [6]:
c_vectorizer = CountVectorizer()
h_vectorizer = HashingVectorizer(non_negative=True, ngram_range=(1, 2), norm=u'l2')
dat2 = h_vectorizer.fit_transform(all_input.tolist())

train_feature = dat2[0:a]
train_label = old_labels[train][:,0]
test_feature = dat2[a:a+b]
test_label = old_labels[~train][:,0]
test_feature_new = dat2[a+b:a+b+c] #新しいデータ
class_names = list(id_to_code.values())

clf = svm.SVC(kernel='linear',probability=True)
clf.fit(train_feature,train_label)
test_pred = clf.predict(test_feature)
test_new_pred = clf.predict(test_feature_new)
print(Counter(test_new_pred))
print(id_to_code)
print(test_new_pred)


of = open("data/New_pred_Epistemic_data.txt", "w")
for label in test_new_pred:
    print(id_to_code[label], file=of)
of.close()




Counter({0: 1649, 1: 1094})
{0: 'on task', 1: 'off task'}
[1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 1 1
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 0
 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 0 1 1 1 1 1 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1
 0 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 0 1 1 0 0 0
 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 1 1 0 0 0 1
 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1
 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0

# 新しいデータの予測結果(train all)

In [7]:
c_vectorizer = CountVectorizer()
h_vectorizer = HashingVectorizer(non_negative=True, ngram_range=(1, 2), norm=u'l2')
dat3 = h_vectorizer.fit_transform(all_input.tolist())

# train_feature = dat2[0:a]
# train_label = old_labels[train][:,0]
# test_feature = dat2[a:a+b]
# test_label = old_labels[~train][:,0]

train_feature = dat3[0:a+b]

train_label = np.hstack((old_labels[train][:,0],old_labels[~train][:,0])) 
test_feature_new = dat2[a+b:a+b+c] #新しいデータ
class_names = list(id_to_code.values())

clf = svm.SVC(kernel='linear',probability=True)
clf.fit(train_feature,train_label)
test_pred = clf.predict(test_feature)
test_new_pred = clf.predict(test_feature_new)
print(Counter(test_new_pred))
print(id_to_code)
print(test_new_pred)


of = open("data/New_pred_Epistemic_2_data.txt", "w")
for label in test_new_pred:
    print(id_to_code[label], file=of)
of.close()








Counter({0: 1633, 1: 1110})
{0: 'on task', 1: 'off task'}
[1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 0 1 0 0 1 1 1 0 0 1 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 0
 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 0 1 1 1 1 1 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 0 0 0 0 1
 0 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 1
 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 0 1 1 0 0 0
 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 1 1 0 0 0 1
 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1
 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0