In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import Counter
import matplotlib.pyplot as plt
import pickle
from sklearn import metrics


In [2]:
np.set_printoptions(threshold=np.nan)
with open('data/Social_FA_id_to_lb.pickle', mode='rb') as f:
    id_to_code = pickle.load(f)
x = np.asarray([
    " ".join(i.strip().split()[2:])
    for i in open("data/Social_FA_edu_data.txt").readlines()
])
t = np.asarray(
    [
        l.strip().split()[:2]
        for l in open("data/Social_FA_edu_data.txt").readlines()
    ],
    dtype=np.int64)
train = np.arange(len(x)) % 10 != 0

print("train_x:", len(x[train]))
print("test_x", len(x[~train]))
print("train_y", len(t[train]))
print("test_y", len(t[~train]))

print("train_x[0]:", x[train][0])
print("train_y[0]:", t[train][0])
print("test_x[1]:", x[~train][1])
print("test_y[1]:", t[~train][1])


train_x: 2357
test_x 262
train_y 2357
test_y 262
train_x[0]: 1167 1968 3356 562 19 9 54 15 1 7 2 8 64 135 5 63 2 2449 10 649 102 82 9 54 15 977 2 48 55 9 0 3553 2560 7 0
train_y[0]: [6 6]
test_x[1]: 1883 3 1308 2 65 1 3 18 6 12 0
test_y[1]: [6 6]


In [3]:
c_vectorizer = CountVectorizer()
# h_vectorizer = HashingVectorizer(non_negative=True, ngram_range=(1, 2), norm=u'l2')
# h_vectorizer = HashingVectorizer(ngram_range=(1, 2), norm=u'l2')
h_vectorizer = HashingVectorizer()
dat = h_vectorizer.fit_transform(x.tolist())

train_feature = dat[train]
train_label = t[train][:,0]
test_feature = dat[~train]
test_label = t[~train][:,0]
class_names = list(id_to_code.values())

clf = svm.SVC(kernel='linear',probability=True)
clf.fit(train_feature,train_label)
test_pred = clf.predict(test_feature)
print("test_pred:",Counter(test_pred), " test_true:",Counter(test_label))
print("test acc =",metrics.accuracy_score(test_label, test_pred))
print("class:", class_names)
print(confusion_matrix(test_pred,test_label))
print(classification_report(test_pred,test_label, target_names=class_names))

test_pred: Counter({6: 155, 0: 80, 5: 27})  test_true: Counter({6: 127, 0: 88, 5: 29, 3: 7, 1: 6, 2: 3, 4: 2})
test acc = 0.7480916030534351
class: ['quick consensus building', 'conflict-oriented consensus building', 'summary', 'integration-oriented consensus building', 'blank', 'elicitation', 'externalization']
[[ 67   1   0   3   0   1   8]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  1   0   0   0   0  18   8]
 [ 20   5   3   4   2  10 111]]
                                         precision    recall  f1-score   support

               quick consensus building       0.76      0.84      0.80        80
   conflict-oriented consensus building       0.00      0.00      0.00         0
                                summary       0.00      0.00      0.00         0
integration-oriented consensus building       0.00      0.00      0.00         0
                                  blank       0.00      0.00  

  'recall', 'true', average, warn_for)


In [4]:
clf = svm.SVC(kernel='poly',probability=True)
clf.fit(train_feature,train_label)
test_pred = clf.predict(test_feature)
print("test_pred:",Counter(test_pred), " test_true:",Counter(test_label))
print("test acc =",metrics.accuracy_score(test_label, test_pred))
print("class:", class_names)
print(confusion_matrix(test_pred,test_label))
print(classification_report(test_pred,test_label, target_names=class_names))

test_pred: Counter({6: 262})  test_true: Counter({6: 127, 0: 88, 5: 29, 3: 7, 1: 6, 2: 3, 4: 2})
test acc = 0.4847328244274809
class: ['quick consensus building', 'conflict-oriented consensus building', 'summary', 'integration-oriented consensus building', 'blank', 'elicitation', 'externalization']
[[  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [ 88   6   3   7   2  29 127]]
                                         precision    recall  f1-score   support

               quick consensus building       0.00      0.00      0.00         0
   conflict-oriented consensus building       0.00      0.00      0.00         0
                                summary       0.00      0.00      0.00         0
integration-oriented consensus building       0.00      0.00      0.00         0
                                  blank       0.00      0.00      0.00      

  'recall', 'true', average, warn_for)


In [5]:
clf = svm.SVC(kernel='sigmoid',probability=True)
clf.fit(train_feature,train_label)
test_pred = clf.predict(test_feature)
print("test_pred:",Counter(test_pred), " test_true:",Counter(test_label))
print("test acc =",metrics.accuracy_score(test_label, test_pred))
print("class:", class_names)
print(confusion_matrix(test_pred,test_label))
print(classification_report(test_pred,test_label, target_names=class_names))

test_pred: Counter({6: 262})  test_true: Counter({6: 127, 0: 88, 5: 29, 3: 7, 1: 6, 2: 3, 4: 2})
test acc = 0.4847328244274809
class: ['quick consensus building', 'conflict-oriented consensus building', 'summary', 'integration-oriented consensus building', 'blank', 'elicitation', 'externalization']
[[  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [ 88   6   3   7   2  29 127]]
                                         precision    recall  f1-score   support

               quick consensus building       0.00      0.00      0.00         0
   conflict-oriented consensus building       0.00      0.00      0.00         0
                                summary       0.00      0.00      0.00         0
integration-oriented consensus building       0.00      0.00      0.00         0
                                  blank       0.00      0.00      0.00      

  'recall', 'true', average, warn_for)


In [6]:
clf = svm.SVC(kernel='rbf',probability=True)
clf.fit(train_feature,train_label)
test_pred = clf.predict(test_feature)
print("test_pred:",Counter(test_pred), " test_true:",Counter(test_label))
print("test acc =",metrics.accuracy_score(test_label, test_pred))
print("class:", class_names)
print(confusion_matrix(test_pred,test_label))
print(classification_report(test_pred,test_label, target_names=class_names))

test_pred: Counter({6: 262})  test_true: Counter({6: 127, 0: 88, 5: 29, 3: 7, 1: 6, 2: 3, 4: 2})
test acc = 0.4847328244274809
class: ['quick consensus building', 'conflict-oriented consensus building', 'summary', 'integration-oriented consensus building', 'blank', 'elicitation', 'externalization']
[[  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [ 88   6   3   7   2  29 127]]
                                         precision    recall  f1-score   support

               quick consensus building       0.00      0.00      0.00         0
   conflict-oriented consensus building       0.00      0.00      0.00         0
                                summary       0.00      0.00      0.00         0
integration-oriented consensus building       0.00      0.00      0.00         0
                                  blank       0.00      0.00      0.00      

  'recall', 'true', average, warn_for)


In [7]:
with open('data/Social_FA_id_to_lb.pickle', mode='rb') as f:
    id_to_code = pickle.load(f)

x = np.asarray(
    [" ".join(i.strip().split()[2:]) for i in open("data/Social_FA_edu_data.txt").readlines()])
t = np.asarray(
    [l.strip().split()[:2] for l in open("data/Social_FA_edu_data.txt").readlines()], dtype=np.int64)
train = np.arange(len(x)) % 10 != 0
print("traindata", len(x[train]))
print("test", len(x[~train]))
count_vectorizer = CountVectorizer()
vectorizer = HashingVectorizer(
    non_negative=True, ngram_range=(1, 2), norm=u'l2')
dat = vectorizer.fit_transform(x.tolist())

clf = svm.SVC(kernel='linear', probability=True)  #0.890063424947
#clf = svm.SVC(kernel='poly', probability=True) #0.547568710359
#clf = svm.SVC(kernel='sigmoid', probability=True) #0.547568710359
clf.fit(dat[train], t[train][:, 0])

pred = clf.predict(dat[~train])
label = t[~train][:, 0]
print(Counter(pred), Counter(label))
cf = confusion_matrix(pred, label)
print(cf)
#print(np.mean(label == pred))
class_names = list(id_to_code.values())
print(class_names)
print(classification_report(pred,label, target_names=class_names))

traindata 2357
test 262




Counter({6: 154, 0: 86, 5: 22}) Counter({6: 127, 0: 88, 5: 29, 3: 7, 1: 6, 2: 3, 4: 2})
[[ 71   1   1   3   0   1   9]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0]
 [  1   0   0   0   0  16   5]
 [ 16   5   2   4   2  12 113]]
['quick consensus building', 'conflict-oriented consensus building', 'summary', 'integration-oriented consensus building', 'blank', 'elicitation', 'externalization']
                                         precision    recall  f1-score   support

               quick consensus building       0.81      0.83      0.82        86
   conflict-oriented consensus building       0.00      0.00      0.00         0
                                summary       0.00      0.00      0.00         0
integration-oriented consensus building       0.00      0.00      0.00         0
                                  blank       0.00      0.00      0.00         0
                            elicitation  

  'recall', 'true', average, warn_for)
