In [1]:
import pandas as pd

# Load the labeled comments and split them by the value of the LABEL column:
#   LABEL > 0   -> rows used for training
#   LABEL == 0  -> first prediction set
#   LABEL == -1 -> second prediction set
dat = pd.read_csv('../data/comments_labeled.csv')
dat_train = dat[dat['LABEL'].gt(0)]
dat_test1 = dat[dat['LABEL'].eq(0)]
dat_test2 = dat[dat['LABEL'].eq(-1)]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words vocabulary from every comment in `dat` (all label
# groups), so the three splits are encoded in the same feature space.
cv = CountVectorizer()
cv.fit(dat['text'])

# Encode each split with the shared vocabulary, in train/test1/test2 order.
X_train, X_test1, X_test2 = (
    cv.transform(frame['text']) for frame in (dat_train, dat_test1, dat_test2)
)

# Only the training split carries target labels.
y_train = dat_train['LABEL']

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

# Tree 1: entropy criterion, no class weighting.
cls1 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
# NOTE: predictions are made on the training data, so the numbers below are
# training-set scores, not held-out scores.
y_pred1 = cls1.fit(X_train, y_train).predict(X_train)

print(accuracy_score(y_train, y_pred1))
print(f1_score(y_train, y_pred1, average=None))  # one F1 value per class
print(confusion_matrix(y_train, y_pred1))

0.8900286686973661
[0.7774252  0.83621809 0.80441601 0.95135908 0.96567627]
[[ 8575     0     0     0     0]
 [ 1514  3865     0     0     0]
 [ 1559     0  3206     0     0]
 [  408     0     0  3990     0]
 [ 1429     0     0     0 20102]]


In [4]:
# Tree 2: gini criterion, no class weighting.
cls2 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
# NOTE: predictions are made on the training data, so the numbers below are
# training-set scores, not held-out scores.
y_pred2 = cls2.fit(X_train, y_train).predict(X_train)

print(accuracy_score(y_train, y_pred2))
print(f1_score(y_train, y_pred2, average=None))  # one F1 value per class
print(confusion_matrix(y_train, y_pred2))

0.8921564235800036
[0.78078762 0.89763295 0.80441601 0.89608434 0.96567627]
[[ 8575     0     0     0     0]
 [  999  4380     0     0     0]
 [ 1559     0  3206     0     0]
 [  828     0     0  3570     0]
 [ 1429     0     0     0 20102]]


In [5]:
# Tree 3: entropy criterion with class_weight='balanced'.
cls3 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=10,
    class_weight='balanced',
    random_state=0,
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
# NOTE: predictions are made on the training data, so the numbers below are
# training-set scores, not held-out scores.
y_pred3 = cls3.fit(X_train, y_train).predict(X_train)

print(f1_score(y_train, y_pred3, average=None))  # one F1 value per class
print(confusion_matrix(y_train, y_pred3))

[0.70835571 0.89763295 0.80441601 0.98570111 0.88679782]
[[ 8575     0     0     0     0]
 [  999  4380     0     0     0]
 [ 1559     0  3206     0     0]
 [  124     0     0  4274     0]
 [ 4379     0     0     0 17152]]


In [6]:
# Tree 4: gini criterion with class_weight='balanced'.
cls4 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=10,
    class_weight='balanced',
    random_state=0,
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
# NOTE: predictions are made on the training data, so the numbers below are
# training-set scores, not held-out scores.
y_pred4 = cls4.fit(X_train, y_train).predict(X_train)

print(f1_score(y_train, y_pred4, average=None))  # one F1 value per class
print(confusion_matrix(y_train, y_pred4))

[0.75282033 0.89763295 0.80441601 0.95135908 0.93402975]
[[ 8575     0     0     0     0]
 [  999  4380     0     0     0]
 [ 1559     0  3206     0     0]
 [  408     0     0  3990     0]
 [ 2665     0     0     0 18866]]


In [7]:
# Boolean mask over the training rows: True only where all four trees
# reproduced the given label.
y_pred_all_true = y_train == y_pred1
for other_pred in (y_pred2, y_pred3, y_pred4):
    y_pred_all_true = y_pred_all_true & (y_train == other_pred)

# Restrict the training features and labels to the rows every tree got right.
X_train_all_true = X_train[y_pred_all_true]
y_train_all_true = y_train[y_pred_all_true]

In [8]:
# Final tree: gini criterion, trained only on the rows that all four earlier
# trees classified correctly. No min_samples_leaf is passed here.
cls_final = DecisionTreeClassifier(
    criterion='gini',
    max_depth=15,
    min_samples_split=10,
    random_state=0,
)

# fit() returns the estimator itself; predictions are made on the same rows
# the tree was fitted on, so the scores below are training-set scores.
y_pred_final = cls_final.fit(X_train_all_true, y_train_all_true).predict(X_train_all_true)

print(f1_score(y_train_all_true, y_pred_final, average=None))  # one F1 value per class
print(confusion_matrix(y_train_all_true, y_pred_final))

[1. 1. 1. 1. 1.]
[[ 8575     0     0     0     0]
 [    0  3865     0     0     0]
 [    0     0  3206     0     0]
 [    0     0     0  3570     0]
 [    0     0     0     0 17152]]


In [9]:
from collections import Counter

# Predict a class for every LABEL == 0 comment with the final tree.
y_pred_test1 = cls_final.predict(X=X_test1)

# Show how many comments were assigned to each predicted class.
Counter(y_pred_test1)

Counter({1: 92544})

In [10]:
from collections import Counter

# Predict a class for every LABEL == -1 comment with the final tree.
y_pred_test2 = cls_final.predict(X=X_test2)

# Show how many comments were assigned to each predicted class.
Counter(y_pred_test2)

Counter({3: 828, 2: 1605, 4: 543, 5: 1273, 1: 1045})

In [11]:
# Attach the final tree's predictions to the LABEL == -1 rows as the first
# column, 'DT_LABEL'.
# drop(..., errors='ignore') returns a new frame, which (a) makes this cell
# safe to re-run -- insert() raises ValueError if the column already exists --
# and (b) avoids mutating a frame that was sliced out of `dat`.
dat_test2 = dat_test2.drop(columns='DT_LABEL', errors='ignore')
dat_test2.insert(0, 'DT_LABEL', value=y_pred_test2)

In [12]:
# Keep every row except LABEL == 0. Take an explicit copy so the assignment
# below writes to an independent frame rather than to a slice of `dat`
# (writing to the un-copied slice raises SettingWithCopyWarning).
dat_final = dat[dat['LABEL'] != 0].copy()

# Replace the LABEL of the LABEL == -1 rows with the tree's predictions in a
# single vectorized assignment. y_pred_test2 is in the same row order as
# dat_test2, so the values line up with dat_test2.index positionally.
# Assumes the index labels are unique (true for the default index produced by
# read_csv).
dat_final.loc[dat_test2.index, 'LABEL'] = y_pred_test2

dat_final.to_csv('../data/comments_dt.csv', index=False)
dat_test2.to_csv('comments_dt_filled.csv', index=False)

# Human-readable name for each LABEL value; used as class names when the
# trees are rendered.
label_names = {
    0: 'None',
    1: 'GPU',
    2: 'CPU',
    3: 'Phone',
    4: 'TechCompany',
    5: 'Channel',
    -1: 'MoreThanOneClass'
}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [13]:
import pydotplus
from sklearn.tree import export_graphviz

def draw_model(tree, filename):
    """Render a fitted decision tree to a PNG image.

    The tree is exported to Graphviz DOT source, with feature names taken
    from the module-level vectorizer ``cv`` and class names looked up in the
    module-level ``label_names`` mapping, then written out with pydotplus.

    Parameters
    ----------
    tree : fitted decision tree estimator
        Must expose ``classes_``; every class value must be a key of
        ``label_names``.
    filename : str
        Path of the PNG file to write.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on older versions.
    if hasattr(cv, 'get_feature_names_out'):
        # Convert the returned array to a plain list of str.
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()

    dot_data = export_graphviz(
        tree,
        out_file=None,  # return the DOT source as a string
        feature_names=feature_names,
        class_names=[label_names[x] for x in tree.classes_],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png(filename)

# Write one PNG per fitted tree, in the order the trees were trained.
for model, png_name in (
    (cls1, 'cls_e.png'),
    (cls2, 'cls_g.png'),
    (cls3, 'cls_e_bal.png'),
    (cls4, 'cls_g_bal.png'),
    (cls_final, 'cls_final.png'),
):
    draw_model(model, png_name)