In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame(columns=["content", "label"])

In [3]:
import os

In [4]:
encode = {}
index = 0
i = 0

for file in os.listdir("."):
    if '.txt' not in file:
        continue
    label = file.split("_")[0]
    encode[label] = index
    
    with open(file, 'r') as f:
        for line in f.readlines():
            df.loc[i] = [line.strip(" \n\r\t"), index]
            i += 1
            
    index += 1

In [5]:
len(df)

3656

In [6]:
df['label'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=object)

In [7]:
df.head()

Unnamed: 0,content,label
0,"VNU, UIT, Dept Comp Engn, Ho Chi Minh City, Vi...",0
1,"Vietnam Natl Univ HCMC, Univ Sci, Dept Optimiz...",0
2,"Univ Sci, Lab Adv Mat, VNU HCM, Ho Chi Minh Ci...",0
3,"Vietnam Natl Univ Ho Chi Minh City, Univ Sci, ...",0
4,"Viet Nam Natl Univ Ho Chi Minh City, Linh Trun...",0


In [8]:
df = df.sample(frac=1).reset_index(drop=True)

In [9]:
df.head()

Unnamed: 0,content,label
0,"Hanoi Univ Sci & Technol, Sch Chem Engn, 1 Dai...",8
1,"Natl Univ Sci Ho Chi Minh City, Dept Organ Che...",0
2,"HCM Natl Univ, Ho Chi Minh City Univ Technol, ...",0
3,"Hue Univ, Fac Hospital & Tourism, 22 Lam Hoang...",7
4,"Vietnam Acad Sci & Technol, Southern Inst Ecol...",1


In [10]:
n = int(len(df) * 0.3)

In [11]:
df['num_words'] = df['content'].apply(lambda s: len(s.split()))
df['num_unique_words'] = df['content'].apply(lambda s: len(set(w for w in s.split())))
df['words_vs_unique'] = df['num_unique_words'] / (df['num_words']+1) * 100

In [12]:
train = df[n:]

In [13]:
test = df[:n]

In [14]:
len(train) + len(test)

3656

In [15]:
train.to_csv("train.csv", index=False)

In [16]:
test.to_csv("test.csv", index=False)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [18]:
tfidf = TfidfVectorizer(
    min_df = 3, 
    max_df = 0.5, 
    ngram_range=(1,3),
    use_idf=True,
    sublinear_tf=True
)

In [19]:
X_train_tfidf = tfidf.fit_transform(train['content'])
X_test_tfidf = tfidf.transform(test['content'])

In [20]:
X_train_tfidf.shape

(2560, 3302)

In [21]:
EXCLUED_COLS = ['content', 'label']
static_cols = [c for c in train.columns if not c in EXCLUED_COLS]
X_train_static = train[static_cols].values
X_test_static = test[static_cols].values

In [22]:
from scipy.sparse import hstack, csr_matrix, vstack

In [23]:
X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()

In [28]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [25]:
clfs = {
    "RF": RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),
    "GB": GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=5, max_features=0.7, random_state=111),
    "LR": LogisticRegression(penalty='l2', C=1.0),
}

In [26]:
for name, clf in clfs.items():
    print(name)
    y_true = train['label'].astype('int')
    clf.fit(X_train, y_true)
    y_pred = clf.predict(X_train)
    
    print("F1", f1_score(y_true, y_pred, average="micro"))
    print("CM", confusion_matrix(y_true, y_pred))
    print("ACC", accuracy_score(y_true, y_pred))
    
    y_true = test['label'].astype('int')
    y_pred = clf.predict(X_test)
    
    print("F1", f1_score(y_true, y_pred, average="micro"))
    print("CM", confusion_matrix(y_true, y_pred))
    print("ACC", accuracy_score(y_true, y_pred))

RF
F1 0.95546875
CM [[480   0   0   0   0   0   0   0   0   0]
 [ 18 549   0   0   0   0   0   0   1  22]
 [  3   0   0   0   0   0   0   0   0  13]
 [  1   0   0  54   0   0   0   0   0   2]
 [ 13   0   0   0 159   0   0   0   0   2]
 [  0   0   0   0   0 231   0   0   0   0]
 [ 10   0   0   0   0   0 106   0   0   1]
 [  0   0   0   0   0   0   0 175   0   1]
 [  0   0   0   0   0   0   0   0 257  18]
 [  8   0   0   0   0   0   0   0   1 435]]
ACC 0.95546875
F1 0.9434306569343066
CM [[199   1   0   0   0   0   0   0   0   0]
 [ 10 228   0   0   0   0   0   0   3  12]
 [  2   0   0   0   0   0   0   0   0   5]
 [  0   0   0  18   0   0   0   0   0   4]
 [  7   0   0   0  53   0   0   0   0   0]
 [  2   0   0   0   0 110   0   0   0   0]
 [  3   0   0   0   0   0  37   0   0   0]
 [  0   0   0   0   0   0   0  86   0   0]
 [  1   0   0   0   0   0   0   0 110  10]
 [  2   0   0   0   0   0   0   0   0 193]]
ACC 0.9434306569343066
GB
F1 0.999609375
CM [[479   0   0   0   0   0   0   0 



F1 0.992578125
CM [[480   0   0   0   0   0   0   0   0   0]
 [  2 586   0   0   0   0   0   0   0   2]
 [  0   0  10   0   0   0   0   0   0   6]
 [  0   0   0  55   0   0   0   0   1   1]
 [  0   0   0   0 172   0   0   0   0   2]
 [  0   0   0   0   0 231   0   0   0   0]
 [  0   0   0   0   0   0 117   0   0   0]
 [  0   0   0   0   0   0   0 176   0   0]
 [  0   0   0   0   0   0   0   0 271   4]
 [  1   0   0   0   0   0   0   0   0 443]]
ACC 0.992578125
F1 0.9881386861313869
CM [[200   0   0   0   0   0   0   0   0   0]
 [  1 252   0   0   0   0   0   0   0   0]
 [  0   0   4   0   0   0   0   0   0   3]
 [  0   0   0  18   0   1   0   0   0   3]
 [  0   0   0   0  60   0   0   0   0   0]
 [  1   0   0   0   0 111   0   0   0   0]
 [  0   0   0   0   0   0  40   0   0   0]
 [  0   0   0   0   0   0   0  86   0   0]
 [  1   0   0   0   0   0   0   0 117   3]
 [  0   0   0   0   0   0   0   0   0 195]]
ACC 0.9881386861313869


In [29]:
models=[ 
    ######## First level ########
    [
        RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),        
        ExtraTreesClassifier (n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),
        GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=5, max_features=0.7, random_state=111),
        LogisticRegression(penalty='l2', C=1.0),
    ],
    ######## Second level ########
    [
        RandomForestClassifier (n_estimators=200, min_samples_leaf=2, max_depth=30, max_features=0.7, random_state=421, n_jobs=-1)
    ]
]

In [32]:
from pystacknet.pystacknet import StackNetClassifier

model = StackNetClassifier(
    models, metric="accuracy", 
    folds=5,
    restacking=False, 
    use_retraining=True, 
    use_proba=True, 
    random_state=12345, n_jobs=1, verbose=1
)

y_train = train['label'].astype('int')
model.fit(X_train, y_train)
preds=model.predict_proba(X_test)

Input Dimensionality 3305 at Level 0 
4 models included in Level 0 




Level 0, fold 1/5 , model 0 , accuracy===0.947266 
Level 0, fold 1/5 , model 1 , accuracy===0.912109 
Level 0, fold 1/5 , model 2 , accuracy===0.984375 
Level 0, fold 1/5 , model 3 , accuracy===0.982422 




Level 0, fold 2/5 , model 0 , accuracy===0.947266 
Level 0, fold 2/5 , model 1 , accuracy===0.904297 
Level 0, fold 2/5 , model 2 , accuracy===0.986328 
Level 0, fold 2/5 , model 3 , accuracy===0.982422 




Level 0, fold 3/5 , model 0 , accuracy===0.957031 
Level 0, fold 3/5 , model 1 , accuracy===0.919922 
Level 0, fold 3/5 , model 2 , accuracy===0.984375 
Level 0, fold 3/5 , model 3 , accuracy===0.982422 




Level 0, fold 4/5 , model 0 , accuracy===0.958984 
Level 0, fold 4/5 , model 1 , accuracy===0.910156 
Level 0, fold 4/5 , model 2 , accuracy===0.992188 
Level 0, fold 4/5 , model 3 , accuracy===0.992188 




Level 0, fold 5/5 , model 0 , accuracy===0.957031 
Level 0, fold 5/5 , model 1 , accuracy===0.906250 
Level 0, fold 5/5 , model 2 , accuracy===0.994141 
Level 0, fold 5/5 , model 3 , accuracy===0.986328 
Level 0, model 0 , accuracy===0.953516 
Level 0, model 1 , accuracy===0.910547 
Level 0, model 2 , accuracy===0.988281 
Level 0, model 3 , accuracy===0.985156 




Output dimensionality of level 0 is 40 
 level 0 lasted 128.304944 seconds 
Input Dimensionality 40 at Level 1 
1 models included in Level 1 
Level 1, fold 1/5 , model 0 , accuracy===0.990234 
Level 1, fold 2/5 , model 0 , accuracy===0.994141 
Level 1, fold 3/5 , model 0 , accuracy===0.988281 
Level 1, fold 4/5 , model 0 , accuracy===0.994141 
Level 1, fold 5/5 , model 0 , accuracy===0.998047 
Level 1, model 0 , accuracy===0.992969 
Output dimensionality of level 1 is 10 
 level 1 lasted 7.630138 seconds 
 fit() lasted 135.936552 seconds 
1 estimators included in Level 0 
1 estimators included in Level 1 


In [35]:
pred_cls = np.argmax(preds, axis=1)
y_true = test['label'].astype('int')

In [36]:
print("F1", f1_score(y_true, pred_cls, average="micro"))
print("CM", confusion_matrix(y_true, pred_cls))
print("ACC", accuracy_score(y_true, pred_cls))

F1 0.9917883211678832
CM [[200   0   0   0   0   0   0   0   0   0]
 [  0 253   0   0   0   0   0   0   0   0]
 [  0   0   7   0   0   0   0   0   0   0]
 [  0   0   0  18   0   0   0   0   1   3]
 [  0   0   0   0  60   0   0   0   0   0]
 [  2   0   0   0   0 110   0   0   0   0]
 [  0   0   0   0   0   0  40   0   0   0]
 [  0   0   0   0   0   0   0  86   0   0]
 [  0   1   0   0   0   0   0   0 119   1]
 [  0   1   0   0   0   0   0   0   0 194]]
ACC 0.9917883211678832


In [37]:
final_model = clfs["GB"]

In [39]:
import pickle

In [40]:
with open('vcgate_gradient_boosting.pkl', 'wb') as f:
    pickle.dump(final_model, f)