In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame(columns=["content", "label"])

In [3]:
import os

In [4]:
encode = {}
index = 0
i = 0

for file in os.listdir("."):
    if '.txt' not in file:
        continue
    label = file.split("_")[0]
    encode[label] = index
    
    with open(file, 'r') as f:
        for line in f.readlines():
            df.loc[i] = [line.strip(" \n\r\t"), index]
            i += 1
            
    index += 1

In [5]:
len(df)

3656

In [6]:
df['label'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=object)

In [7]:
df.head()

Unnamed: 0,content,label
0,"VNU, UIT, Dept Comp Engn, Ho Chi Minh City, Vi...",0
1,"Vietnam Natl Univ HCMC, Univ Sci, Dept Optimiz...",0
2,"Univ Sci, Lab Adv Mat, VNU HCM, Ho Chi Minh Ci...",0
3,"Vietnam Natl Univ Ho Chi Minh City, Univ Sci, ...",0
4,"Viet Nam Natl Univ Ho Chi Minh City, Linh Trun...",0


In [8]:
df = df.sample(frac=1).reset_index(drop=True)

In [9]:
df.head()

Unnamed: 0,content,label
0,"Ho Chi Minh City Int Univ, Vietnam Natl Univ H...",0
1,"Vietnam Acad Sci & Technol, Ctr Informat & Com...",1
2,"Vietnam Natl Univ Ho Chi Minh City, Univ Nat S...",0
3,"Vietnam Natl Univ, Ctr Innovat Mat & Architect...",0
4,"Hanoi Univ Sci & Technol, MICA Inst, CNRS, UMI...",8


In [10]:
n = int(len(df) * 0.3)

In [11]:
df['content'] = df['content'].apply(lambda s: s.lower())
df['num_words'] = df['content'].apply(lambda s: len(s.split()))
df['num_unique_words'] = df['content'].apply(lambda s: len(set(w for w in s.split())))
df['words_vs_unique'] = df['num_unique_words'] / (df['num_words']+1) * 100

In [12]:
train = df[n:]

In [13]:
test = df[:n]

In [14]:
len(train) + len(test)

3656

In [15]:
train.to_csv("train.csv", index=False)

In [16]:
test.to_csv("test.csv", index=False)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [18]:
tfidf = TfidfVectorizer(
    min_df = 3, 
    max_df = 0.5, 
    ngram_range=(1,3),
    use_idf=True,
    sublinear_tf=True
)

In [19]:
X_train_tfidf = tfidf.fit_transform(train['content'])
X_test_tfidf = tfidf.transform(test['content'])

In [20]:
X_train_tfidf.shape

(2560, 3339)

In [21]:
EXCLUED_COLS = ['content', 'label']
static_cols = [c for c in train.columns if not c in EXCLUED_COLS]
X_train_static = train[static_cols].values
X_test_static = test[static_cols].values

In [22]:
train[static_cols].head()

Unnamed: 0,num_words,num_unique_words,words_vs_unique
1096,16,14,82.352941
1097,16,16,94.117647
1098,8,8,88.888889
1099,10,10,90.909091
1100,16,14,82.352941


In [23]:
from scipy.sparse import hstack, csr_matrix, vstack

In [24]:
X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()

In [25]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [26]:
clfs = {
    "RF": RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),
    "GB": GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=5, max_features=0.7, random_state=111),
    "LR": LogisticRegression(penalty='l2', C=1.0),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(C=5, gamma=0.5)
}

In [27]:
for name, clf in clfs.items():
    print(name)
    y_true = train['label'].astype('int')
    clf.fit(X_train, y_true)
    y_pred = clf.predict(X_train)
    
    print("F1", f1_score(y_true, y_pred, average="micro"))
    print("CM", confusion_matrix(y_true, y_pred))
    print("ACC", accuracy_score(y_true, y_pred))
    
    y_true = test['label'].astype('int')
    y_pred = clf.predict(X_test)
    
    print("F1", f1_score(y_true, y_pred, average="micro"))
    print("CM", confusion_matrix(y_true, y_pred))
    print("ACC", accuracy_score(y_true, y_pred))

RF
F1 0.965234375
CM [[461   0   0   0   0   0   0   0   0   0]
 [ 16 572   0   0   0   0   0   0   0  14]
 [  1   0   0   0   0   0   0   0   0  15]
 [  1   0   0  52   0   0   0   0   0   2]
 [ 10   0   0   0 144   0   0   0   0   2]
 [  1   0   0   0   0 239   0   0   0   0]
 [  7   0   0   0   0   0 106   0   0   1]
 [  0   0   0   0   0   0   0 195   0   1]
 [  0   0   0   0   0   0   0   0 260  10]
 [  8   0   0   0   0   0   0   0   0 442]]
ACC 0.965234375
F1 0.947992700729927
CM [[219   0   0   0   0   0   0   0   0   0]
 [ 12 225   0   0   0   0   0   0   1   3]
 [  4   0   0   0   0   0   0   0   0   3]
 [  0   0   0  20   0   0   0   0   0   4]
 [ 10   0   0   0  68   0   0   0   0   0]
 [  0   0   0   0   0 103   0   0   0   0]
 [  6   0   0   0   0   0  37   0   0   0]
 [  0   0   0   0   0   0   0  66   0   0]
 [  1   0   0   0   0   0   0   0 116   9]
 [  2   1   0   0   0   0   0   0   1 185]]
ACC 0.947992700729927
GB
F1 0.99921875
CM [[461   0   0   0   0   0   0   0  



F1 0.993359375
CM [[461   0   0   0   0   0   0   0   0   0]
 [  2 599   0   0   0   0   0   0   0   1]
 [  0   0  11   0   0   0   0   0   0   5]
 [  0   0   0  53   0   1   0   0   0   1]
 [  0   0   0   0 154   0   0   0   0   2]
 [  0   0   0   0   0 240   0   0   0   0]
 [  0   0   0   0   0   0 114   0   0   0]
 [  0   0   0   0   0   0   0 196   0   0]
 [  0   0   0   0   0   0   0   0 266   4]
 [  1   0   0   0   0   0   0   0   0 449]]
ACC 0.993359375
F1 0.9890510948905109
CM [[219   0   0   0   0   0   0   0   0   0]
 [  0 241   0   0   0   0   0   0   0   0]
 [  0   1   1   0   0   0   0   0   0   5]
 [  0   0   0  20   0   0   0   0   0   4]
 [  0   0   0   0  78   0   0   0   0   0]
 [  0   0   0   0   0 103   0   0   0   0]
 [  0   0   0   0   0   0  43   0   0   0]
 [  0   0   0   0   0   0   0  66   0   0]
 [  1   0   0   0   0   0   0   0 124   1]
 [  0   0   0   0   0   0   0   0   0 189]]
ACC 0.9890510948905109
KNN
F1 0.883984375
CM [[429   8   0   1   1  17   0   1 

In [28]:
models=[ 
    ######## First level ########
    [
        RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),        
        ExtraTreesClassifier (n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),
        GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=5, max_features=0.7, random_state=111),
        LogisticRegression(penalty='l2', C=1.0),
    ],
    ######## Second level ########
    [
        RandomForestClassifier (n_estimators=200, min_samples_leaf=2, max_depth=30, max_features=0.7, random_state=421, n_jobs=-1)
    ]
]

In [29]:
from pystacknet.pystacknet import StackNetClassifier

model = StackNetClassifier(
    models, metric="accuracy", 
    folds=5,
    restacking=False, 
    use_retraining=True, 
    use_proba=True, 
    random_state=12345, n_jobs=1, verbose=1
)

y_train = train['label'].astype('int')
model.fit(X_train, y_train)
preds=model.predict_proba(X_test)

Input Dimensionality 3342 at Level 0 
4 models included in Level 0 




Level 0, fold 1/5 , model 0 , accuracy===0.955078 
Level 0, fold 1/5 , model 1 , accuracy===0.896484 
Level 0, fold 1/5 , model 2 , accuracy===0.994141 
Level 0, fold 1/5 , model 3 , accuracy===0.992188 




Level 0, fold 2/5 , model 0 , accuracy===0.937500 
Level 0, fold 2/5 , model 1 , accuracy===0.896484 
Level 0, fold 2/5 , model 2 , accuracy===0.978516 
Level 0, fold 2/5 , model 3 , accuracy===0.984375 




Level 0, fold 3/5 , model 0 , accuracy===0.970703 
Level 0, fold 3/5 , model 1 , accuracy===0.900391 
Level 0, fold 3/5 , model 2 , accuracy===0.994141 
Level 0, fold 3/5 , model 3 , accuracy===0.988281 




Level 0, fold 4/5 , model 0 , accuracy===0.970703 
Level 0, fold 4/5 , model 1 , accuracy===0.908203 
Level 0, fold 4/5 , model 2 , accuracy===0.996094 
Level 0, fold 4/5 , model 3 , accuracy===0.992188 




Level 0, fold 5/5 , model 0 , accuracy===0.949219 
Level 0, fold 5/5 , model 1 , accuracy===0.902344 
Level 0, fold 5/5 , model 2 , accuracy===0.984375 
Level 0, fold 5/5 , model 3 , accuracy===0.976562 
Level 0, model 0 , accuracy===0.956641 
Level 0, model 1 , accuracy===0.900781 
Level 0, model 2 , accuracy===0.989453 
Level 0, model 3 , accuracy===0.986719 




Output dimensionality of level 0 is 40 
 level 0 lasted 138.341816 seconds 
Input Dimensionality 40 at Level 1 
1 models included in Level 1 
Level 1, fold 1/5 , model 0 , accuracy===0.992188 
Level 1, fold 2/5 , model 0 , accuracy===0.986328 
Level 1, fold 3/5 , model 0 , accuracy===0.994141 
Level 1, fold 4/5 , model 0 , accuracy===0.996094 
Level 1, fold 5/5 , model 0 , accuracy===0.984375 
Level 1, model 0 , accuracy===0.990625 
Output dimensionality of level 1 is 10 
 level 1 lasted 8.385583 seconds 
 fit() lasted 146.727935 seconds 
1 estimators included in Level 0 
1 estimators included in Level 1 


In [30]:
pred_cls = np.argmax(preds, axis=1)
y_true = test['label'].astype('int')

In [31]:
print("F1", f1_score(y_true, pred_cls, average="micro"))
print("CM", confusion_matrix(y_true, pred_cls))
print("ACC", accuracy_score(y_true, pred_cls))

F1 0.9945255474452555
CM [[219   0   0   0   0   0   0   0   0   0]
 [  0 241   0   0   0   0   0   0   0   0]
 [  0   0   7   0   0   0   0   0   0   0]
 [  0   1   0  20   0   0   0   0   1   2]
 [  0   0   0   0  78   0   0   0   0   0]
 [  0   0   0   0   0 103   0   0   0   0]
 [  0   0   0   0   0   0  43   0   0   0]
 [  0   0   0   0   0   0   0  66   0   0]
 [  0   0   0   0   1   0   0   0 125   0]
 [  0   0   0   0   0   0   0   0   1 188]]
ACC 0.9945255474452555


In [32]:
final_model = clfs["GB"]

In [33]:
import pickle

In [34]:
with open('vcgate_gradient_boosting.pkl', 'wb') as f:
    pickle.dump(final_model, f)

In [35]:
with open('vcgate_tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [36]:
encode_rev = {}
for k, v in encode.items():
    encode_rev[v] = k

In [37]:
sen = "vnu xuan thuy, ha noi, vietnam".lower()

In [38]:
sen_tfidf = tfidf.transform([sen])

In [39]:
num_words = len(sen.split())
num_unique_words = len(set(w for w in sen.split()))
words_vs_unique = num_unique_words / (num_words+1) * 100

In [40]:
sen_tfidf

<1x3339 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [41]:
feat = hstack([sen_tfidf, csr_matrix([num_words, num_unique_words, words_vs_unique])]).tocsr()

In [42]:
label = final_model.predict(feat)

In [43]:
encode_rev[label[0]]

'vnu'