In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame(columns=["content", "label"])

In [3]:
import os

In [4]:
encode = {}
index = 0
i = 0

for file in os.listdir("."):
    if '.txt' not in file:
        continue
    label = file.split("_")[0]
    encode[label] = index
    
    with open(file, 'r') as f:
        for line in f.readlines():
            df.loc[i] = [line.strip(" \n\r\t"), index]
            i += 1
            
    index += 1

In [5]:
len(df)

3656

In [6]:
df['label'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=object)

In [7]:
df.head()

Unnamed: 0,content,label
0,"VNU, UIT, Dept Comp Engn, Ho Chi Minh City, Vi...",0
1,"Vietnam Natl Univ HCMC, Univ Sci, Dept Optimiz...",0
2,"Univ Sci, Lab Adv Mat, VNU HCM, Ho Chi Minh Ci...",0
3,"Vietnam Natl Univ Ho Chi Minh City, Univ Sci, ...",0
4,"Viet Nam Natl Univ Ho Chi Minh City, Linh Trun...",0


In [8]:
df = df.sample(frac=1).reset_index(drop=True)

In [9]:
df.head()

Unnamed: 0,content,label
0,"Hue Univ Med & Pharm, Hue, Vietnam",7
1,"Nam, NH; Minh, CV, Vietnam Acad Sci & Technol,...",1
2,"Vietnam Natl Univ Ho Chi Minh City VNU HCM, Sc...",0
3,"Duy Tan Univ, Dept Informat Technol, Da Nang, ...",6
4,"VAST, Inst Environm Technol, Hanoi, Vietnam",1


In [10]:
n = int(len(df) * 0.3)

In [11]:
df['content'] = df['content'].apply(lambda s: s.lower())
df['num_words'] = df['content'].apply(lambda s: len(s.split()))
df['num_unique_words'] = df['content'].apply(lambda s: len(set(w for w in s.split())))
df['words_vs_unique'] = df['num_unique_words'] / (df['num_words']+1) * 100

In [12]:
train = df[n:]

In [13]:
test = df[:n]

In [14]:
len(train) + len(test)

3656

In [15]:
train.to_csv("train.csv", index=False)

In [16]:
test.to_csv("test.csv", index=False)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [18]:
tfidf = TfidfVectorizer(
    min_df = 3, 
    max_df = 0.5, 
    ngram_range=(1,3),
    use_idf=True,
    sublinear_tf=True
)

In [19]:
X_train_tfidf = tfidf.fit_transform(train['content'])
X_test_tfidf = tfidf.transform(test['content'])

In [20]:
X_train_tfidf.shape

(2560, 3278)

In [21]:
EXCLUED_COLS = ['content', 'label']
static_cols = [c for c in train.columns if not c in EXCLUED_COLS]
X_train_static = train[static_cols].values
X_test_static = test[static_cols].values

In [22]:
train[static_cols].head()

Unnamed: 0,num_words,num_unique_words,words_vs_unique
1096,13,12,85.714286
1097,13,13,92.857143
1098,16,15,88.235294
1099,23,19,79.166667
1100,12,12,92.307692


In [23]:
from scipy.sparse import hstack, csr_matrix, vstack

In [24]:
X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()

In [25]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [26]:
clfs = {
    "RF": RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),
    "GB": GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=5, max_features=0.7, random_state=111),
    "LR": LogisticRegression(penalty='l2', C=1.0),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(C=5, gamma=0.5)
}

In [27]:
for name, clf in clfs.items():
    print(name)
    y_true = train['label'].astype('int')
    clf.fit(X_train, y_true)
    y_pred = clf.predict(X_train)
    
    print("F1", f1_score(y_true, y_pred, average="micro"))
    print("CM", confusion_matrix(y_true, y_pred))
    print("ACC", accuracy_score(y_true, y_pred))
    
    y_true = test['label'].astype('int')
    y_pred = clf.predict(X_test)
    
    print("F1", f1_score(y_true, y_pred, average="micro"))
    print("CM", confusion_matrix(y_true, y_pred))
    print("ACC", accuracy_score(y_true, y_pred))

RF
F1 0.957421875
CM [[471   0   0   0   0   0   0   0   0   0]
 [ 22 550   0   0   0   0   0   0   1  20]
 [  2   0   0   0   0   0   0   0   0  14]
 [  1   0   0  53   0   0   0   0   0   5]
 [ 11   0   0   0 159   0   0   0   0   0]
 [  1   0   0   0   0 241   0   0   0   0]
 [ 10   0   0   0   0   0  95   0   0   1]
 [  0   0   0   0   0   0   0 185   0   1]
 [  1   0   0   0   0   0   0   0 260  11]
 [  7   1   0   0   0   0   0   0   0 437]]
ACC 0.957421875
F1 0.948905109489051
CM [[208   1   0   0   0   0   0   0   0   0]
 [  6 230   0   0   0   0   0   0   1  13]
 [  3   0   0   0   0   0   0   0   0   4]
 [  0   0   0  18   0   0   0   0   0   2]
 [  9   0   0   0  53   0   0   0   0   2]
 [  0   0   0   0   0 101   0   0   0   0]
 [  3   0   0   0   0   0  48   0   0   0]
 [  0   0   0   0   0   0   0  76   0   0]
 [  0   0   0   0   0   0   0   0 116   8]
 [  3   0   0   0   0   0   0   0   1 190]]
ACC 0.948905109489051
GB
F1 0.99921875
CM [[470   0   0   0   0   0   0   0  



F1 0.99375
CM [[471   0   0   0   0   0   0   0   0   0]
 [  2 589   0   0   0   0   0   0   0   2]
 [  0   0  10   0   0   0   0   0   0   6]
 [  0   0   0  56   0   0   0   0   0   3]
 [  0   0   0   0 170   0   0   0   0   0]
 [  0   0   0   0   0 242   0   0   0   0]
 [  0   0   0   0   0   0 106   0   0   0]
 [  0   0   0   0   0   0   0 186   0   0]
 [  0   0   0   0   0   0   0   0 270   2]
 [  1   0   0   0   0   0   0   0   0 444]]
ACC 0.99375
F1 0.9917883211678832
CM [[209   0   0   0   0   0   0   0   0   0]
 [  0 250   0   0   0   0   0   0   0   0]
 [  0   0   3   0   0   0   0   0   0   4]
 [  0   0   0  18   0   1   0   0   0   1]
 [  0   0   0   0  62   0   0   0   0   2]
 [  0   0   0   0   0 101   0   0   0   0]
 [  0   0   0   0   0   0  51   0   0   0]
 [  0   0   0   0   0   0   0  76   0   0]
 [  0   0   0   0   0   0   0   0 123   1]
 [  0   0   0   0   0   0   0   0   0 194]]
ACC 0.9917883211678832
KNN
F1 0.8843749999999999
CM [[433  10   0   1   0  21   0   1  

In [28]:
models=[ 
    ######## First level ########
    [
        RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),        
        ExtraTreesClassifier (n_estimators=200, min_samples_leaf=2, max_depth=7, max_features=0.7, random_state=42, n_jobs=-1),
        GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=5, max_features=0.7, random_state=111),
        LogisticRegression(penalty='l2', C=1.0),
    ],
    ######## Second level ########
    [
        RandomForestClassifier (n_estimators=200, min_samples_leaf=2, max_depth=30, max_features=0.7, random_state=421, n_jobs=-1)
    ]
]

In [29]:
from pystacknet.pystacknet import StackNetClassifier

model = StackNetClassifier(
    models, metric="accuracy", 
    folds=5,
    restacking=False, 
    use_retraining=True, 
    use_proba=True, 
    random_state=12345, n_jobs=1, verbose=1
)

y_train = train['label'].astype('int')
model.fit(X_train, y_train)
preds=model.predict_proba(X_test)

Input Dimensionality 3342 at Level 0 
4 models included in Level 0 




Level 0, fold 1/5 , model 0 , accuracy===0.955078 
Level 0, fold 1/5 , model 1 , accuracy===0.896484 
Level 0, fold 1/5 , model 2 , accuracy===0.994141 
Level 0, fold 1/5 , model 3 , accuracy===0.992188 




Level 0, fold 2/5 , model 0 , accuracy===0.937500 
Level 0, fold 2/5 , model 1 , accuracy===0.896484 
Level 0, fold 2/5 , model 2 , accuracy===0.978516 
Level 0, fold 2/5 , model 3 , accuracy===0.984375 




Level 0, fold 3/5 , model 0 , accuracy===0.970703 
Level 0, fold 3/5 , model 1 , accuracy===0.900391 
Level 0, fold 3/5 , model 2 , accuracy===0.994141 
Level 0, fold 3/5 , model 3 , accuracy===0.988281 




Level 0, fold 4/5 , model 0 , accuracy===0.970703 
Level 0, fold 4/5 , model 1 , accuracy===0.908203 
Level 0, fold 4/5 , model 2 , accuracy===0.996094 
Level 0, fold 4/5 , model 3 , accuracy===0.992188 




Level 0, fold 5/5 , model 0 , accuracy===0.949219 
Level 0, fold 5/5 , model 1 , accuracy===0.902344 
Level 0, fold 5/5 , model 2 , accuracy===0.984375 
Level 0, fold 5/5 , model 3 , accuracy===0.976562 
Level 0, model 0 , accuracy===0.956641 
Level 0, model 1 , accuracy===0.900781 
Level 0, model 2 , accuracy===0.989453 
Level 0, model 3 , accuracy===0.986719 




Output dimensionality of level 0 is 40 
 level 0 lasted 138.341816 seconds 
Input Dimensionality 40 at Level 1 
1 models included in Level 1 
Level 1, fold 1/5 , model 0 , accuracy===0.992188 
Level 1, fold 2/5 , model 0 , accuracy===0.986328 
Level 1, fold 3/5 , model 0 , accuracy===0.994141 
Level 1, fold 4/5 , model 0 , accuracy===0.996094 
Level 1, fold 5/5 , model 0 , accuracy===0.984375 
Level 1, model 0 , accuracy===0.990625 
Output dimensionality of level 1 is 10 
 level 1 lasted 8.385583 seconds 
 fit() lasted 146.727935 seconds 
1 estimators included in Level 0 
1 estimators included in Level 1 


In [30]:
pred_cls = np.argmax(preds, axis=1)
y_true = test['label'].astype('int')

In [31]:
print("F1", f1_score(y_true, pred_cls, average="micro"))
print("CM", confusion_matrix(y_true, pred_cls))
print("ACC", accuracy_score(y_true, pred_cls))

F1 0.9945255474452555
CM [[219   0   0   0   0   0   0   0   0   0]
 [  0 241   0   0   0   0   0   0   0   0]
 [  0   0   7   0   0   0   0   0   0   0]
 [  0   1   0  20   0   0   0   0   1   2]
 [  0   0   0   0  78   0   0   0   0   0]
 [  0   0   0   0   0 103   0   0   0   0]
 [  0   0   0   0   0   0  43   0   0   0]
 [  0   0   0   0   0   0   0  66   0   0]
 [  0   0   0   0   1   0   0   0 125   0]
 [  0   0   0   0   0   0   0   0   1 188]]
ACC 0.9945255474452555


In [28]:
final_model = clfs["GB"]

In [29]:
import pickle

In [30]:
with open('vcgate_gradient_boosting.pkl', 'wb') as f:
    pickle.dump(final_model, f)

In [31]:
with open('vcgate_tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [32]:
encode_rev = {}
for k, v in encode.items():
    encode_rev[v] = k

In [40]:
sen = "University of Engineering and Technology".lower()

In [41]:
sen_tfidf = tfidf.transform([sen])

In [42]:
num_words = len(sen.split())
num_unique_words = len(set(w for w in sen.split()))
words_vs_unique = num_unique_words / (num_words+1) * 100

In [43]:
sen_tfidf

<1x3278 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [44]:
feat = hstack([sen_tfidf, csr_matrix([num_words, num_unique_words, words_vs_unique])]).tocsr()

In [51]:
label = clfs["GB"].predict(feat)

In [52]:
encode_rev[label[0]]

'vast'

In [53]:
label

array([1])

In [57]:
train[train['label'] == 9]

Unnamed: 0,content,label,num_words,num_unique_words,words_vs_unique
1097,"hanoi natl univ, dept phys, coll nat sci, 334 ...",9,13,13,92.857143
1100,"vnu univ engn & technol uet, vnu, 144 xuan thu...",9,12,12,92.307692
1101,"vietnam natl univ, vnu univ sci, dept biochem ...",9,20,19,90.476190
1102,"vietnam natl univ, vnu univ engn & technol, fa...",9,18,15,78.947368
1109,"vietnam natl univ, univ sci, hanoi 100000, vie...",9,8,7,77.777778
1123,"vnu hanoi univ sci, fac chem, 19 le thanh tong...",9,12,12,92.307692
1124,"vietnam natl univ, vnu univ sci, fac phys, han...",9,10,9,81.818182
1137,"vnu univ engn & technol, fac engn phys & nanot...",9,15,13,81.250000
1153,"vnu univ sci, fac biol, natl key lab enzyme & ...",9,14,14,93.333333
1164,"vietnam natl univ, ifi, hanoi, vietnam",9,6,5,71.428571
