In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import hstack, csr_matrix, vstack

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from sklearn.ensemble import *
from sklearn.linear_model import *

from tqdm import *

import matplotlib.pyplot as plt
import gc

import lightgbm as lgb
%matplotlib inline

Load train và test data

In [2]:
# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,content,label
0,train_000000,Dung dc sp tot cam on. shop Đóng gói sản phẩm ...,0
1,train_000001,Chất lượng sản phẩm tuyệt vời . Son mịn nhưng ...,0
2,train_000002,Chất lượng sản phẩm tuyệt vời nhưng k có hộp k...,0
3,train_000003,:(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọ...,1
4,train_000004,Lần trước mình mua áo gió màu hồng rất ok mà đ...,1


In [4]:
test_df.head()

Unnamed: 0,id,content
0,test_000000,Chưa dùng thử nên chưa biết.
1,test_000001,Không đáng tiềnVì ngay đợt sale nên mới mua nh...
2,test_000002,Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắc...
3,test_000003,Vải đẹp.phom oki luôn.quá ưng.
4,test_000004,Chuẩn hàng đóng gói đẹp.


In [5]:
df = pd.concat([train_df, test_df], axis=0)
# del train_df, test_df
# gc.collect()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [6]:
import re

regep = re.compile(r"[-()\"#/@;:<>{}`+=~|.!?,]_*&^%$")
renumber = re.compile(r"\d+")

def transform(df):
    df['content'] = df['content'].astype(str).fillna(' ')
    df['content'] = df['content'].apply(lambda x: str(x).lower())
    df['content'] = df['content'].apply(lambda x: renumber.sub('ddddd', x))
    df['content'] = df['content'].apply(lambda x: regep.sub('', x))
    df['content'] = df['content'].apply(lambda x: ' '.join([s for s in x.split(' ') if len(s) >= 3]))
    df['content'] = df['content'].astype(str).fillna(' ')
    return df

In [7]:
import emoji

def extract_emojis(str):
    return [c for c in str if c in emoji.UNICODE_EMOJI]

In [8]:
good_df = train_df[train_df['label'] == 0]
good_comment = good_df['content'].values
good_emoji = []
for c in good_comment:
    good_emoji += extract_emojis(c)

good_emoji = np.unique(np.asarray(good_emoji))

In [9]:
bad_df = train_df[train_df['label'] == 1]
bad_comment = bad_df['content'].values

bad_emoji = []
for c in bad_comment:
    bad_emoji += extract_emojis(c)

bad_emoji = np.unique(np.asarray(bad_emoji))

In [10]:
good_emoji

array(['↖', '↗', '☀', '☺', '♀', '♥', '✌', '✨', '❌', '❣', '❤', '⭐', '🆗',
       '🌝', '🌟', '🌧', '🌷', '🌸', '🌺', '🌼', '🍓', '🎈', '🎉', '🐅', '🐾', '👉',
       '👌', '👍', '👏', '💋', '💌', '💐', '💓', '💕', '💖', '💗', '💙', '💚', '💛',
       '💜', '💞', '💟', '💥', '💪', '💮', '💯', '💰', '📑', '🖤', '😀', '😁', '😂',
       '😃', '😄', '😅', '😆', '😇', '😉', '😊', '😋', '😌', '😍', '😎', '😑', '😓',
       '😔', '😖', '😗', '😘', '😙', '😚', '😛', '😜', '😝', '😞', '😟', '😡', '😢',
       '😣', '😥', '😩', '😪', '😫', '😬', '😭', '😯', '😰', '😱', '😲', '😳', '😻',
       '😿', '🙁', '🙂', '🙃', '🙄', '🙆', '🙌', '🤑', '🤔', '🤗', '🤙', '🤝', '🤣',
       '🤤', '\U0001f928', '\U0001f92a', '\U0001f92d'], dtype='<U1')

In [11]:
# Just remove "sad, bad" emoji :D
good_emoji_fix = [
    '↖', '↗', '☀', '☺', '♀', '♥', '✌', '✨', '❣', '❤', '⭐', '🆗',
       '🌝', '🌟', '🌧', '🌷', '🌸', '🌺', '🌼', '🍓', '🎈', '🎉', '🐅', '🐾', '👉',
       '👌', '👍', '👏', '💋', '💌', '💐', '💓', '💕', '💖', '💗', '💙', '💚', '💛',
       '💜', '💞', '💟', '💥', '💪', '💮', '💯', '💰', '📑', '🖤', '😀', '😁', '😂',
       '😃', '😄', '😅', '😆', '😇', '😉', '😊', '😋', '😌', '😍', '😎', '😑', '😓', '😔', 
    '😖', '😗', '😘', '😙', '😚', '😛', '😜', '😝', '😞', '😟', '😡', '😯', '😰', '😱', '😲', '😳', '😻', '🙂', '🙃', '🙄', '🙆', '🙌', '🤑', '🤔', '🤗',
]

In [12]:
bad_emoji

array(['☹', '✋', '❌', '❓', '❤', '⭐', '🎃', '👌', '👍', '👎', '👶', '💀', '💋',
       '😁', '😂', '😈', '😊', '😌', '😏', '😐', '😑', '😒', '😓', '😔', '😖', '😚',
       '😞', '😟', '😠', '😡', '😢', '😣', '😤', '😥', '😧', '😩', '😪', '😫', '😬',
       '😭', '😳', '😵', '😶', '🙁', '🙂', '🙄', '🤔', '🤚', '🤤'], dtype='<U1')

In [13]:
# Just remove "good" emoji :D
bad_emoji_fix = [
    '☹', '✋', '❌', '❓', '👎', '👶', '💀',
       '😐', '😑', '😒', '😓', '😔',
       '😞', '😟', '😠', '😡', '😢', '😣', '😤', '😥', '😧', '😩', '😪', '😫', '😬',
       '😭', '😳', '😵', '😶', '🙁', '🙄', '🤔',
]

In [14]:
def count_good_bad_emoji(row):
    comment = row['content']
    n_good_emoji = 0
    n_bad_emoji = 0
    try:
        for c in comment:
            if c in good_emoji_fix:
                n_good_emoji += 1
            if c in bad_emoji_fix:
                n_bad_emoji += 1
    except:
        pass
    row['n_good_emoji'] = n_good_emoji
    row['n_bad_emoji'] = n_bad_emoji
    
    return row

In [15]:
# Some features
df = df.apply(count_good_bad_emoji, axis=1)
df = transform(df)
df['num_words'] = df['content'].apply(lambda s: len(s.split()))
df['num_unique_words'] = df['content'].apply(lambda s: len(set(w for w in s.split())))
df['words_vs_unique'] = df['num_unique_words'] / (df['num_words']+1) * 100

In [16]:
df['good_bad_emoji_ratio'] = df['n_good_emoji'] / df['n_bad_emoji']
df['good_bad_emoji_ratio'] = df['good_bad_emoji_ratio'].replace(np.nan, 0)
df['good_bad_emoji_ratio'] = df['good_bad_emoji_ratio'].replace(np.inf, 99)
df['good_bad_emoji_diff'] = df['n_good_emoji'] - df['n_bad_emoji']
df['good_bad_emoji_sum'] = df['n_good_emoji'] + df['n_bad_emoji']

In [17]:
train_df = df[~df['label'].isnull()]
test_df = df[df['label'].isnull()]

train_comments = train_df['content'].fillna("none").values
test_comments = test_df['content'].fillna("none").values

y_train = train_df['label'].values

In [18]:
train_df.head()

Unnamed: 0,content,id,label,n_good_emoji,n_bad_emoji,num_words,num_unique_words,words_vs_unique,good_bad_emoji_ratio,good_bad_emoji_diff,good_bad_emoji_sum
0,dung tot cam on. shop đóng gói sản phẩm rất đẹ...,train_000000,0.0,0,0,19,17,85.0,0.0,0,0
1,chất lượng sản phẩm tuyệt vời son mịn nhưng kh...,train_000001,0.0,0,0,17,17,94.444444,0.0,0,0
2,chất lượng sản phẩm tuyệt vời nhưng hộp dây gi...,train_000002,0.0,0,0,12,12,92.307692,0.0,0,0
3,:(( mình hơi thất vọng ddddd chút mình vọng cu...,train_000003,1.0,0,0,86,69,79.310345,0.0,0,0
4,lần trước mình mua gió màu hồng rất đợt này lạ...,train_000004,1.0,0,0,21,20,90.909091,0.0,0,0


Tạo feature TFIDF đơn giản

In [19]:
tfidf = TfidfVectorizer(
    min_df = 5, 
    max_df = 0.8, 
    max_features=10000,
    ngram_range=(1,2),
    use_idf=True,
    sublinear_tf=True
)

In [20]:
X_train_tfidf = tfidf.fit_transform(train_comments)
X_test_tfidf = tfidf.transform(test_comments)

In [21]:
EXCLUED_COLS = ['id', 'content', 'label']
static_cols = [c for c in train_df.columns if not c in EXCLUED_COLS]
X_train_static = train_df[static_cols].values
X_test_static = test_df[static_cols].values

In [22]:
X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()
# X_train = X_train_tfidf
# X_test = X_test_tfidf

In [23]:
X_train.shape, X_test.shape, y_train.shape

((16087, 8584), (10981, 8584), (16087,))

In [24]:
X_train

<16087x8584 sparse matrix of type '<class 'numpy.float64'>'
	with 437027 stored elements in Compressed Sparse Row format>

# Stacking method

In [25]:
models=[ 
    ######## First level ########
    [
        RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=30, max_features=0.7, random_state=42, n_jobs=-1),        
        ExtraTreesClassifier (n_estimators=200, min_samples_leaf=2, max_depth=30, max_features=0.7, random_state=42, n_jobs=-1),
        GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=10, max_features=0.7, random_state=111),
        LogisticRegression(penalty='l2', C=1.0),
    ],
    ######## Second level ########
    [
        RandomForestClassifier (n_estimators=200, min_samples_leaf=2, max_depth=30, max_features=0.7, random_state=421, n_jobs=-1)
    ]
]

In [26]:
from pystacknet.pystacknet import StackNetClassifier

model = StackNetClassifier(
    models, metric="f1", 
    folds=5,
    restacking=False, 
    use_retraining=True, 
    use_proba=True, 
    random_state=12345, n_jobs=1, verbose=1
)

model.fit(X_train, y_train)
preds=model.predict_proba(X_test)

Input Dimensionality 8584 at Level 0 
4 models included in Level 0 




Level 0, fold 1/5 , model 0 , f1===0.789937 
Level 0, fold 1/5 , model 1 , f1===0.796422 
Level 0, fold 1/5 , model 2 , f1===0.854152 
Level 0, fold 1/5 , model 3 , f1===0.865323 




Level 0, fold 2/5 , model 0 , f1===0.810880 
Level 0, fold 2/5 , model 1 , f1===0.806602 
Level 0, fold 2/5 , model 2 , f1===0.864922 
Level 0, fold 2/5 , model 3 , f1===0.881895 




Level 0, fold 3/5 , model 0 , f1===0.805186 
Level 0, fold 3/5 , model 1 , f1===0.794823 
Level 0, fold 3/5 , model 2 , f1===0.865156 
Level 0, fold 3/5 , model 3 , f1===0.869629 




Level 0, fold 4/5 , model 0 , f1===0.801765 
Level 0, fold 4/5 , model 1 , f1===0.811232 
Level 0, fold 4/5 , model 2 , f1===0.863319 
Level 0, fold 4/5 , model 3 , f1===0.875536 




Level 0, fold 5/5 , model 0 , f1===0.804348 
Level 0, fold 5/5 , model 1 , f1===0.811184 
Level 0, fold 5/5 , model 2 , f1===0.858848 
Level 0, fold 5/5 , model 3 , f1===0.874243 
Level 0, model 0 , f1===0.802423 
Level 0, model 1 , f1===0.804053 
Level 0, model 2 , f1===0.861280 
Level 0, model 3 , f1===0.873325 




Output dimensionality of level 0 is 4 
 level 0 lasted 327.976752 seconds 
Input Dimensionality 4 at Level 1 
1 models included in Level 1 
Level 1, fold 1/5 , model 0 , f1===0.870911 
Level 1, fold 2/5 , model 0 , f1===0.882311 
Level 1, fold 3/5 , model 0 , f1===0.877091 
Level 1, fold 4/5 , model 0 , f1===0.874021 
Level 1, fold 5/5 , model 0 , f1===0.874912 
Level 1, model 0 , f1===0.875849 
Output dimensionality of level 1 is 1 
 level 1 lasted 18.080306 seconds 
 fit() lasted 346.058426 seconds 
1 estimators included in Level 0 
1 estimators included in Level 1 


In [27]:
pred_cls = np.argmax(preds, axis=1)

In [28]:
submission = pd.read_csv("sample_submission.csv")
submission['label'] = pred_cls

In [29]:
submission.head()

Unnamed: 0,id,label
0,test_000000,0
1,test_000001,1
2,test_000002,0
3,test_000003,0
4,test_000004,0


In [30]:
submission.to_csv("stack_demo.csv", index=False)

# Ensemble method

In [43]:
from sklearn.model_selection import cross_val_predict
models = [
    RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
    ExtraTreesClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
    LogisticRegression(random_state=1)
]

In [55]:
def cross_val_and_predict(clf, X, y, X_test, nfolds):
    kf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=42)
    
    oof_preds = np.zeros((X.shape[0], 2))
    sub_preds = np.zeros((X_test.shape[0], 2))
    
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]
        
        clf.fit(X_train, y_train)
        
        oof_preds[valid_idx] = clf.predict_proba(X_valid)
        sub_preds += clf.predict_proba(X_test) / kf.n_splits
        
    return oof_preds, sub_preds

In [70]:
sub_preds = []

for clf in models:
    oof_pred, sub_pred = cross_val_and_predict(clf, X_train, y_train, X_test, nfolds=5)
    oof_pred_cls = oof_pred.argmax(axis=1)
    oof_f1 = f1_score(y_pred=oof_pred_cls, y_true=y_train)
    
    print(clf.__class__)
    print(f"F1 CV: {oof_f1}")
    
    sub_preds.append(sub_pred)

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
F1 CV: 0.8028473369772468
<class 'sklearn.ensemble.forest.ExtraTreesClassifier'>
F1 CV: 0.8157690315898497
<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
F1 CV: 0.8664189047051398




<class 'sklearn.linear_model.logistic.LogisticRegression'>
F1 CV: 0.8793558041758711


Ta có thể thấy kết quả `Cross validation` của từng mô hình khá giống với các mô hình tại `Layer 0` của `stacking` phía trên. Hãy thử ensemble bằng cách lấy trung bình công xem sao

In [74]:
sub_preds = np.asarray(sub_preds)
sub_preds = sub_preds.mean(axis=0)
sub_pred_cls = sub_preds.argmax(axis=1)

In [76]:
submission_ensemble = submission.copy()
submission_ensemble['label'] = sub_pred_cls
submission_ensemble.to_csv("ensemble.csv", index=False)