In [2]:
from pathlib import Path
import pickle
import re

from tqdm import tqdm, tqdm_notebook

import numpy as np
import pandas as pd
from scipy import sparse

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import FunctionTransformer, LabelEncoder

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC

In [3]:
pd.options.display.max_colwidth = 50 # default - 50
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
DATA_PATH = Path('../data')
RANDOM_SEED = 17

**Data Loading**

In [5]:
train_df = pd.read_csv(DATA_PATH/'X_train_ftfy_nfkd_tweet.csv')
valid_df = pd.read_csv(DATA_PATH/'X_valid_ftfy_nfkd_tweet.csv')
test_df = pd.read_csv(DATA_PATH/'X_test_ftfy_nfkd_tweet.csv')

In [6]:
train_df['full_text'] = train_df['title'] + ' ' + train_df['text']
valid_df['full_text'] = valid_df['title'] + ' ' + valid_df['text']
test_df['full_text'] = test_df['title'] + ' ' + test_df['text']

**Data Cleaning**

In [7]:
X_train = train_df.fillna('')
X_valid = valid_df.fillna('')
X_test = test_df.fillna('')

In [8]:
le = LabelEncoder()
X_train['class'] = le.fit_transform(X_train['label'])
X_valid['class'] = le.transform(X_valid['label'])

In [9]:
le.classes_

array(['clickbait', 'news', 'other'], dtype=object)

**Training**

In [10]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.columns]

In [11]:
class NBTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y=None):
        y = y.values
        
        pos_count = X[y==1].sum(0) 
        neg_count = X[y==0].sum(0)
        n = X.shape[1]
        p = (pos_count + self.alpha) / (pos_count.sum() + self.alpha * n)
        q = (neg_count + self.alpha) / (neg_count.sum() + self.alpha * n)
        self.r_ = np.log(p / q)
        return self
    
    def transform(self, X, y=None):
        return X.multiply(self.r_)

In [12]:
class TfidfVectorizerPlus(TfidfVectorizer):
    def __init__(self, fit_add=None, norm_type=None, pivot=5, slope=0.2, 
                       input='content', encoding='utf-8', decode_error='strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer='word', stop_words=None, 
                       token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), 
                       max_df=1.0, min_df=1, max_features=None, vocabulary=None, 
                       binary=False, dtype=np.float64, norm='l2', 
                       use_idf=True, smooth_idf=True, sublinear_tf=False):
        super().__init__(input, encoding, decode_error,
                         strip_accents, lowercase, preprocessor,
                         tokenizer, analyzer, stop_words,
                         token_pattern, ngram_range,
                         max_df, min_df, max_features, vocabulary,
                         binary, dtype, norm,
                         use_idf, smooth_idf, sublinear_tf)
        
        self.fit_add = fit_add
        self.norm_type = norm_type
        self.pivot = pivot
        self.slope = slope
    
    def fit(self, X, y=None):
        if self.fit_add is not None:
            X_new = pd.concat([X, self.fit_add])
        else:
            X_new = X
        
        super().fit(X_new, y)
        return self
        
    def transform(self, X, y=None):
        res = super().transform(X)
            
        if self.norm_type == 'pivot_cosine':
            norm_factor = (1 - self.slope) * self.pivot + self.slope * sparse.linalg.norm(res, axis=1).reshape(-1, 1)
            res = sparse.csr_matrix(res.multiply(1 / norm_factor))
        elif self.norm_type == 'pivot_unique':
            unique_terms_num = (res > 0).sum(axis=1)
            norm_factor = (1 - self.slope) * self.pivot + self.slope * unique_terms_num
            res = sparse.csr_matrix(res.multiply(1 / norm_factor))
        elif self.norm_type is not None:
            raise ValueError('Incorrect normalization type')
            
        return res

In [13]:
class TextTruncater(BaseEstimator, TransformerMixin):
    def __init__(self, max_length=None):
        self.max_length = max_length
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.max_length is not None:
            return X.str[:self.max_length]
        else:
            return X

In [14]:
class ModelTransformer(TransformerMixin):

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return DataFrame(self.model.predict_proba(X))

In [15]:
pipe = Pipeline([
    ('features', FeatureUnion([
#         ('title', Pipeline([
#             ('extract', ColumnExtractor(columns='title')),
#             ('vec', TfidfVectorizer())
#         ])),
#         ('text', Pipeline([
#             ('extract', ColumnExtractor(columns='text')),
#             ('vec', TfidfVectorizer())
#         ])),
#         ('title_pos', Pipeline([
#             ('extract', ColumnExtractor('title_pos')),
#             ('vec', TfidfVectorizer())
#         ])),
#         ('text_pos', Pipeline([
#             ('extract', ColumnExtractor('text_pos')),
#             ('vec', TfidfVectorizer())
#         ])),
        ('text', Pipeline([
            ('extract', ColumnExtractor(columns='full_text')),
            ('vec', TfidfVectorizer())
        ])),
    ], 
#         transformer_weights={
#             'title': 0.4,
#             'text': 0.6,
#         }
    )),
    ('clf', LogisticRegression())
])

In [23]:
param_grid = {
#     'features__title__vec': [TfidfVectorizer()],
#     'features__title__vec__strip_accents': ['unicode'], #[None, 'unicode', 'ascii'],
#     'features__title__vec__lowercase': [False], #[True, False],
#     'features__title__vec__analyzer': ['word'], #['word', 'char', 'char_wb'],
#     'features__title__vec__stop_words': ['english'], #[None, 'english'],
#     'features__title__vec__token_pattern': [r'\b\w{5,}\b'], #, r'(?u)\b\w\w+\b'], #['r'\b\w+\b',']
#     'features__title__vec__ngram_range': [(1, 3)], #[(1, 3), (1, 4), (1, 5)],
#     'features__title__vec__max_df': [0.5],
#     'features__title__vec__min_df': [1],
#     'features__title__vec__max_features': [None], #[None, 50000, 100000],
#     'features__title__vec__binary': [True], #[True, False],
#     'features__title__vec__use_idf': [True], #[True, False],
#     'features__title__vec__smooth_idf': [True], #[True, False],
#     'features__title__vec__sublinear_tf': [True], #[True, False],


    'features__text__vec': [TfidfVectorizer()],
    'features__text__vec__strip_accents': [None], #[None, 'unicode', 'ascii'],
    'features__text__vec__lowercase': [False], #[True, False],
    'features__text__vec__analyzer': ['word'], #['word', 'char', 'char_wb'],
    'features__text__vec__stop_words': ['english'], #[None, 'english'],
#     'features__text__vec__token_pattern': [r'\b\w+\b'], #[r'\b\w+\b', r'(?u)\b\w\w+\b', r'\b\w{3,}\b', r'\b\w{4,}\b'],
    'features__text__vec__ngram_range': [(1, 2)], #(1, 3)],
    'features__text__vec__max_df': [0.4], #[0.8, 0.9, 1.0],
    'features__text__vec__min_df': [2], #[1, 2, 3],
    'features__text__vec__max_features': [100000], # 100000, 200000],
    'features__text__vec__binary': [True, False],
    'features__text__vec__use_idf': [True, False],
    'features__text__vec__smooth_idf': [True, False],
    'features__text__vec__sublinear_tf': [True, False],
     
#     'features__title_pos__vec': [TfidfVectorizer()],
#     'features__title_pos__vec__strip_accents': [None], #[None, 'unicode', 'ascii'],
#     'features__title_pos__vec__lowercase': [True], #[True, False],
#     'features__title_pos__vec__analyzer': ['word'], #['word', 'char', 'char_wb'],
#     'features__title_pos__vec__stop_words': [None], #[None, 'english'],
#     'features__title_pos__vec__ngram_range': [(1, 6)],
#     'features__title_pos__vec__max_df': [0.4],
#     'features__title_pos__vec__min_df': [60],
#     'features__title_pos__vec__max_features': [None], #[None, 50000, 100000],
#     'features__title_pos__vec__binary': [False], #[True, False],
#     'features__title_pos__vec__use_idf': [True], #[True, False],
#     'features__title_pos__vec__smooth_idf': [True], #[True, False],
#     'features__title_pos__vec__sublinear_tf': [True], #[True, False],
    
#     'features__text_pos__vec': [TfidfVectorizer()],
#     'features__text_pos__vec__strip_accents': [None], #[None, 'unicode', 'ascii'],
#     'features__text_pos__vec__lowercase': [True], #[True, False],
#     'features__text_pos__vec__analyzer': ['word'], #['char', 'char_wb'],
#     'features__text_pos__vec__stop_words': [None], #[None, 'english'],
#     'features__text_pos__vec__ngram_range': [(1, 3)],
#     'features__text_pos__vec__max_df': [1.0],
#     'features__text_pos__vec__min_df': [10],
#     'features__text_pos__vec__max_features': [None], #[None, 50000, 100000],
#     'features__text_pos__vec__binary': [True], #[True, False],
#     'features__text_pos__vec__use_idf': [False], #[True, False],
#     'features__text_pos__vec__smooth_idf': [True], #[True, False],
#     'features__text_pos__vec__sublinear_tf': [True], #[True, False],


    'clf': [LogisticRegression()],
    'clf__penalty': ['l2'], #['l1', 'l2'], # ['l2'],
    'clf__C': [2], #np.logspace(-2, 2, 5), # [2], 
    'clf__class_weight': ['balanced'], #['balanced', None],
    'clf__random_state': [RANDOM_SEED],
    'clf__solver':  ['lbfgs'], #['lbfgs']
    'clf__max_iter': [200],
    'clf__multi_class': ['multinomial'], #['ovr', 'multinomial'],
              
              
#               'features__title__vec': [TfidfVectorizer()],
#               'features__title__vec__strip_accents': [None, 'unicode', 'ascii'],
#               'features__title__vec__lowercase': [True, False],
#               'features__title__vec__analyzer': ['word', 'char', 'char_wb'],
#               'features__title__vec__stop_words': [None, 'english'],
#               'features__title__vec__token_pattern': [r'\b\w+\b'], #[r'\b\w+\b', r'(?u)\b\w\w+\b'],
#               'features__title__vec__ngram_range': [(1, 4)],
#               'features__title__vec__max_df': [0.8],
#               'features__title__vec__min_df': [1], #[1, 5, 10],
#               'features__title__vec__max_features': [70000],
#               'features__title__vec__binary': [True], #[True, False],
#               'features__title__vec__use_idf': [True], #[True, False],
#               'features__title__vec__smooth_idf': [True], #[True, False],
#               'features__title__vec__sublinear_tf': [True], #[True, False],

#               'clf': [LinearSVC()],
#               'clf__penalty': ['l2'],
#               'clf__loss': ['squared_hinge'], #['squared_hinge', 'hinge'],
#               'clf__dual': [False], #[True, False],
#               'clf__C': [0.4],
#               'clf__class_weight': ['balanced'],
#               'clf__random_state': [random_seed],
              
#               'features__text__vec': [TfidfVectorizer()],
#               'features__text__vec__strip_accents': ['ascii'], #[None, 'unicode', 'ascii'],
#               'features__text__vec__lowercase': [False], #[True, False],
#               'features__text__vec__analyzer': ['word'], #['word', 'char', 'char_wb'],
#               'features__text__vec__stop_words': [None], #[None, 'english'],
#               'features__text__vec__token_pattern': [r'\b\w+\b'], #[r'\b\w+\b', r'(?u)\b\w\w+\b'],
#               'features__text__vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
#               'features__text__vec__max_df': [0.9],
#               'features__text__vec__min_df': [1],
#               'features__text__vec__max_features': [200000, 300000, 400000, 500000],
#               'features__text__vec__binary': [False], #[True, False],
#               'features__text__vec__use_idf': [True], #[True, False],
#               'features__text__vec__smooth_idf': [False], #[True, False],
#               'features__text__vec__sublinear_tf': [True], #[True, False],
              
#               'features__text__nb_features__alpha': np.linspace(0.1, 1, 10),
    
#               'clf': [LinearSVC()],
# #               'clf__penalty': ['l2'],
# #               'clf__loss': ['squared_hinge', 'hinge'],
# #               'clf__dual': [False], #[True, False],
#               'clf__C': [1],
#               'clf__multi_class': ['crammer_singer'], #['ovr', 'crammer_singer']
#               'clf__class_weight': ['balanced'], #[None, 'balanced'],
#               'clf__random_state': [RANDOM_SEED],
                 
}

cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_SEED)

In [24]:
grid_search = GridSearchCV(pipe, param_grid, scoring='f1_macro', 
                           cv=cv, n_jobs=-1, return_train_score=False,
                           verbose=2, iid=True)

grid_search.fit(X_train, X_train['class'])

Fitting 2 folds for each of 16 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  6.0min finished


GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=17, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(columns='full_text')), ('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'features__text__vec': [TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.4, max_features=100000, min_df=2,
        ngram_range=(1, 2), norm='l2', preprocessor=...state': [17], 'clf__solver': ['lbfgs'], 'clf__max_iter': [200], 'clf__multi_cla

0.8058100230051004

In [25]:
grid_search.best_score_, grid_search.best_params_

(0.7958886685612655,
 {'clf': LogisticRegression(C=2, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=200,
            multi_class='multinomial', n_jobs=None, penalty='l2',
            random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
            warm_start=False),
  'clf__C': 2,
  'clf__class_weight': 'balanced',
  'clf__max_iter': 200,
  'clf__multi_class': 'multinomial',
  'clf__penalty': 'l2',
  'clf__random_state': 17,
  'clf__solver': 'lbfgs',
  'features__text__vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
          lowercase=False, max_df=0.4, max_features=100000, min_df=2,
          ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=False,
          stop_words='english', strip_accents=None, sublinear_tf=True,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=Non

In [26]:
cv_results_df = pd.DataFrame(grid_search.cv_results_).T
cv_results_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
mean_fit_time,62.8536,77.332,65.0773,75.0503,62.0406,68.6479,67.5353,71.9565,70.9582,73.9817,74.6066,75.3846,66.0684,70.6941,66.9399,69.0976
std_fit_time,2.59811,0.837807,1.47991,0.538451,1.59855,0.601838,0.862036,0.0470495,0.741077,2.31545,1.45588,0.680367,0.817206,3.60303,0.612872,0.0266535
mean_score_time,15.2751,14.925,15.954,15.1276,15.5951,15.4313,15.5411,15.5642,15.5873,16.8674,16.1022,14.916,14.7843,13.0906,10.8019,8.22515
std_score_time,0.268173,1.18859,0.435821,0.170727,0.55438,0.447879,0.179735,0.33618,1.20794,0.391906,0.202396,0.0417947,0.338223,0.753737,0.712508,0.0797693
param_clf,"LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced...","LogisticRegression(C=2, class_weight='balanced..."
param_clf__C,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
param_clf__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
param_clf__max_iter,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200
param_clf__multi_class,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial,multinomial
param_clf__penalty,l2,l2,l2,l2,l2,l2,l2,l2,l2,l2,l2,l2,l2,l2,l2,l2


In [27]:
print(len(grid_search.best_estimator_.get_params()['features__text__vec'].vocabulary_))
print(len(grid_search.best_estimator_.get_params()['features__text__vec'].stop_words_))

100000
4087202


In [28]:
y_val_pred = grid_search.predict_proba(X_valid).argmax(axis=1)
y_val_pred

array([1, 1, 1, ..., 2, 1, 1])

In [29]:
le.inverse_transform(y_val_pred)

array(['news', 'news', 'news', ..., 'other', 'news', 'news'], dtype=object)

In [30]:
np.bincount(y_val_pred) / len(y_val_pred)

array([0.15990991, 0.5777027 , 0.26238739])

**Evaluation**

In [31]:
f1_score(X_valid['class'], y_val_pred, average='macro')

0.7853015524379502

**Feature Importance**

**Predict & Submit**

In [173]:
full_train_df = pd.concat([X_train, X_valid], axis=0, ignore_index=True)

In [174]:
best_model = grid_search.best_estimator_
best_model.fit(full_train_df, full_train_df['class'])

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('title', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(columns='title')), ('vec', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inpu...l2',
          random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False))])

In [175]:
X_test.columns

Index(['id', 'title', 'text', 'title_pos', 'text_pos'], dtype='object')

In [176]:
X_test.columns = ['id', 'title', 'text', 'title_pos', 'text_pos']#, 'title_length', 'text_length',
       #'is_title_na', 'is_text_na']
X_test.head()

Unnamed: 0,id,title,text,title_pos,text_pos
0,0,Amazon_PROPN CEO_PROPN Jeff_PROPN Bezos_PROPN ...,More_ADJ Try_VERB Yahoo_PROPN Finance_PROPN on...,PROPN PROPN PROPN PROPN VERB ADV DET ADJ ADJ N...,ADJ VERB PROPN PROPN ADP PROPN SPACE PROPN PRO...
1,1,Does_VERB Laura_PROPN Dern_PROPN Handle_PROPN ...,More_ADJ Laura_PROPN Dern_PROPN seems_VERB to_...,VERB PROPN PROPN PROPN DET PROPN ADP DET PROPN...,ADJ PROPN PROPN VERB PART VERB ADV DET NOUN PU...
2,2,In_ADP this_DET photographer_NOUN 's_PART home...,Kirkuk_PROPN is_VERB a_DET city_NOUN of_ADP No...,ADP DET NOUN PART NOUN NOUN PUNCT VERB ADP ADP...,PROPN VERB DET NOUN ADP ADJ PROPN ADP DET ADJ ...
3,3,8_NUM Ways_PROPN To_PART Get_VERB Your_ADJ Spo...,Experts_NOUN say_VERB that_ADP communication_N...,NUM PROPN PART VERB ADJ NOUN PART VERB PART AD...,NOUN VERB ADP NOUN VERB DET NOUN ADP DET ADJ N...
4,4,US_PROPN says_VERB claim_VERB it_PRON supporte...,Share_VERB this_DET with_ADP Email_PROPN Faceb...,PROPN VERB VERB PRON VERB VERB ADP PROPN VERB ...,VERB DET ADP PROPN PROPN PROPN PROPN PROPN PRO...


In [177]:
(test_df.index == X_test.id).all()

True

In [178]:
y_test_pred = best_model.predict_proba(X_test).argmax(axis=1)
y_test_pred

array([0, 0, 1, ..., 1, 1, 1])

In [179]:
y_test_label = le.inverse_transform(y_test_pred)
y_test_label

array(['clickbait', 'clickbait', 'news', ..., 'news', 'news', 'news'],
      dtype=object)

In [180]:
np.bincount(y_test_pred) / len(y_test_label)

array([0.2066584 , 0.7060386 , 0.08730299])

In [181]:
submission_df = pd.DataFrame({'id': X_test['id'], 'label': y_test_label})
submission_df.head()

Unnamed: 0,id,label
0,0,clickbait
1,1,clickbait
2,2,news
3,3,clickbait
4,4,news


In [182]:
submission_df.to_csv('submission.csv', index=False)

In [183]:
!head submission.csv

id,label
0,clickbait
1,clickbait
2,news
3,clickbait
4,news
5,news
6,news
7,news
8,news


In [184]:
!wc -l submission.csv

5648 submission.csv


In [185]:
!kaggle competitions submit -c dlinnlp-spring-2019-clf -f submission.csv -m "LR (mn)"

100%|██████████████████████████████████████| 60.3k/60.3k [00:03<00:00, 16.0kB/s]
Successfully submitted to DL in NLP Spring 2019. Classification

**Error Analysis**

In [215]:
y_margins = grid_search.decision_function(X_valid)
y_val_pred = (y_margins - y_margins.min()) / (y_margins.max() - y_margins.min())
# y_val_pred = y_val_pred.argmax(axis=1)
y_val_pred

array([[0.59144613, 0.77370618, 0.36949007],
       [0.4860577 , 0.67253872, 0.57604596],
       [0.42553417, 0.79839178, 0.51071644],
       ...,
       [0.44698388, 0.51817221, 0.7694863 ],
       [0.66384886, 0.84248302, 0.22831051],
       [0.58173126, 0.84916822, 0.30374291]])

In [220]:
validate_df = X_valid.copy()
validate_df = validate_df.assign(pred_0 = y_val_pred[:, 0], 
                                 pred_1 = y_val_pred[:, 1],
                                 pred_2 = y_val_pred[:, 2],
                                 class_pred = y_val_pred.argmax(axis=1))
validate_df.head()

Unnamed: 0,label,title,text,class,pred_0,pred_1,pred_2,class_pred
0,news,Trump_PROPN says_VERB he_PRON is_VERB releasin...,"Bob_PROPN Bryan_PROPN ,_PUNCT Business_PROPN I...",1,0.591446,0.773706,0.36949,1
1,news,Fidel_PROPN Castro_PROPN 's_PART ashes_NOUN ma...,Cubans_PROPN have_VERB been_VERB lining_VERB t...,1,0.486058,0.672539,0.576046,1
2,news,Obama_PROPN Administration_PROPN Sending_VERB ...,WASHINGTON_PROPN —_PUNCT The_DET Obama_PROPN a...,1,0.425534,0.798392,0.510716,1
3,news,Insurers_NOUN Are_VERB Worried_ADJ About_ADP T...,The_DET main_ADJ industry_NOUN groups_NOUN rep...,1,0.575398,0.764276,0.394968,1
4,news,Kobe_PROPN Bryant_PROPN and_CCONJ Nike_PROPN F...,A_DET year_NOUN after_ADP Kobe_PROPN Bryant_PR...,1,0.543219,0.730411,0.461013,1


In [221]:
incorrect_df = validate_df[validate_df['class'] != validate_df['class_pred']]
incorrect_df.head()

Unnamed: 0,label,title,text,class,pred_0,pred_1,pred_2,class_pred
123,news,How_ADV to_PART Talk_VERB to_ADP Your_ADJ Teen...,"More_ADJ For_ADP parents_NOUN ,_PUNCT talking_...",1,0.746912,0.72041,0.26732,0
229,news,Genius_PROPN Bar_PROPN secrets_NOUN revealed_V...,One_NUM of_ADP the_DET biggest_ADJ things_NOUN...,1,0.598378,0.581579,0.554686,0
347,news,13_NUM Gifts_NOUN For_ADP People_NOUN Who_NOUN...,Love_NOUN is_VERB n't_ADV all_DET we_PRON need...,1,0.635953,0.620099,0.478591,0
391,news,Deer_PROPN Debacle_NOUN Exacerbates_VERB New_P...,Desc_PROPN,1,0.416454,0.626502,0.691686,2
490,news,Donald_PROPN Trump_PROPN Pays_VERB Respects_NO...,President_PROPN Donald_PROPN Trump_PROPN trave...,1,0.712625,0.705318,0.3167,0


In [238]:
validate_df.groupby(['class', 'class_pred'])['label'].count()

class  class_pred
0      0              542
       1                1
1      0               19
       1             2065
       2                4
2      2              921
Name: label, dtype: int64

In [239]:
incorrect_df.groupby(['class', 'class_pred'])['label'].count()

class  class_pred
0      1              1
1      0             19
       2              4
Name: label, dtype: int64

In [240]:
incorrect_df

Unnamed: 0,label,title,text,class,pred_0,pred_1,pred_2,class_pred
123,news,How_ADV to_PART Talk_VERB to_ADP Your_ADJ Teen...,"More_ADJ For_ADP parents_NOUN ,_PUNCT talking_...",1,0.746912,0.72041,0.26732,0
229,news,Genius_PROPN Bar_PROPN secrets_NOUN revealed_V...,One_NUM of_ADP the_DET biggest_ADJ things_NOUN...,1,0.598378,0.581579,0.554686,0
347,news,13_NUM Gifts_NOUN For_ADP People_NOUN Who_NOUN...,Love_NOUN is_VERB n't_ADV all_DET we_PRON need...,1,0.635953,0.620099,0.478591,0
391,news,Deer_PROPN Debacle_NOUN Exacerbates_VERB New_P...,Desc_PROPN,1,0.416454,0.626502,0.691686,2
490,news,Donald_PROPN Trump_PROPN Pays_VERB Respects_NO...,President_PROPN Donald_PROPN Trump_PROPN trave...,1,0.712625,0.705318,0.3167,0
497,news,These_DET trapdoor_NOUN waterslides_NOUN plung...,"David_PROPN Ibekwe_PROPN Apr._PROPN 12_NUM ,_P...",1,0.634602,0.630173,0.469868,0
694,news,How_ADV '_PUNCT Jurassic_PROPN World_PROPN '_P...,It_PRON is_VERB well_ADV -_PUNCT known_VERB at...,1,0.720751,0.715298,0.298594,0
726,news,10_NUM things_NOUN you_PRON need_VERB to_PART ...,"Jonathan_PROPN Garber_PROPN ,_PUNCT Business_P...",1,0.759789,0.700965,0.273889,0
754,news,Regis_PROPN Philbin_PROPN :_PUNCT Trump_PROPN ...,Daytime_ADJ talk_NOUN show_NOUN legend_NOUN Re...,1,0.780099,0.662219,0.292325,0
991,news,What_NOUN will_VERB change_VERB for_ADP your_A...,More_ADJ Try_VERB Yahoo_PROPN Finance_PROPN on...,1,0.759607,0.702148,0.272887,0


In [254]:
from collections import Counter

In [269]:
incorrect_df = pd.concat([incorrect_df, 
                          incorrect_df.title.str.findall(r'_([A-Z]+)').apply(Counter).apply(pd.Series).fillna(0).astype(int)], axis=1)

In [270]:
incorrect_df

Unnamed: 0,label,title,text,class,pred_0,pred_1,pred_2,class_pred,ADV,PART,...,NOUN,PROPN,PUNCT,NUM,DET,PRON,INTJ,CCONJ,SYM,SPACE
123,news,How_ADV to_PART Talk_VERB to_ADP Your_ADJ Teen...,"More_ADJ For_ADP parents_NOUN ,_PUNCT talking_...",1,0.746912,0.72041,0.26732,0,1,1,...,2,1,2,0,0,0,0,0,0,0
229,news,Genius_PROPN Bar_PROPN secrets_NOUN revealed_V...,One_NUM of_ADP the_DET biggest_ADJ things_NOUN...,1,0.598378,0.581579,0.554686,0,0,0,...,3,2,1,0,0,0,0,0,0,0
347,news,13_NUM Gifts_NOUN For_ADP People_NOUN Who_NOUN...,Love_NOUN is_VERB n't_ADV all_DET we_PRON need...,1,0.635953,0.620099,0.478591,0,0,0,...,4,0,0,1,0,0,0,0,0,0
391,news,Deer_PROPN Debacle_NOUN Exacerbates_VERB New_P...,Desc_PROPN,1,0.416454,0.626502,0.691686,2,0,0,...,1,5,0,0,0,0,0,0,0,0
490,news,Donald_PROPN Trump_PROPN Pays_VERB Respects_NO...,President_PROPN Donald_PROPN Trump_PROPN trave...,1,0.712625,0.705318,0.3167,0,0,0,...,1,6,0,0,0,0,0,0,0,0
497,news,These_DET trapdoor_NOUN waterslides_NOUN plung...,"David_PROPN Ibekwe_PROPN Apr._PROPN 12_NUM ,_P...",1,0.634602,0.630173,0.469868,0,0,0,...,3,0,1,0,2,1,0,0,0,0
694,news,How_ADV '_PUNCT Jurassic_PROPN World_PROPN '_P...,It_PRON is_VERB well_ADV -_PUNCT known_VERB at...,1,0.720751,0.715298,0.298594,0,1,1,...,4,2,2,0,0,0,0,0,0,0
726,news,10_NUM things_NOUN you_PRON need_VERB to_PART ...,"Jonathan_PROPN Garber_PROPN ,_PUNCT Business_P...",1,0.759789,0.700965,0.273889,0,0,1,...,2,0,0,1,1,1,0,0,0,0
754,news,Regis_PROPN Philbin_PROPN :_PUNCT Trump_PROPN ...,Daytime_ADJ talk_NOUN show_NOUN legend_NOUN Re...,1,0.780099,0.662219,0.292325,0,0,0,...,2,4,5,0,0,0,1,1,0,0
991,news,What_NOUN will_VERB change_VERB for_ADP your_A...,More_ADJ Try_VERB Yahoo_PROPN Finance_PROPN on...,1,0.759607,0.702148,0.272887,0,0,0,...,2,0,0,1,0,0,0,0,0,0


In [None]:
train_df.div()

In [None]:
sns.heatmap(pos_train_df.groupby('class').mean())