In [505]:
from pathlib import Path
import pickle
import re

import numpy as np
import pandas as pd
from scipy import sparse

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import FunctionTransformer, LabelEncoder

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC

In [506]:
# Truncate long text cells in DataFrame previews (50 is also the pandas default).
pd.set_option('display.max_colwidth', 50)

In [507]:
# Directory the train/valid/test CSV files are read from, relative to the notebook.
DATA_PATH = Path('..') / 'data'
# Random seed shared by the notebook's stochastic steps.
RANDOM_SEED = 17

**Data Loading**

In [508]:
# Read the three splits; each one lives at DATA_PATH/<split>.csv.
train_df, valid_df, test_df = (
    pd.read_csv(DATA_PATH / f'{split}.csv')
    for split in ('train', 'valid', 'test')
)

**Data Cleaning**

In [509]:
# Character length of title and text for every split.
# `.str.len()` yields NaN for missing values, which is replaced by 0 here.
for frame in (train_df, valid_df, test_df):
    for col in ('title', 'text'):
        frame[f'{col}_length'] = frame[col].str.len().fillna(0)

In [510]:
# 0/1 indicator columns marking rows whose title or text is missing.
# Computed on the raw frames, before the NaNs are filled further down.
for frame in (train_df, valid_df, test_df):
    for col in ('title', 'text'):
        frame[f'is_{col}_na'] = frame[col].isnull().astype(np.int8)

In [511]:
# Names of the hand-crafted columns added to every split above.
add_cols = [
    'title_length',
    'text_length',
    'is_title_na',
    'is_text_na',
]

In [512]:
# Model inputs: copies of the raw frames with every remaining NaN replaced by ''.
# `fillna` returns new frames, so train_df / valid_df / test_df keep their NaNs.
X_train, X_valid, X_test = (
    frame.fillna('') for frame in (train_df, valid_df, test_df)
)

In [513]:
# Learn the string -> integer mapping on the training labels only,
# then apply that same mapping to both labelled splits.
le = LabelEncoder()
le.fit(X_train['label'])
for labelled in (X_train, X_valid):
    labelled['class'] = le.transform(labelled['label'])

In [514]:
# Position i in this array is the string label that integer class i stands for.
le.classes_

array(['clickbait', 'news', 'other'], dtype=object)

**Training**

In [515]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects ``columns`` from its input.

    Parameters
    ----------
    columns : hashable or list, default=None
        Key passed to ``X[...]``. In this notebook a single name is used to
        feed text vectorizers and a one-element list for numeric columns.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.columns]`` unchanged."""
        selection = X[self.columns]
        return selection

In [516]:
class NBTransformer(BaseEstimator, TransformerMixin):
    """Re-weight features by a smoothed Naive-Bayes log-count ratio.

    For every feature the ratio ``r = log(p / q)`` is learned, where ``p`` and
    ``q`` are the smoothed, normalised feature totals over the rows labelled
    ``1`` and ``0`` respectively. ``transform`` multiplies each column of the
    input by its ratio.

    NOTE: only the labels ``1`` and ``0`` take part in the ratio. Rows carrying
    any other label (e.g. a third class) are ignored when fitting.

    Parameters
    ----------
    alpha : float, default=1.0
        Additive smoothing applied to the per-class feature totals.

    Attributes
    ----------
    r_ : array-like of shape (1, n_features)
        Learned log-count ratios.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y=None):
        """Learn the log-count ratios from ``X`` and the labels ``y``.

        Raises
        ------
        ValueError
            If ``y`` is not supplied; the ratios cannot be computed without it.
        """
        if y is None:
            raise ValueError('NBTransformer.fit requires the target vector y')
        # np.asarray accepts a pandas Series, a list or an ndarray alike,
        # whereas `.values` only exists on pandas objects.
        y = np.asarray(y)

        pos_count = X[y == 1].sum(0)
        neg_count = X[y == 0].sum(0)
        n = X.shape[1]  # number of features, used to normalise the smoothing mass
        p = (pos_count + self.alpha) / (pos_count.sum() + self.alpha * n)
        q = (neg_count + self.alpha) / (neg_count.sum() + self.alpha * n)
        self.r_ = np.log(p / q)
        return self

    def transform(self, X, y=None):
        """Scale every column of the sparse matrix ``X`` by its learned ratio."""
        return X.multiply(self.r_)

In [517]:
class TfidfVectorizerPlus(TfidfVectorizer):
    """TfidfVectorizer with optional extra fitting data and pivoted normalisation.

    Parameters
    ----------
    fit_add : pandas object or None, default=None
        Extra documents concatenated (via ``pd.concat``) to ``X`` when fitting
        the vocabulary / idf. They are never transformed.
    norm_type : {None, 'pivot_cosine', 'pivot_unique'}, default=None
        Extra row scaling applied after the regular tf-idf transform:
        ``'pivot_cosine'`` divides each row by
        ``(1 - slope) * pivot + slope * ||row||_2``;
        ``'pivot_unique'`` uses the number of positive entries in the row
        in place of the L2 norm.
    pivot, slope : float
        Constants of the pivoted normalisation formula above.

    All remaining parameters are forwarded unchanged to ``TfidfVectorizer``.
    """

    def __init__(self, fit_add=None, norm_type=None, pivot=5, slope=0.2,
                 input='content', encoding='utf-8', decode_error='strict',
                 strip_accents=None, lowercase=True, preprocessor=None,
                 tokenizer=None, analyzer='word', stop_words=None,
                 token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                 binary=False, dtype=np.float64, norm='l2',
                 use_idf=True, smooth_idf=True, sublinear_tf=False):
        # Forward by keyword: recent scikit-learn releases declare these
        # constructor arguments keyword-only, so positional forwarding fails.
        super().__init__(input=input, encoding=encoding, decode_error=decode_error,
                         strip_accents=strip_accents, lowercase=lowercase,
                         preprocessor=preprocessor, tokenizer=tokenizer,
                         analyzer=analyzer, stop_words=stop_words,
                         token_pattern=token_pattern, ngram_range=ngram_range,
                         max_df=max_df, min_df=min_df, max_features=max_features,
                         vocabulary=vocabulary, binary=binary, dtype=dtype,
                         norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
                         sublinear_tf=sublinear_tf)

        self.fit_add = fit_add
        self.norm_type = norm_type
        self.pivot = pivot
        self.slope = slope

    def fit(self, X, y=None):
        """Fit vocabulary and idf on ``X`` plus ``fit_add`` (when given)."""
        if self.fit_add is not None:
            X_new = pd.concat([X, self.fit_add])
        else:
            X_new = X

        super().fit(X_new, y)
        return self

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (+ ``fit_add``) and return the transformed ``X``.

        The inherited ``fit_transform`` has its own implementation that does
        not go through the ``fit`` / ``transform`` overrides of this class.
        Pipelines call ``fit_transform`` on intermediate steps, so without
        this override ``fit_add`` and ``norm_type`` would be ignored for the
        training data while still being applied at prediction time.
        """
        return self.fit(X, y).transform(X)

    def transform(self, X, y=None):
        """Tf-idf transform ``X`` and apply the pivoted scaling if requested.

        Raises
        ------
        ValueError
            If ``norm_type`` is not one of the supported values.
        """
        res = super().transform(X)

        if self.norm_type == 'pivot_cosine':
            # Column vector of per-row L2 norms so it broadcasts across columns.
            norm_factor = (1 - self.slope) * self.pivot + self.slope * sparse.linalg.norm(res, axis=1).reshape(-1, 1)
            res = sparse.csr_matrix(res.multiply(1 / norm_factor))
        elif self.norm_type == 'pivot_unique':
            # Count of positive entries per row, i.e. distinct terms present.
            unique_terms_num = (res > 0).sum(axis=1)
            norm_factor = (1 - self.slope) * self.pivot + self.slope * unique_terms_num
            res = sparse.csr_matrix(res.multiply(1 / norm_factor))
        elif self.norm_type is not None:
            raise ValueError('Incorrect normalization type')

        return res

In [670]:
# Feature branch for the article body: pick the `text` column, vectorise it,
# then re-weight the resulting features with NBTransformer.
text_features = Pipeline([
    ('extract', ColumnExtractor(columns='text')),
    ('vec', TfidfVectorizer()),
    ('nb_features', NBTransformer()),
])

# Only the `text` branch is active. Branches for `title`, `title_length`,
# `text_length`, `is_title_na` and `is_text_na` were tried earlier and can be
# added back as further (name, Pipeline) entries of the FeatureUnion.
# The step names are referenced by the grid-search parameter keys.
pipe = Pipeline([
    ('features', FeatureUnion([
        ('text', text_features),
    ])),
    ('clf', LinearSVC()),
])

In [676]:
# Search space for the `text` branch and the classifier.
# Keys follow the pipeline's step names: features -> text -> vec, and clf.
# Commented-out entries for a `title` branch and a LogisticRegression
# classifier were dropped; the pipeline has no such steps at the moment.
param_grid = {
    'features__text__vec': [TfidfVectorizer()],
    'features__text__vec__strip_accents': [None, 'unicode', 'ascii'],
    'features__text__vec__lowercase': [True, False],
    'features__text__vec__analyzer': ['word', 'char', 'char_wb'],
    'features__text__vec__stop_words': [None, 'english'],
    # Further vectorizer knobs, currently left at their defaults:
    # 'features__text__vec__token_pattern': [r'\b\w+\b'], #[r'\b\w+\b', r'(?u)\b\w\w+\b'],
    # 'features__text__vec__ngram_range': [(1, 2)], #[(1, 1), (1, 2), (1, 3)],
    # 'features__text__vec__max_df': [1.0],
    # 'features__text__vec__min_df': [1],
    # 'features__text__vec__max_features': [150000], #[50000, 100000, 150000],
    # 'features__text__vec__binary': [False], #[True, False],
    # 'features__text__vec__use_idf': [True], #[True, False],
    # 'features__text__vec__smooth_idf': [False], #[True, False],
    # 'features__text__vec__sublinear_tf': [True], #[True, False],

    # 'features__text__nb_features__alpha': [0.2, 0.5, 0.8], #np.linspace(0.1, 1, 10),

    'clf': [LinearSVC()],
    'clf__penalty': ['l2'],
    'clf__loss': ['squared_hinge'], #['squared_hinge', 'hinge'],
    'clf__dual': [False], #[True, False],
    'clf__C': [0.3],
    'clf__class_weight': ['balanced'], #[None, 'balanced'],
    # Use the seed constant from the config cell (the lowercase name was undefined).
    'clf__random_state': [RANDOM_SEED],
}

# Two stratified, shuffled folds; seeded so the split is repeatable.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# Exhaustive search over `param_grid`, scored by macro-averaged F1 on the
# folds defined by `cv`, using all available cores.
# The former `iid` argument is no longer passed: it was deprecated and later
# removed from GridSearchCV, so supplying it fails on current scikit-learn.
grid_search = GridSearchCV(pipe, param_grid, scoring='f1_macro',
                           cv=cv, n_jobs=-1, return_train_score=True,
                           verbose=2)

# The whole frame is passed as X; the pipeline's ColumnExtractor steps select
# the columns they need, so the label columns present in X_train go unused.
grid_search.fit(X_train, X_train['class'])

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# Best mean cross-validated score (macro F1, per `scoring`) and the parameters that achieved it.
grid_search.best_score_, grid_search.best_params_

In [674]:
# One column per candidate, one row per metric/parameter: easier to scan
# than the default wide layout.
cv_results_df = pd.DataFrame(grid_search.cv_results_).transpose()
cv_results_df

Unnamed: 0,0,1,2
mean_fit_time,10.2232,10.6665,12.5214
std_fit_time,0.283026,0.0136195,0.33712
mean_score_time,7.66202,8.22604,7.59158
std_score_time,0.838299,1.22304,0.473638
param_clf,"LinearSVC(C=0.3, class_weight='balanced', dual...","LinearSVC(C=0.3, class_weight='balanced', dual...","LinearSVC(C=0.3, class_weight='balanced', dual..."
param_clf__C,0.3,0.3,0.3
param_clf__class_weight,balanced,balanced,balanced
param_clf__dual,False,False,False
param_clf__loss,squared_hinge,squared_hinge,squared_hinge
param_clf__penalty,l2,l2,l2


In [675]:
# Number of distinct terms learned by the best pipeline's text vectorizer.
text_vectorizer = grid_search.best_estimator_.get_params()['features__text__vec']
print(len(text_vectorizer.vocabulary_))

138288


In [653]:
# Per-class decision margins of the best model on the validation split.
y_margins = grid_search.decision_function(X_valid)
# The predicted class is the one with the largest margin in each row.
# The earlier global min-max rescaling was dropped: it applies the same
# increasing affine map to every entry, so it cannot change the row-wise argmax.
y_val_pred = y_margins.argmax(axis=1)
y_val_pred

array([1, 1, 1, ..., 2, 1, 1])

In [654]:
# Map the integer predictions back to their string labels for inspection.
le.inverse_transform(y_val_pred)

array(['news', 'news', 'news', ..., 'other', 'news', 'news'], dtype=object)

In [655]:
# Share of validation rows assigned to each class (index = encoded class).
val_class_counts = np.bincount(y_val_pred)
val_class_counts / len(y_val_pred)

array([0.12471847, 0.6089527 , 0.26632883])

**Evaluation**

In [656]:
# Macro-averaged F1 on the held-out validation split.
f1_score(y_true=X_valid['class'], y_pred=y_val_pred, average='macro')

0.767289826049255

**Feature Importance**

**Predict & Submit**

In [480]:
# Stack train and validation rows into one frame with a fresh 0..n-1 index
# for the final refit.
labelled_frames = [X_train, X_valid]
full_train_df = pd.concat(labelled_frames, axis=0, ignore_index=True)

In [481]:
# Refit the winning configuration on train + validation data.
# `clone` returns an unfitted copy with the same parameters, so the estimator
# held by `grid_search` (used by the validation cells above) stays fitted on
# the training data only and is not silently overwritten.
best_model = clone(grid_search.best_estimator_)
best_model.fit(full_train_df, full_train_df['class'])



Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('title', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(columns='title')), ('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp... penalty='l2', random_state=17,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))])

In [484]:
# Rename the test columns positionally: id, title, text, then the engineered
# columns in the order listed in `add_cols`.
# NOTE(review): this assumes X_test has exactly these seven columns in this order.
X_test.columns = ['id', 'title', 'text', *add_cols]
X_test.head()

Unnamed: 0,id,title,text,title_length,text_length,is_title_na,is_text_na
0,0,Amazon CEO Jeff Bezos is now the second riches...,More Try Yahoo Finance on Firefox » Amazon CEO...,64,3499.0,0,0
1,1,Does Laura Dern Handle a Lightsaber in the New...,More Laura Dern seems to be everywhere these d...,67,2296.0,0,0
2,2,"In this photographer’s home town, stepping out...",Kirkuk is a city of Northern Iraq in the Kurdi...,69,4732.0,0,0
3,3,"8 Ways To Get Your Spouse To Open Up More, Acc...",Experts say that communication is the cornerst...,66,4485.0,0,0
4,4,US says claim it supported IS in Syria is 'lud...,Share this with Email Facebook Messenger Messe...,53,2276.0,0,0


In [488]:
# Sanity check: the `id` column equals the frame's positional index in every row.
(test_df.index == X_test['id']).all()

True

In [489]:
# Encoded class predictions for the test split.
# `predict` is used instead of `predict_proba(...).argmax(axis=1)` because the
# pipeline's LinearSVC has no `predict_proba`. For a probabilistic classifier
# trained on the 0..k-1 encoded classes both forms give the same labels.
y_test_pred = best_model.predict(X_test)
y_test_pred

array([1, 0, 1, ..., 1, 1, 1])

In [496]:
# Convert the integer predictions back to the string labels for the submission.
y_test_label = le.inverse_transform(y_test_pred)
y_test_label

array(['news', 'clickbait', 'news', ..., 'news', 'news', 'news'],
      dtype=object)

In [497]:
# Share of test rows assigned to each class (index = encoded class).
test_class_counts = np.bincount(y_test_pred)
test_class_counts / len(y_test_label)

array([0.17000177, 0.74836196, 0.08163627])

In [498]:
# Two-column submission frame: the test ids alongside the predicted labels.
submission_df = X_test[['id']].assign(label=y_test_label)
submission_df.head()

Unnamed: 0,id,label
0,0,news
1,1,clickbait
2,2,news
3,3,clickbait
4,4,news


In [499]:
# Write the submission to the working directory; index=False keeps only the id and label columns.
submission_df.to_csv('submission.csv', index=False)

In [500]:
!head submission.csv

id,label
0,news
1,clickbait
2,news
3,clickbait
4,news
5,news
6,news
7,news
8,news


In [502]:
!wc -l submission.csv

5648 submission.csv
