In [2]:
from pathlib import Path
import pickle
import re

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg as sparse_linalg

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import FunctionTransformer, LabelEncoder

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC

In [3]:
# Maximum characters shown per DataFrame cell (50 is also the pandas default).
pd.set_option('display.max_colwidth', 50)

In [4]:
# Notebook-wide configuration.
RANDOM_SEED: int = 17                # seed shared by the CV splitter and the classifier
DATA_PATH: Path = Path('../data')    # directory holding the CSV splits and cleaned-text pickles

**Data Loading**

In [219]:
# Read the three raw splits from DATA_PATH in a fixed order: train, valid, test.
train_df, valid_df, test_df = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train.csv', 'valid.csv', 'test.csv')
)

In [220]:
def _attach_clean_text(frame, pickle_name):
    """Load a pre-cleaned text pickle and copy its title/text into ``frame``.

    The pickle is read from ``DATA_PATH / pickle_name``. Its ``'clean_title'``
    and ``'clean_text'`` entries overwrite ``frame['title']`` and
    ``frame['text']`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split to update (modified in place).
    pickle_name : str
        File name of the pickle inside ``DATA_PATH``.

    Returns
    -------
    The unpickled object, so callers can keep a reference to it.

    Raises
    ------
    ValueError
        If a cleaned column does not have exactly one entry per row of
        ``frame``. If the cleaned columns are index-aligned Series, a mismatch
        would otherwise be silently NaN-filled on assignment and later hidden
        by ``fillna('')``.
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file; only load pickles produced by this project's own preprocessing.
    with open(DATA_PATH / pickle_name, 'rb') as f:
        cleaned = pickle.load(f)

    for target, source in (('title', 'clean_title'), ('text', 'clean_text')):
        values = cleaned[source]
        if len(values) != len(frame):
            raise ValueError(
                '{}[{!r}] has {} rows, expected {}'.format(
                    pickle_name, source, len(values), len(frame)))
        frame[target] = values

    return cleaned


X_train_clean = _attach_clean_text(train_df, 'X_train_ftfy_spacy.pkl')
X_valid_clean = _attach_clean_text(valid_df, 'X_valid_ftfy_spacy.pkl')
X_test_clean = _attach_clean_text(test_df, 'X_test_ftfy_spacy.pkl')

**Data Cleaning**

In [221]:
# Replace every missing value with an empty string so the vectorizers always
# receive text. fillna returns new frames; train_df/valid_df/test_df are unchanged.
X_train, X_valid, X_test = (
    split.fillna('') for split in (train_df, valid_df, test_df)
)

In [222]:
# Learn the label -> integer mapping on the training labels only, then apply the
# same mapping to both labelled splits as a new 'class' column.
le = LabelEncoder().fit(X_train['label'])
X_train['class'] = le.transform(X_train['label'])
X_valid['class'] = le.transform(X_valid['label'])

In [223]:
# Label names in encoded order: position in this array == integer value in 'class'.
le.classes_

array(['clickbait', 'news', 'other'], dtype=object)

**Training**

In [224]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``X[columns]``.

    Used as the first step of each feature branch so that a vectorizer
    receives a single column instead of the whole frame.

    Parameters
    ----------
    columns : label or list of labels, optional
        Passed unchanged to ``X[...]`` in :meth:`transform`.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data."""
        return self

    def transform(self, X, y=None):
        """Return the configured column selection of ``X``."""
        selection = X[self.columns]
        return selection

In [225]:
class NBTransformer(BaseEstimator, TransformerMixin):
    """Scale sparse features by a smoothed Naive-Bayes log-count ratio.

    ``fit`` sums the feature values of the rows labelled ``1`` and of the rows
    labelled ``0``, smooths both with ``alpha``, normalises each to a
    distribution over features and stores ``r_ = log(p / q)``.
    ``transform`` multiplies every row of ``X`` element-wise by ``r_``.

    NOTE: only labels ``1`` (positive) and ``0`` (negative) are used; rows with
    any other label do not contribute to ``r_``.

    Parameters
    ----------
    alpha : float, default=1.0
        Additive smoothing applied to both count vectors.

    Attributes
    ----------
    r_ : numpy.ndarray of shape (1, n_features)
        Log-count ratio learned in ``fit``.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y=None):
        """Compute the log-count ratio from sparse ``X`` and labels ``y``.

        ``y`` may be any array-like (pandas Series, list, ndarray).

        Raises
        ------
        ValueError
            If ``y`` is None; the ratio cannot be computed without labels.
        """
        if y is None:
            raise ValueError('NBTransformer.fit requires y to compute the log-count ratio')
        # np.asarray accepts Series, lists and ndarrays alike (the former
        # `y.values` only worked for pandas objects).
        y = np.asarray(y)

        # Sparse .sum(axis=0) yields np.matrix; convert to a plain (1, n) ndarray.
        pos_count = np.asarray(X[y == 1].sum(axis=0))
        neg_count = np.asarray(X[y == 0].sum(axis=0))
        n = X.shape[1]
        p = (pos_count + self.alpha) / (pos_count.sum() + self.alpha * n)
        q = (neg_count + self.alpha) / (neg_count.sum() + self.alpha * n)
        self.r_ = np.log(p / q)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with every column scaled by its log-count ratio (CSR)."""
        # Convert explicitly so the result is CSR, matching the vectorizers
        # in this notebook.
        return sparse.csr_matrix(X.multiply(self.r_))

In [226]:
class TfidfVectorizerPlus(TfidfVectorizer):
    """TfidfVectorizer with an optional extra fit corpus and pivoted normalization.

    Extra parameters (all other parameters are forwarded unchanged to
    ``TfidfVectorizer``):

    fit_add : pandas object, optional
        Additional documents concatenated (``pd.concat``) to ``X`` when fitting
        the vocabulary / IDF. They are never transformed.
    norm_type : {None, 'pivot_cosine', 'pivot_unique'}, default=None
        Post-processing applied to the parent's output in ``transform``:

        * ``None`` - parent output is returned unchanged;
        * ``'pivot_cosine'`` - each row is divided by
          ``(1 - slope) * pivot + slope * ||row||_2``;
        * ``'pivot_unique'`` - each row is divided by
          ``(1 - slope) * pivot + slope * (number of positive entries in row)``.

        The row statistics are taken from the parent's output, i.e. after the
        parent's own ``norm`` has been applied; pass ``norm=None`` if the
        pivoted normalization should see unnormalized rows.
    pivot, slope : float
        Constants of the pivoted normalization formula above.

    Raises
    ------
    ValueError
        From ``transform`` if ``norm_type`` is not one of the supported values.
    """

    def __init__(self, fit_add=None, norm_type=None, pivot=5, slope=0.2,
                 input='content', encoding='utf-8', decode_error='strict',
                 strip_accents=None, lowercase=True, preprocessor=None,
                 tokenizer=None, analyzer='word', stop_words=None,
                 token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                 binary=False, dtype=np.float64, norm='l2',
                 use_idf=True, smooth_idf=True, sublinear_tf=False):
        # Forward by keyword: newer scikit-learn releases make these parameters
        # keyword-only, and keywords are immune to changes in positional order.
        super().__init__(input=input, encoding=encoding, decode_error=decode_error,
                         strip_accents=strip_accents, lowercase=lowercase,
                         preprocessor=preprocessor, tokenizer=tokenizer,
                         analyzer=analyzer, stop_words=stop_words,
                         token_pattern=token_pattern, ngram_range=ngram_range,
                         max_df=max_df, min_df=min_df, max_features=max_features,
                         vocabulary=vocabulary, binary=binary, dtype=dtype, norm=norm,
                         use_idf=use_idf, smooth_idf=smooth_idf,
                         sublinear_tf=sublinear_tf)

        self.fit_add = fit_add
        self.norm_type = norm_type
        self.pivot = pivot
        self.slope = slope

    def fit(self, X, y=None):
        """Fit vocabulary and IDF on ``X`` plus ``fit_add`` (when given)."""
        if self.fit_add is not None:
            X_new = pd.concat([X, self.fit_add])
        else:
            X_new = X

        super().fit(X_new, y)
        return self

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (+ ``fit_add``) and return the transformed ``X``.

        Overridden on purpose: the inherited ``TfidfVectorizer.fit_transform``
        does not go through this class's ``fit``/``transform``. Pipelines call
        ``fit_transform`` on intermediate steps, so without this override
        ``fit_add`` and ``norm_type`` would be ignored while training but
        applied at prediction time.
        """
        return self.fit(X, y).transform(X)

    def transform(self, X, y=None):
        """Vectorize ``X`` and apply the pivoted normalization, if configured."""
        res = super().transform(X)

        if self.norm_type is None:
            return res

        if self.norm_type == 'pivot_cosine':
            # Column vector of per-row L2 norms.
            row_stat = sparse_linalg.norm(res, axis=1).reshape(-1, 1)
        elif self.norm_type == 'pivot_unique':
            # Column vector with the number of positive entries per row.
            row_stat = np.asarray((res > 0).sum(axis=1))
        else:
            raise ValueError('Incorrect normalization type')

        norm_factor = (1 - self.slope) * self.pivot + self.slope * row_stat
        return sparse.csr_matrix(res.multiply(1 / norm_factor))

In [227]:
class TextTruncater(BaseEstimator, TransformerMixin):
    """Stateless transformer that cuts every string to ``max_length`` characters.

    Parameters
    ----------
    max_length : int, optional
        Slice bound applied through the ``.str`` accessor of the input;
        ``None`` disables truncation and the input is passed through as-is.
    """

    def __init__(self, max_length=None):
        self.max_length = max_length

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` truncated to ``max_length`` characters per element."""
        if self.max_length is None:
            return X
        return X.str.slice(stop=self.max_length)

In [242]:
# Two parallel feature branches, one per text column: select the column, then
# TF-IDF it. The vectorizer and classifier placed here are placeholders; the
# 'features__*__vec' and 'clf' entries of param_grid replace them.
# Disabled experiments: an NBTransformer step named 'nb_features' after 'vec' in
# either branch, and FeatureUnion transformer_weights {'title': 0.4, 'text': 0.6}.
title_branch = Pipeline([
    ('extract', ColumnExtractor(columns='title')),
    ('vec', TfidfVectorizer()),
])
text_branch = Pipeline([
    ('extract', ColumnExtractor(columns='text')),
    ('vec', TfidfVectorizer()),
])

pipe = Pipeline([
    ('features', FeatureUnion([
        ('title', title_branch),
        ('text', text_branch),
    ])),
    ('clf', LinearSVC()),
])

In [243]:
# Search space for GridSearchCV. Every list currently holds a single value, so
# exactly one configuration is evaluated; trailing comments record the
# alternatives considered for each setting.

# TF-IDF settings for the `title` branch.
title_vec_grid = {
    'features__title__vec': [TfidfVectorizer()],
    'features__title__vec__strip_accents': ['ascii'],        # [None, 'unicode', 'ascii']
    'features__title__vec__lowercase': [True],               # [True, False]
    'features__title__vec__analyzer': ['word'],              # ['word', 'char', 'char_wb']
    'features__title__vec__stop_words': [None],              # [None, 'english']
    'features__title__vec__token_pattern': [r'\b\w+\b'],     # [r'\b\w+\b', r'(?u)\b\w\w+\b']
    'features__title__vec__ngram_range': [(1, 3)],           # [(1, 1), (1, 2), (1, 3)]
    'features__title__vec__max_df': [0.3],                   # [0.3, 0.4, 0.5]
    'features__title__vec__min_df': [1],                     # [1, 2, 3]
    'features__title__vec__max_features': [None],            # [None, 100000, 200000, 300000]
    'features__title__vec__binary': [False],
    'features__title__vec__use_idf': [True],                 # [True, False]
    'features__title__vec__smooth_idf': [False],             # [True, False]
    'features__title__vec__sublinear_tf': [False],           # [True, False]
}

# TF-IDF settings for the `text` branch.
text_vec_grid = {
    'features__text__vec': [TfidfVectorizer()],
    'features__text__vec__strip_accents': ['ascii'],         # [None, 'unicode', 'ascii']
    'features__text__vec__lowercase': [False],               # [True, False]
    'features__text__vec__analyzer': ['word'],               # ['word', 'char', 'char_wb']
    'features__text__vec__stop_words': [None],               # [None, 'english']
    'features__text__vec__token_pattern': [r'\b\w+\b'],      # [r'\b\w+\b', r'(?u)\b\w\w+\b']
    'features__text__vec__ngram_range': [(1, 2)],            # [(1, 1), (1, 2), (1, 3)]
    'features__text__vec__max_df': [0.8],
    'features__text__vec__min_df': [1],
    'features__text__vec__max_features': [200000],           # [50000, 100000, 150000]
    'features__text__vec__binary': [False],                  # [True, False]
    'features__text__vec__use_idf': [True],                  # [True, False]
    'features__text__vec__smooth_idf': [False],              # [True, False]
    'features__text__vec__sublinear_tf': [True],             # [True, False]
}

# Final classifier (replaces the LinearSVC placeholder of `pipe`).
clf_grid = {
    'clf': [LogisticRegression()],
    'clf__penalty': ['l2'],                                  # ['l1', 'l2']
    'clf__C': [5],                                           # np.logspace(-2, 2, 5)
    'clf__class_weight': ['balanced'],                       # [None, 'balanced']
    'clf__random_state': [RANDOM_SEED],
    'clf__solver': ['lbfgs'],
    'clf__max_iter': [200],
    'clf__multi_class': ['multinomial'],                     # ['ovr', 'multinomial']
}

# Disabled alternatives explored earlier:
#   * title vec: strip_accents None, ngram_range (1, 4), max_df 0.8,
#     max_features 70000, binary / smooth_idf / sublinear_tf True;
#   * text vec: ngram_range (1, 1)..(1, 3), max_df 0.9, max_features 200000..500000;
#   * 'features__text__nb_features__alpha': np.linspace(0.1, 1, 10);
#   * LinearSVC: C 0.4 with squared_hinge / dual False, or C 1 with
#     multi_class 'crammer_singer', class_weight 'balanced'.

# Merge in the original key order: title vec, text vec, classifier.
param_grid = {}
for grid_part in (title_vec_grid, text_vec_grid, clf_grid):
    param_grid.update(grid_part)

# Two stratified, shuffled folds with a fixed seed.
cv = StratifiedKFold(shuffle=True, n_splits=2, random_state=RANDOM_SEED)

In [244]:
# Cross-validated search over param_grid, scored by macro-averaged F1.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises TypeError. On 0.20/0.21 the default
# behaves like the former iid=True.
grid_search = GridSearchCV(pipe, param_grid, scoring='f1_macro',
                           cv=cv, n_jobs=-1, return_train_score=False,
                           verbose=2)

# The whole frame is passed as X; the ColumnExtractor steps pick 'title' and
# 'text', so the 'label' / 'class' columns never reach the model as features.
grid_search.fit(X_train, X_train['class'])

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=17, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('title', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(columns='title')), ('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'features__title__vec': [TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=No...state': [17], 'clf__solver': ['lbfgs'], 'clf__max_iter': [200], 'clf__multi_cla

In [245]:
# Mean cross-validated macro-F1 of the best candidate and the parameters that produced it.
grid_search.best_score_, grid_search.best_params_

(0.8077506273242253,
 {'clf': LogisticRegression(C=5, class_weight='balanced', dual=False,
            fit_intercept=True, intercept_scaling=1, max_iter=200,
            multi_class='multinomial', n_jobs=None, penalty='l2',
            random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
            warm_start=False),
  'clf__C': 5,
  'clf__class_weight': 'balanced',
  'clf__max_iter': 200,
  'clf__multi_class': 'multinomial',
  'clf__penalty': 'l2',
  'clf__random_state': 17,
  'clf__solver': 'lbfgs',
  'features__text__vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
          lowercase=False, max_df=0.8, max_features=200000, min_df=1,
          ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=False,
          stop_words=None, strip_accents='ascii', sublinear_tf=True,
          token_pattern='\\b\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None),
  'fe

In [246]:
# One column per candidate and one row per cv_results_ field (transposed for readability).
cv_results_df = pd.DataFrame(grid_search.cv_results_).transpose()
cv_results_df

Unnamed: 0,0
mean_fit_time,80.6462
std_fit_time,2.30501
mean_score_time,16.027
std_score_time,0.0552391
param_clf,"LogisticRegression(C=5, class_weight='balanced..."
param_clf__C,5
param_clf__class_weight,balanced
param_clf__max_iter,200
param_clf__multi_class,multinomial
param_clf__penalty,l2


In [258]:
# Fitted text-branch vectorizer of the best pipeline: number of terms kept in
# the vocabulary, then number of terms recorded in stop_words_.
text_vec = grid_search.best_estimator_.get_params()['features__text__vec']
print(len(text_vec.vocabulary_))
print(len(text_vec.stop_words_))

200000
2911532


In [247]:
# Predicted class id per validation row = column index of the highest probability.
val_proba = grid_search.predict_proba(X_valid)
y_val_pred = np.argmax(val_proba, axis=1)
y_val_pred

array([1, 1, 1, ..., 2, 1, 1])

In [248]:
# Map the predicted class ids back to their original label strings.
le.inverse_transform(y_val_pred)

array(['news', 'news', 'news', ..., 'other', 'news', 'news'], dtype=object)

In [249]:
# Share of validation predictions per class id (array position == class id).
np.bincount(y_val_pred) / len(y_val_pred)

array([0.13316441, 0.60416667, 0.26266892])

**Evaluation**

In [250]:
# Macro-averaged F1 on the validation split (same averaging as the 'f1_macro' search score).
f1_score(X_valid['class'], y_val_pred, average='macro')

0.8080476964917223

**Feature Importance**

*TODO: this section is currently empty; no feature-importance analysis has been added yet.*

**Predict & Submit**

In [251]:
# Stack the train and validation rows into one frame with a fresh 0..n-1 index.
full_train_df = pd.concat([X_train, X_valid], ignore_index=True)

In [252]:
# Refit the winning configuration on train + valid for the final submission.
# clone() returns a fresh, unfitted copy with the same parameters, so
# grid_search.best_estimator_ keeps its train-only fit. Refitting that object in
# place would make any later re-run of the validation cells score a model that
# has already seen the validation rows.
best_model = clone(grid_search.best_estimator_)
best_model.fit(full_train_df, full_train_df['class'])



Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('title', Pipeline(memory=None,
     steps=[('extract', ColumnExtractor(columns='title')), ('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp...l2',
          random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False))])

In [261]:
# Set the column names of the test frame positionally (in place).
# The engineered columns 'title_length', 'text_length', 'is_title_na' and
# 'is_text_na' are not part of the frame here.
X_test.columns = ['id', 'title', 'text']
X_test.head()

Unnamed: 0,id,title,text
0,0,Amazon_PROPN CEO_PROPN Jeff_PROPN Bezos_PROPN ...,More_ADJ Try_VERB Yahoo_PROPN Finance_PROPN on...
1,1,Does_VERB Laura_PROPN Dern_PROPN Handle_PROPN ...,More_ADJ Laura_PROPN Dern_PROPN seems_VERB to_...
2,2,In_ADP this_DET photographer_NOUN 's_PART home...,Kirkuk_PROPN is_VERB a_DET city_NOUN of_ADP No...
3,3,8_NUM Ways_PROPN To_PART Get_VERB Your_ADJ Spo...,Experts_NOUN say_VERB that_ADP communication_N...
4,4,US_PROPN says_VERB claim_VERB it_PRON supporte...,Share_VERB this_DET with_ADP Email_PROPN Faceb...


In [262]:
# Sanity check: every value of the 'id' column equals the row's index in test_df.
(test_df.index == X_test.id).all()

True

In [263]:
# Predicted class id per test row = column index of the highest probability.
test_proba = best_model.predict_proba(X_test)
y_test_pred = np.argmax(test_proba, axis=1)
y_test_pred

array([0, 0, 1, ..., 1, 1, 1])

In [264]:
# Convert predicted class ids to the label strings expected in the submission file.
y_test_label = le.inverse_transform(y_test_pred)
y_test_label

array(['clickbait', 'clickbait', 'news', ..., 'news', 'news', 'news'],
      dtype=object)

In [265]:
# Share of test predictions per class id (array position == class id).
np.bincount(y_test_pred) / len(y_test_label)

array([0.17496016, 0.74375775, 0.0812821 ])

In [266]:
# Two-column submission frame: the test ids plus the predicted label strings.
submission_df = X_test[['id']].assign(label=y_test_label)
submission_df.head()

Unnamed: 0,id,label
0,0,clickbait
1,1,clickbait
2,2,news
3,3,clickbait
4,4,news


In [267]:
# Write the submission to the working directory without the DataFrame index,
# so the file contains only the id and label columns.
submission_df.to_csv('submission.csv', index=False)

In [268]:
# Shell escape: show the first lines of the written file to check header and format.
!head submission.csv

id,label
0,clickbait
1,clickbait
2,news
3,clickbait
4,news
5,news
6,news
7,news
8,news


In [269]:
# Shell escape: line count of the file (header line + one line per test row).
!wc -l submission.csv

5648 submission.csv


In [None]:
# Shell escape: upload submission.csv to the competition given by -c, with -m as the
# submission message. NOTE(review): presumably needs the kaggle CLI installed and
# configured with API credentials - confirm before running.
!kaggle competitions submit -c dlinnlp-spring-2019-clf -f submission.csv -m "LR (mn)"

  0%|                                               | 0.00/59.3k [00:00<?, ?B/s]