In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_auc_score
from sklearn.decomposition import TruncatedSVD, PCA
from nltk.tokenize import RegexpTokenizer
import scipy


In [2]:
df_train = pd.read_csv('./train_feature.csv')
df_test = pd.read_csv('./test_feature.csv')

print(len(df_train))
print(len(df_test))

27643
11847


In [3]:
import re
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

nltk.download('stopwords')
stop = stopwords.words('english')

porter = PorterStemmer()
wnl = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

def prep(text):
    
    # Remove HTML tags.
#     text = BeautifulSoup(text,'html.parser').get_text()
    
    text = re.sub("[^a-zA-Z]", "", text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    
    
    token  = nltk.word_tokenize(text)
    
    text = [porter.stem(w) for w in token if w not in stop]
    
    # Join the words back into one string separated by space, and return the result.
    return " ".join(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nlplab/harry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df_train.columns

Index(['Id', 'Popularity', 'topic', 'channel', 'weekday', 'author',
       'img count', 'title', 'content', 'media count', 'n_tokens_title',
       'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words',
       'n_non_stop_unique_tokens'],
      dtype='object')

In [5]:
# Normalize the numerical features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical = ['img count', 'media count', 'n_tokens_content', 'n_tokens_title']
df_train[numerical] = scaler.fit_transform(df_train[numerical])


In [6]:
# numerical = ['img count', 'media count']
df_test[numerical] = scaler.transform(df_test[numerical])

In [7]:
df_train['img count'].values

array([0.00892857, 0.01785714, 0.01785714, ..., 0.13392857, 0.02678571,
       0.00892857])

In [8]:
all_topic = df_train['topic'].values.astype('U').tolist() + df_test['topic'].values.astype('U').tolist()
all_channel = df_train['channel'].values.astype('U').tolist() + df_test['channel'].values.astype('U').tolist()
all_titles = df_train['title'].values.astype('U').tolist() + df_test['title'].values.astype('U').tolist()


In [9]:
tfidf_vectorize = TfidfVectorizer(preprocessor=prep, 
                                  ngram_range=(1,1), 
                                  sublinear_tf = True,
                                  max_features = 4000,
                                  dtype = np.float32)

In [10]:
topic_tfidf = tfidf_vectorize.fit(all_topic)
train_topic = tfidf_vectorize.transform(df_train['topic'].values.astype('U').tolist())
test_topic = tfidf_vectorize.transform(df_test['topic'].values.astype('U').tolist())

In [11]:
print(train_topic.shape)
print(test_topic.shape)

(27643, 4000)
(11847, 4000)


In [12]:
title_tfidf = tfidf_vectorize.fit(all_titles)
train_title = tfidf_vectorize.transform(df_train['title'].values.astype('U').tolist())
test_title = tfidf_vectorize.transform(df_test['title'].values.astype('U').tolist())

print(train_title.shape)
print(test_title.shape)

(27643, 4000)
(11847, 4000)


In [13]:
channle_tfidf = tfidf_vectorize.fit(all_channel)
train_channel = tfidf_vectorize.transform(df_train['channel'].values.astype('U').tolist())
test_channel = tfidf_vectorize.transform(df_test['channel'].values.astype('U').tolist())

In [14]:
print(train_channel.shape)
print(test_channel.shape)

(27643, 33)
(11847, 33)


In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [16]:
# ohe channel
OHE = OneHotEncoder(handle_unknown='ignore')
train_ohe_channel = OHE.fit_transform(df_train['channel'].values.reshape(-1,1)).toarray()

# OHE = OneHotEncoder(handle_unknown='ignore')
test_ohe_channel = OHE.transform(df_test['channel'].values.reshape(-1,1)).toarray()
print(train_ohe_channel.shape)
print(test_ohe_channel.shape)

(27643, 33)
(11847, 33)


In [17]:
# ohe weekday
OHE = OneHotEncoder(handle_unknown='ignore')
train_ohe_weekday = OHE.fit_transform(df_train['weekday'].values.reshape(-1,1)).toarray()
test_ohe_weekday = OHE.transform(df_test['weekday'].values.reshape(-1,1)).toarray()
print(train_ohe_weekday.shape)
print(test_ohe_weekday.shape)

(27643, 7)
(11847, 7)


In [18]:
# ohe author
OHE = OneHotEncoder(handle_unknown='ignore')
train_ohe_author = OHE.fit_transform(df_train['author'].values.reshape(-1,1)).toarray()
test_ohe_author = OHE.transform(df_test['author'].values.reshape(-1,1)).toarray()

print(train_ohe_author.shape)
print(test_ohe_author.shape)

(27643, 428)
(11847, 428)


In [19]:
train_topic.toarray().shape
train_title.toarray().shape
train_channel.toarray().shape
# train_ohe_weekday.shape
# train_ohe_author.shape
# .shape

(27643, 33)

In [20]:
df_train.columns

Index(['Id', 'Popularity', 'topic', 'channel', 'weekday', 'author',
       'img count', 'title', 'content', 'media count', 'n_tokens_title',
       'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words',
       'n_non_stop_unique_tokens'],
      dtype='object')

In [49]:
x_train = np.concatenate([
#                           train_title.toarray(), 
#                           train_topic.toarray(), 
#                           train_channel.toarray(), 
                          train_ohe_channel,
                          train_ohe_weekday,
                          train_ohe_author, 
                          np.expand_dims(df_train['img count'].values, axis=-1),
                          np.expand_dims(df_train['media count'].values, axis=-1),
                            np.expand_dims(df_train['n_tokens_title'].values, axis=-1),
                            np.expand_dims(df_train['n_tokens_content'].values, axis=-1),
                            np.expand_dims(df_train['n_unique_tokens'].values, axis=-1),
                            np.expand_dims(df_train['n_non_stop_words'].values, axis=-1),
#                             np.expand_dims(df_train['n_non_stop_unique_tokens'].values, axis=-1),
                            ], axis=1)

x_test = np.concatenate([
#                           test_title.toarray(), 
#                           test_topic.toarray(), 
#                           test_channel.toarray(), 
                          test_ohe_channel,
                          test_ohe_weekday,
                          test_ohe_author, 
                          np.expand_dims(df_test['img count'].values, axis=-1),
                          np.expand_dims(df_test['media count'].values, axis=-1),
                            np.expand_dims(df_test['n_tokens_title'].values, axis=-1),
                            np.expand_dims(df_test['n_tokens_content'].values, axis=-1),
                            np.expand_dims(df_test['n_unique_tokens'].values, axis=-1),
                            np.expand_dims(df_test['n_non_stop_words'].values, axis=-1),
#                             np.expand_dims(df_test['n_non_stop_unique_tokens'].values, axis=-1)
                            ], axis=1)

y_train = df_train['Popularity'].to_numpy()
y_train[y_train==-1] = 0

print(x_train.shape, y_train.shape)
print(x_test.shape)

(27643, 474) (27643,)
(11847, 474)


In [50]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)

In [51]:
import xgboost as xgb
from xgboost import XGBClassifier

In [52]:
d_train = xgb.DMatrix(x_train, y_train)
d_valid = xgb.DMatrix(x_val, y_val)
d_test = xgb.DMatrix(x_test)

In [55]:
xgb_params = {'eta': 0.02, 
              'max_depth': 6, 
              'subsample': 1, 
              'colsample_bytree': 1,
              'min_child_weight' : 1.5,
              'objective': 'binary:logistic', 
              'eval_metric': 'auc', 
#               'lambda': 1.5,
#               'alpha': 0.6,
              'n_estimators': 600,
             }

In [56]:
fit_params = {
    'eval_metric': "auc",
    'early_stopping_rounds': 30,
    'verbose' :  True,
    'objective' :'binary:logistic'
}
xgb_model = xgb.XGBClassifier(**fit_params)

In [57]:
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
model = xgb.train(xgb_params, d_train, 2000, watchlist, verbose_eval=10, early_stopping_rounds=50)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.57047	valid-auc:0.54499
[10]	train-auc:0.57703	valid-auc:0.54822
[20]	train-auc:0.58448	valid-auc:0.55032
[30]	train-auc:0.58815	valid-auc:0.55262
[40]	train-auc:0.59153	valid-auc:0.55323
[50]	train-auc:0.59944	valid-auc:0.55611
[60]	train-auc:0.60472	valid-auc:0.55803
[70]	train-auc:0.60998	valid-auc:0.55907
[80]	train-auc:0.61633	valid-auc:0.56142
[90]	train-auc:0.62205	valid-auc:0.56323
[100]	train-auc:0.62707	valid-auc:0.56470
[110]	train-auc:0.63049	valid-auc:0.56512
[120]	train-auc:0.63511	valid-auc:0.56529
[130]	train-auc:0.63671	valid-auc:0.56543
[140]	train-auc:0.63870	valid-auc:0.56513
[150]	train-auc:0.64093	valid-auc:0.56522
[160]	train-auc:0.64

In [58]:
file_name = './outputs/xgb.csv'
y_pred = model.predict(d_test)
df_submission = pd.read_csv('./sample_submission.csv')
df_submission['Popularity'] = y_pred
df_submission.to_csv(file_name, index=False)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
0.541255122112588


In [37]:
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200, 
                                random_state=1,
                                n_jobs=2)
forest.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=200, n_jobs=2,
                       random_state=1)

In [43]:
from sklearn.feature_selection import SelectFromModel

importances = forest.feature_importances_
# get sort indices in descending order
indices = np.argsort(importances)[::-1]

for f in range(x_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, 
                            [indices[f]], 
                            importances[indices[f]]))

 1) [473]                          0.120693
 2) [474]                          0.119217
 3) [472]                          0.118320
 4) [471]                          0.117598
 5) [470]                          0.071868
 6) [468]                          0.066164
 7) [469]                          0.038207
 8) [28]                           0.008662
 9) [7]                            0.008631
10) [467]                          0.008506
11) [3]                            0.007470
12) [32]                           0.007268
13) [34]                           0.007250
14) [37]                           0.006995
15) [31]                           0.006981
16) [38]                           0.006574
17) [39]                           0.006337
18) [35]                           0.005262
19) [33]                           0.005132
20) [347]                          0.005094
21) [24]                           0.004705
22) [392]                          0.004650
23) [416]                       