In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_auc_score
from sklearn.decomposition import TruncatedSVD, PCA
from nltk.tokenize import RegexpTokenizer
import scipy


In [139]:
# Read the train and test feature CSVs, then print the row count of each frame.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_feature.csv', './test_feature.csv')
)

print(len(df_train), len(df_test), sep='\n')

27643
11847


In [140]:
import re
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Keep the stop words in a set: tokenize() tests membership once per token,
# and the list returned by stopwords.words() would be scanned linearly each time.
stop = set(stopwords.words('english'))

porter = PorterStemmer()
wnl = WordNetLemmatizer()  # was instantiated twice; a single instance is enough
tokenizer = RegexpTokenizer(r'\w+')
def prep(text):
    """Normalise raw text before tokenisation.

    Every character that is not an ASCII letter is replaced by a single space,
    the result is lower-cased, and leading/trailing whitespace is removed.
    Runs of interior spaces are left as they are. No HTML parsing, stop-word
    removal or stemming happens here.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return letters_only.lower().strip()

def tokenize(text):
    """Split ``text`` on whitespace and return Porter-stemmed tokens.

    Tokens found in ``stop`` and tokens that do not start with an ASCII
    letter (including the empty string produced by blank input) are dropped.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Raw string literals: '\s' inside a normal string is an invalid escape
    # sequence that newer Python versions warn about.
    words = re.split(r'\s+', text.strip())
    return [porter.stem(w) for w in words
            if w not in stop and re.match(r'[a-zA-Z]+', w)]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nlplab/harry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [141]:
# List the column names of the training frame.
df_train.columns

Index(['Id', 'Popularity', 'topic', 'channel', 'weekday', 'pub_date', 'author',
       'img count', 'title', 'content', 'media count', 'n_tokens_title',
       'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words',
       'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs',
       'day_of_month', 'month', 'hour'],
      dtype='object')

In [142]:
# Min-max scale the count-style columns of the training frame.
# The scaler is fitted on training rows only and is kept for the test frame.
from sklearn.preprocessing import MinMaxScaler
numerical = ['img count', 'media count', 'n_tokens_content', 'n_tokens_title', 'num_hrefs', 'num_self_hrefs']
scaler = MinMaxScaler().fit(df_train[numerical])
df_train[numerical] = scaler.transform(df_train[numerical])


In [143]:
# Apply the scaler fitted on the training frame to the same columns of the
# test frame (transform only, no refit).
df_test[numerical] = scaler.transform(df_test[numerical])

In [144]:
# Display the training frame after the numerical columns have been rescaled.
df_train

Unnamed: 0,Id,Popularity,topic,channel,weekday,pub_date,author,img count,title,content,...,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,day_of_month,month,hour
0,0,-1,Asteroid Asteroids challenge Earth Space U.S. ...,world,Wed,"Wed, 19 Jun 2013 15:04:30 +0000",,0.008929,NASA's Grand Challenge: Stop Asteroids From De...,There may be killer asteroids headed for Eart...,...,0.333333,0.069166,0.530435,0.594783,0.436522,0.064516,0.000000,19,6,15
1,1,1,Apps and Software Google open source opn pledg...,tech,Thu,"Thu, 28 Mar 2013 17:40:55 +0000",Christina Warren,0.017857,Google's New Open Source Patent Pledge: We Won...,Google took a stand of sorts against patent-l...,...,0.555556,0.035758,0.511475,0.606557,0.377049,0.051613,0.025424,28,3,17
2,2,1,Entertainment NFL NFL Draft Sports Television,entertainment,Wed,"Wed, 07 May 2014 19:15:20 +0000",Sam Laird,0.017857,Ballin': 2014 NFL Draft Picks Get to Choose Th...,You've spend countless hours training to be a...,...,0.555556,0.135857,0.527828,0.612208,0.451526,0.029032,0.033898,7,5,19
3,3,-1,Sports Video Videos Watercooler,watercooler,Fri,"Fri, 11 Oct 2013 02:26:50 +0000",Sam Laird,0.008929,Cameraperson Fails Deliver Slapstick Laughs,Tired of the same old sports fails and ne...,...,0.166667,0.031923,0.733577,0.715328,0.591241,0.035484,0.050847,11,10,2
4,4,-1,Entertainment instagram instagram video NFL Sp...,entertainment,Thu,"Thu, 17 Apr 2014 03:31:43 +0000",Connor Finnegan,0.464286,NFL Star Helps Young Fan Prove Friendship With...,"At 6-foot-5 and 298 pounds, All-Pro NFL star ...",...,0.444444,0.167533,0.516058,0.734307,0.478832,0.045161,0.059322,17,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27638,27638,-1,cuba Internet freedom U.S. World USAID,world,Tue,"Tue, 08 Apr 2014 16:26:31 +0000",Lorenzo Franceschi-Bicchierai,0.017857,Chief of USAID Doesn't Know Who Created 'Cuban...,The chief of the U.S. Agency for Internationa...,...,0.388889,0.035511,0.551155,0.551155,0.415842,0.032258,0.033898,8,4,16
27639,27639,-1,Apps and Software Dev & Design Gadgets Hardwar...,tech,Wed,"Wed, 09 Jul 2014 01:03:24 +0000",Adario Strange,0.026786,Photo of Samsung's Rumored Virtual Reality Hea...,"Back in May, reports surfaced claiming that S...",...,0.388889,0.037243,0.548896,0.589905,0.429022,0.067742,0.042373,9,7,1
27640,27640,-1,Food hot dogs humor Photography Watercooler,watercooler,Thu,"Thu, 10 Jul 2014 12:30:13 +0000",Christine Erickson,0.133929,14 Dogs That Frankly Cannot Take the Heat,There's nothing more helpless than the middle...,...,0.333333,0.019055,0.647059,0.664706,0.464706,0.067742,0.016949,10,7,12
27641,27641,-1,Business marissa mayer Media stocks Yahoo,business,Tue,"Tue, 16 Apr 2013 20:49:16 +0000",Seth Fiegerman,0.026786,"Yahoo Earnings Beat Estimates, But Core Proble...",Yahoo's profits in the first quarter beat Wal...,...,0.333333,0.051225,0.586047,0.604651,0.467442,0.048387,0.059322,16,4,20


In [145]:
# Pool topic / channel / title over train followed by test, as lists of
# unicode strings (astype('U') converts every value to its string form).
all_topic, all_channel, all_titles = (
    df_train[col].values.astype('U').tolist() + df_test[col].values.astype('U').tolist()
    for col in ('topic', 'channel', 'title')
)


In [146]:
# Article bodies as plain Python lists: train, test, and train followed by test.
train_contents = [*df_train['content'].values]
test_contents = [*df_test['content'].values]
all_contents = [*train_contents, *test_contents]

In [147]:

# tfidf_vectorize = TfidfVectorizer(preprocessor=prep,  tokenizer=tokenize,
#                                   ngram_range=(1,1), 
#                                   sublinear_tf = True,
#                                   max_features = 20000,
#                                   dtype = np.float32)
# tfidf_vectorize.fit(all_contents)
# train_content_tfidf = tfidf_vectorize.transform(train_contents)
# test_content_tfidf = tfidf_vectorize.transform(test_contents)

In [148]:
# print(train_content_tfidf.shape)
# print(test_content_tfidf.shape)

In [149]:
# vocab = tfidf_vectorize.get_feature_names()
# print(vocab[:500])
# print(vocab[-500:])

In [150]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hashing vectorizer producing 1024 columns, using the notebook's own
# prep() for cleaning and tokenize() for stop-word removal and stemming.
# It is used below via transform() without a prior fit().
hashvec = HashingVectorizer(n_features=1024,
                            preprocessor=prep,
                            tokenizer=tokenize)

In [151]:
# TF-IDF vectorizer sharing prep()/tokenize(): unigrams only, sublinear tf
# scaling, vocabulary capped at 500 terms, float32 output.
# In the visible notebook it is only referenced from commented-out cells.
tfidf_vectorize = TfidfVectorizer(preprocessor=prep,  tokenizer=tokenize,
                                  ngram_range=(1,1), 
                                  sublinear_tf = True,
                                  max_features = 500,
                                  dtype = np.float32)

In [152]:
# Disabled experiment: TF-IDF features for the topic column.
# topic_tfidf = tfidf_vectorize.fit(all_topic)
# train_topic = tfidf_vectorize.transform(df_train['topic'].values.astype('U').tolist())
# test_topic = tfidf_vectorize.transform(df_test['topic'].values.astype('U').tolist())

# The shape checks are disabled together with the code above: train_topic and
# test_topic are not defined while it is commented out, so printing them
# raised NameError on a fresh kernel.
# print(train_topic.shape)
# print(test_topic.shape)



In [153]:
# Hash the topic strings of both frames into sparse matrices and print their shapes.
train_topic_hash, test_topic_hash = (
    hashvec.transform(frame['topic'].values.astype('U').tolist())
    for frame in (df_train, df_test)
)
print(train_topic_hash.shape)
print(test_topic_hash.shape)


(27643, 1024)
(11847, 1024)


In [156]:
# tfidf_vectorize = TfidfVectorizer(preprocessor=prep,  tokenizer=tokenize,
#                                   ngram_range=(1,1), 
#                                   sublinear_tf = True,
#                                   max_features = 500,
#                                   dtype = np.float32)
# title_tfidf = tfidf_vectorize.fit(all_titles)
# train_title = tfidf_vectorize.transform(df_train['title'].values.astype('U').tolist())
# test_title = tfidf_vectorize.transform(df_test['title'].values.astype('U').tolist())

# print(train_title.shape)
# print(test_title.shape)

In [157]:
# channle_tfidf = tfidf_vectorize.fit(all_channel)
# train_channel = tfidf_vectorize.transform(df_train['channel'].values.astype('U').tolist())
# test_channel = tfidf_vectorize.transform(df_test['channel'].values.astype('U').tolist())

In [158]:
# print(train_channel.shape)
# print(test_channel.shape)

In [159]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [160]:
# One-hot encode `channel`. Categories are learned from the training column
# only; with handle_unknown='ignore', a test value that was not seen during
# fitting encodes as an all-zero row.
OHE = OneHotEncoder(handle_unknown='ignore').fit(df_train['channel'].values.reshape(-1, 1))
train_ohe_channel, test_ohe_channel = (
    OHE.transform(frame['channel'].values.reshape(-1, 1)).toarray()
    for frame in (df_train, df_test)
)
print(train_ohe_channel.shape)
print(test_ohe_channel.shape)

(27643, 33)
(11847, 33)


In [161]:
# One-hot encode `weekday`, with categories learned from train and test together.
all_week_day = [*df_train['weekday'].values, *df_test['weekday'].values]
OHE = OneHotEncoder(handle_unknown='ignore').fit(np.array(all_week_day).reshape(-1, 1))

train_ohe_weekday, test_ohe_weekday = (
    OHE.transform(frame['weekday'].values.reshape(-1, 1)).toarray()
    for frame in (df_train, df_test)
)
print(train_ohe_weekday.shape)
print(test_ohe_weekday.shape)

(27643, 8)
(11847, 8)


In [162]:
# One-hot encode `author`, with categories learned from train and test together.
# NOTE(review): np.array() on a list mixing strings and NaN produces a unicode
# array in which missing authors become the string 'nan', while transform()
# below receives the original column values - confirm missing authors are
# encoded the way you intend.
all_author = [*df_train['author'].values, *df_test['author'].values]
OHE = OneHotEncoder(handle_unknown='ignore').fit(np.array(all_author).reshape(-1, 1))

train_ohe_author, test_ohe_author = (
    OHE.transform(frame['author'].values.reshape(-1, 1)).toarray()
    for frame in (df_train, df_test)
)

print(train_ohe_author.shape)
print(test_ohe_author.shape)

(27643, 470)
(11847, 470)


In [163]:
# List the column names again before assembling the feature matrix.
df_train.columns

Index(['Id', 'Popularity', 'topic', 'channel', 'weekday', 'pub_date', 'author',
       'img count', 'title', 'content', 'media count', 'n_tokens_title',
       'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words',
       'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs',
       'day_of_month', 'month', 'hour'],
      dtype='object')

In [164]:
# Scalar columns appended after the one-hot blocks, in this exact order.
# Keeping a single list guarantees train and test use identical columns.
SCALAR_FEATURES = [
    'img count', 'media count', 'n_tokens_title', 'n_tokens_content',
    'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
    'num_hrefs', 'num_self_hrefs', 'day_of_month', 'month', 'hour',
]


def _assemble_features(frame, one_hot_blocks):
    """Stack one-hot blocks and scalar columns into one dense 2-D array.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame providing every column named in ``SCALAR_FEATURES``.
    one_hot_blocks : sequence of 2-D numpy arrays
        Dense one-hot matrices with one row per row of ``frame``; they are
        placed first, in the order given.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, total one-hot columns + len(SCALAR_FEATURES)).
    """
    scalar_columns = [np.expand_dims(frame[col].values, axis=-1) for col in SCALAR_FEATURES]
    return np.concatenate([*one_hot_blocks, *scalar_columns], axis=1)


# Text-derived features (content/title/topic TF-IDF, topic hashing) are not
# part of the matrix; only the one-hot and scalar features are used.
x_train = _assemble_features(df_train, [train_ohe_channel, train_ohe_weekday, train_ohe_author])
x_test = _assemble_features(df_test, [test_ohe_channel, test_ohe_weekday, test_ohe_author])

# Work on a copy of the labels: to_numpy() may hand back a view of the
# DataFrame column (or a read-only array under copy-on-write), so remapping
# -1 -> 0 in place could silently alter df_train or fail outright.
y_train = df_train['Popularity'].to_numpy(copy=True)
y_train[y_train == -1] = 0

print(x_train.shape, y_train.shape)
print(x_test.shape)

(27643, 523) (27643,)
(11847, 523)


In [165]:
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)

In [166]:
import xgboost as xgb
from xgboost import XGBClassifier
from scipy import stats

In [167]:
# Wrap the feature matrices in XGBoost DMatrix objects; only the training
# matrix carries labels. No validation matrix is built in this cell.
d_train = xgb.DMatrix(data=x_train, label=y_train)
d_test = xgb.DMatrix(data=x_test)

In [168]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.model_selection import RandomizedSearchCV, KFold

# Shuffled, seeded 5-fold stratified splitter. In the visible notebook it is
# only referenced by the commented-out RandomizedSearchCV cell.
kfold = StratifiedKFold(n_splits = 5, random_state = 2021 ,shuffle=True)

In [169]:
# param_dist = {'n_estimators': stats.randint(150, 500),
#               'learning_rate': stats.uniform(0.01, 0.07),
#               'subsample': stats.uniform(0.3, 0.7),
#               'max_depth': [3, 4, 5, 6, 7, 8, 9],
#               'colsample_bytree': stats.uniform(0.5, 0.45),
#               'min_child_weight': [1, 2, 3]
#              }

# fit_params = {
#     'eval_metric': "auc",
#     'early_stopping_rounds': 50,
#     'verbose' :  True,
#     'objective' :'binary:logistic'
# }

# xgb_model = xgb.XGBClassifier(**fit_params)

In [170]:
# clf = RandomizedSearchCV(xgb_model, 
#                          param_distributions = param_dist, 
#                          n_iter = 20, 
#                          scoring='roc_auc', 
#                          verbose = 3, 
#                          cv=kfold,
#                          n_jobs = 2, refit=True)
# clf.fit(x_train, y_train)

In [171]:
# Settings for a binary logistic model evaluated with AUC.
# lambda, alpha and n_estimators are deliberately not set here.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1.5,
    objective='binary:logistic',
    eval_metric='auc',
)
xgb_model = xgb.XGBClassifier(**xgb_params)
# From here on xgb_params holds the parameter dict reported by the estimator;
# that dict is what xgb.cv() and xgb.train() receive further down.
xgb_params = xgb_model.get_xgb_params()

In [172]:
# 5-fold stratified cross-validation with the parameters above: at most 1000
# boosting rounds, progress printed every 10 rounds, and early stopping once
# the AUC has not improved for 50 rounds.
cvresult = xgb.cv(xgb_params, d_train, num_boost_round=1000, verbose_eval=10, nfold=5, metrics=['auc'],
     early_stopping_rounds=50, stratified=True)

[0]	train-auc:0.58665+0.00162	test-auc:0.57602+0.00987
[10]	train-auc:0.61243+0.00075	test-auc:0.58598+0.00814
[20]	train-auc:0.62242+0.00120	test-auc:0.58737+0.00800
[30]	train-auc:0.63114+0.00158	test-auc:0.58812+0.00789
[40]	train-auc:0.63963+0.00090	test-auc:0.58770+0.00726
[50]	train-auc:0.64681+0.00169	test-auc:0.58820+0.00781
[60]	train-auc:0.65456+0.00124	test-auc:0.58787+0.00775
[70]	train-auc:0.66122+0.00137	test-auc:0.58741+0.00736
[80]	train-auc:0.66730+0.00105	test-auc:0.58714+0.00747
[90]	train-auc:0.67201+0.00120	test-auc:0.58739+0.00773
[99]	train-auc:0.67657+0.00134	test-auc:0.58691+0.00762


In [133]:
# Number of rows in the CV result table; the next cell uses it as the number
# of trees for the final classifier.
cvresult.shape[0]

50

In [134]:
# Set the classifier's tree count to the number of rows in the CV result.
xgb_model.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8,
              enable_categorical=False, eta=0.05, eval_metric='auc', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=5,
              min_child_weight=1.5, missing=nan, monotone_constraints=None,
              n_estimators=50, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=0.8,
              tree_method=None, validate_parameters=None, verbosity=None)

In [135]:
# Train the final classifier on the full training matrix.
# The AUC metric is already part of the estimator's parameters, so it is not
# passed again: eval_metric as a fit() argument is deprecated in recent
# XGBoost releases, and verbose only affects logging when an eval_set is given.
xgb_model.fit(x_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, eta=0.05, eval_metric='auc', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.0500000007, max_delta_step=0, max_depth=5,
              min_child_weight=1.5, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=36, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [137]:
# Score the test matrix with the fitted classifier and write the submission.
file_name = './outputs/xgb_topic_hash.csv'
# Second predict_proba column, i.e. the probability of the class labelled 1.
y_pred = xgb_model.predict_proba(x_test)[:, 1]
df_submission = pd.read_csv('./sample_submission.csv').assign(Popularity=y_pred)
df_submission.to_csv(file_name, index=False)

In [None]:
# Hold out 20% of the training rows as a validation set for early stopping.
# The split is made here because x_val / y_val are not defined anywhere else
# (the earlier train_test_split cell is commented out). Separate names are
# used so x_train / y_train keep the full data for the other cells.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, stratify=y_train, random_state=2021)

d_train = xgb.DMatrix(x_fit, label=y_fit)
d_valid = xgb.DMatrix(x_val, label=y_val)
d_test = xgb.DMatrix(x_test)

In [None]:
# Keyword arguments passed to the XGBClassifier constructor below.
# NOTE(review): the name suggests these were meant for fit(); whether
# 'early_stopping_rounds' and 'verbose' are accepted by the constructor
# depends on the installed XGBoost version - confirm.
fit_params = {
    'eval_metric': "auc",
    'early_stopping_rounds': 30,
    'verbose' :  True,
    'objective' :'binary:logistic'
}
# Rebinds xgb_model, replacing the classifier fitted earlier. The later cells
# visible in this notebook train through xgb.train() and do not use this object.
xgb_model = xgb.XGBClassifier(**fit_params)

In [None]:
# Train with the native API: at most 2000 rounds, metrics logged every 10
# rounds for both watchlist entries, stopping once 50 rounds pass without
# improvement.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
model = xgb.train(
    params=xgb_params,
    dtrain=d_train,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
# Score the test DMatrix with the booster from xgb.train() and write the submission.
file_name = './outputs/xgb.csv'
# The booster holds the trees from every round that was run, including the
# rounds after the best validation score. Restrict prediction to the best
# early-stopping iteration so those extra trees are not used.
y_pred = model.predict(d_test, iteration_range=(0, model.best_iteration + 1))
df_submission = pd.read_csv('./sample_submission.csv')
df_submission['Popularity'] = y_pred
df_submission.to_csv(file_name, index=False)

In [None]:
# Fit a 200-tree, entropy-criterion random forest with a fixed seed, using two
# worker processes. Its feature importances are read in the next cell.
forest_options = dict(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2)
forest = RandomForestClassifier(**forest_options)
forest.fit(x_train, y_train)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Print every feature's importance, from most to least important.
# Each line shows the rank, the column index wrapped in a list, and the score.
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # column indices, highest importance first

for rank, column_index in enumerate(indices[:x_train.shape[1]], start=1):
    print("%2d) %-*s %f" % (rank, 30, [column_index], importances[column_index]))