In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_auc_score
from sklearn.decomposition import TruncatedSVD, PCA
from nltk.tokenize import RegexpTokenizer
import scipy
import xgboost as xgb
from xgboost import XGBClassifier
from scipy import stats
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.model_selection import RandomizedSearchCV, KFold
from summa import keywords


In [2]:
# Read the train and test feature tables from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_feature.csv', './test_feature.csv')
)
# Stack both splits (each keeps its own row labels) and report the three sizes.
df_all = pd.concat([df_train, df_test])
for frame in (df_train, df_test, df_all):
    print(len(frame))

27643
11847
39490


In [3]:
# Binary weekend flag. The weekday labels are matched with their leading
# space intact (' Sat' / ' Sun'); every other value maps to 0.
WEEKEND_LABELS = (' Sat', ' Sun')
for frame in (df_train, df_test):
    frame['is_weekend'] = frame['weekday'].apply(
        lambda day: 1 if day in WEEKEND_LABELS else 0
    )

In [4]:
# # month
# def pop_month(x):
#     if x == 3:
#         return 1
#     elif x == 10:
#         return -1 # -1 means not popular
#     else:
#         return 0

# df_train['popular_month'] = df_train['month'].apply(pop_month)
# df_test['popular_month'] = df_test['month'].apply(pop_month)
# df_train.corr()['popular_month']['Popularity']

In [5]:
# # hours
# def pop_hour(x):
#     if x == 13 or x == 21:
#         return -1
#     elif x == 5:
#         return 1
#     else:
#         return 0

# df_train['popular_hour'] = df_train['hour'].apply(pop_hour)
# df_test['popular_hour'] = df_test['hour'].apply(pop_hour)
# df_train.corr()['popular_hour']['Popularity']


In [6]:
# # channel 
# # just watch the EDA and assign weights
# def pop_channel(x):
#     if x == 'social-media' or x == 'tech':
#         return 2
#     elif x == 'marketing' or x == 'lifestyle':
#         return 1
#     elif x == 'world' or x == 'entertainment':
#         return -2
#     elif x == 'business':
#         return -1
#     else:
#         return 0

# df_train['popular_channel'] = df_train['channel'].apply(pop_channel)
# df_test['popular_channel'] = df_test['channel'].apply(pop_channel)

In [7]:
# def map_popularity_author(col):
#     df = df_train.groupby(f'{col}').mean().reset_index().sort_values(by='Popularity', ascending=False) \
#               [[f'{col}', 'Popularity']]
#     df.columns=[f'{col}', 'avg_popularity']
    
#     '''
#     pop_5 = df[df['avg_popularity'] >= 0.7][f'{col}'].values
#     pop_4 = df[(df['avg_popularity'] < 0.7) & (df['avg_popularity'] >= 0.6)][f'{col}'].values
#     pop_3 = df[(df['avg_popularity'] < 0.6) & (df['avg_popularity'] >= 0.5)][f'{col}'].values
#     pop_2 = df[(df['avg_popularity'] < 0.5) & (df['avg_popularity'] >= 0.4)][f'{col}'].values
#     pop_1 = df[(df['avg_popularity'] < 0.4) & (df['avg_popularity'] >= 0.3)][f'{col}'].values
#     pop_0 = df[df['avg_popularity'] < 0.3][f'{col}'].values
#     '''
#     pop_5 = df[df['avg_popularity'] >= 0.5][f'{col}'].values
#     pop_2 = df[(df['avg_popularity'] >= 0.2) & (df['avg_popularity'] < 0.5)][f'{col}'].values
#     unpop_2 = df[(df['avg_popularity'] <= -0.2) & (df['avg_popularity'] >= -0.5)][f'{col}'].values
#     unpop_5 = df[df['avg_popularity'] < -0.5][f'{col}'].values
    
#     def lambda_fxn(x):
#         '''
#         if x in pop_5:
#             return 5
#         elif x in pop_4:
#             return 4
#         elif x in pop_3:
#             return 3
#         elif x in pop_2:
#             return 2
#         elif x in pop_1:
#             return 1
#         elif x in pop_0:
#             return -1
            
#         # To catch news desks/sections/subsections/material in test but not in train
#         else:
#             return 0
#         '''
        
#         if x in pop_5:
#             return 5
#         elif x in pop_2:
#             return 2
#         elif x in unpop_5:
#             return -5
#         elif x in unpop_2:
#             return -2
#         else:
#             return 0
        
    
#     df_train[f'popular_{col}'] = df_train[f'{col}'].apply(lambda_fxn)
#     df_test[f'popular_{col}'] = df_test[f'{col}'].apply(lambda_fxn)
# map_popularity_author('author')

In [8]:
# Mean 'Popularity' per author, highest first, as a two-column frame
# (author, avg_popularity).
# Only the target column is aggregated: calling .mean() on the whole grouped
# frame also tries to average the text columns (title, content, ...), which
# raises TypeError on pandas >= 2.0.
# NOTE(review): the averages are computed from every training label and are
# later mapped back onto the same training rows, so each row's own target
# contributes to its feature. Consider an out-of-fold encoding.
col = 'author'
df = (
    df_train.groupby(col)['Popularity']
    .mean()
    .reset_index()
    .sort_values(by='Popularity', ascending=False)
)
df.columns = [col, 'avg_popularity']

In [9]:
# Lookup table: author name -> average popularity, built from the
# per-author summary frame in one pass.
author_avg_score = dict(zip(df['author'], df['avg_popularity']))

In [10]:
def _author_score(name):
    """Return the stored average popularity for `name`, or 0.0 when the author is not in the table."""
    return author_avg_score.get(name, 0.0)


# Same lookup for both splits; authors missing from the table get 0.0.
df_train['author_popularity'] = df_train['author'].apply(_author_score)
df_test['author_popularity'] = df_test['author'].apply(_author_score)

In [11]:
import re
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# Fetch the NLTK stop-word corpus (no-op when already present) and load the
# English list as the base stop-word list.
nltk.download('stopwords')
stop = stopwords.words('english')

# Extra stop words appended to the NLTK list further down in this cell:
# apostrophe contractions first, then a run of short fragments.
# NOTE(review): many of the later entries ("abov", "becaus", "everi", ...)
# look like stemmed word forms, while tokenize() compares the unstemmed token
# against the list; and prep() strips apostrophes before tokenizing. Confirm
# these entries can actually match.
extra_stopwords = ["ain't", "amn't", "aren't", "can't", "could've", "couldn't",
                    "daresn't", "didn't", "doesn't", "don't", "gonna", "gotta", 
                    "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", "how'd",
                    "how'll", "how's", "I'd", "I'll", "I'm", "I've", "isn't", "it'd",
                    "it'll", "it's", "let's", "mayn't", "may've", "mightn't", 
                    "might've", "mustn't", "must've", "needn't", "o'clock", "ol'",
                    "oughtn't", "shan't", "she'd", "she'll", "she's", "should've",
                    "shouldn't", "somebody's", "someone's", "something's", "that'll",
                    "that're", "that's", "that'd", "there'd", "there're", "there's", 
                    "these're", "they'd", "they'll", "they're", "they've", "this's",
                    "those're", "tis", "twas", "twasn't", "wasn't", "we'd", "we'd've",
                    "we'll", "we're", "we've", "weren't", "what'd", "what'll", 
                    "what're", "what's", "what've", "when's", "where'd", "where're",
                    "where's", "where've", "which's", "who'd", "who'd've", "who'll",
                    "who're", "who's", "who've", "why'd", "why're", "why's", "won't",
                    "would've", "wouldn't", "y'all", "you'd", "you'll", "you're", 
                    "you've", "'s", "'d", "'m", "abov", "afterward", "ai", "alon", "alreadi", "alway", "ani", 
                     "anoth", "anyon", "anyth", "anywher", "becam", "becaus", "becom", "befor", 
                     "besid", "ca", "cri", "dare", "describ", "did", "doe", "dure", "els", 
                     "elsewher", "empti", "everi", "everyon", "everyth", "everywher", "fifti", 
                     "forti", "gon", "got", "henc", "hereaft", "herebi", "howev", "hundr", "inde", 
                     "let", "ll", "mani", "meanwhil", "moreov", "n't", "na", "need", "nobodi", "noon", 
                     "noth", "nowher", "ol", "onc", "onli", "otherwis", "ought", "ourselv", "perhap", 
                     "pleas", "sever", "sha", "sinc", "sincer", "sixti", "somebodi", "someon", "someth", 
                     "sometim", "somewher", "ta", "themselv", "thenc", "thereaft", "therebi", "therefor", 
                     "togeth", "twelv", "twenti", "ve", "veri", "whatev", "whenc", "whenev", 
                    "wherea", "whereaft", "wherebi", "wherev", "whi", "wo", "anywh", "el", "elsewh", "everywh", 
                    "ind", "otherwi", "plea", "somewh", "yourselv"]

# Final stop-word list: NLTK English stop words followed by the extra entries.
stop = stop + extra_stopwords
# Shared text-processing helpers; each is instantiated exactly once.
porter = PorterStemmer()
wnl = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
def prep(text):
    """Normalise a raw string for vectorisation.

    Removes every character in ``string.punctuation``, lower-cases the result
    and trims leading/trailing whitespace. HTML-tag stripping and removal of
    non-letter characters are not performed here.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    return without_punct.lower().strip()

# Built once from the stop-word list so each token costs a constant-time
# lookup instead of a scan of the whole list.
_stop_lookup = frozenset(stop)


def tokenize(text):
    """Split `text` into stemmed, purely alphabetic, non-stop-word tokens.

    Args:
        text: string to tokenize (typically already cleaned by ``prep``).

    Returns:
        List of Porter stems, one per kept token, in their original order.
    """
    # The stop-word test is applied to the raw token, before stemming.
    # NOTE(review): stop-list entries that are themselves stems therefore only
    # match tokens that are literally spelled that way; confirm this is intended.
    return [
        porter.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in _stop_lookup and word.isalpha()
    ]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nlplab/harry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# from tqdm import tqdm
# tqdm.pandas()
# df_train['keywords'] = df_train['content'].progress_apply(prep) # this step takes about five minutes
# df_train['keywords'] = df_train['keywords'].progress_apply(tokenize)
# df_train['keywords'] = df_train['keywords'].progress_apply(lambda x: ' '.join(x))
# df_train['keywords'] = df_train['keywords'].progress_apply(lambda x: keywords.keywords(x).replace('\n', ' '))

In [13]:
# df_train.to_csv('./train_feature.csv',index=False,header=True)

In [14]:
# tqdm.pandas()
# df_test['keywords'] = df_test['content'].progress_apply(prep) # this step takes about five minutes
# df_test['keywords'] = df_test['keywords'].progress_apply(tokenize)
# df_test['keywords'] = df_test['keywords'].progress_apply(lambda x: ' '.join(x))
# df_test['keywords'] = df_test['keywords'].progress_apply(lambda x: keywords.keywords(x).replace('\n', ' '))

In [15]:
# df_test.to_csv('./test_feature.csv',index=False,header=True)

In [16]:
# Display the column labels of the training frame after the feature
# engineering above ('is_weekend' and 'author_popularity' were added here).
df_train.columns

Index(['Id', 'Popularity', 'topic', 'channel', 'weekday', 'pub_date', 'author',
       'img count', 'title', 'content', 'media count', 'n_tokens_title',
       'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words',
       'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs',
       'global_sentiment_polarity', 'global_subjectivity',
       'title_subjectivity', 'title_sentiment_polarity',
       'abs_title_subjectivity', 'abs_title_sentiment_polarity',
       'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity',
       'min_positive_polarity', 'max_positive_polarity',
       'avg_negative_polarity', 'min_negative_polarity',
       'max_negative_polarity', 'day_of_month', 'month', 'hour', 'is_weekend',
       'keywords', 'author_popularity'],
      dtype='object')

In [17]:
def _unicode_values(frame, column):
    """Return `column` of `frame` as a plain Python list after casting the values with astype('U')."""
    return frame[column].values.astype('U').tolist()


# Train values followed by test values, for each of the three text columns.
all_topic, all_channel, all_titles = (
    _unicode_values(df_train, column) + _unicode_values(df_test, column)
    for column in ('topic', 'channel', 'title')
)


In [18]:
# Article bodies as plain lists: per split, then train followed by test.
train_contents = [*df_train['content'].values]
test_contents = [*df_test['content'].values]
all_contents = [*train_contents, *test_contents]

In [19]:
# tfidf_vectorize = TfidfVectorizer(preprocessor=prep,  
#                                   tokenizer=tokenize,
#                                   max_features = 5000,
#                                   dtype = np.float32)

# train_keywords_tfidf = tfidf_vectorize.fit_transform(df_train['keywords'].values.astype('U').tolist())
# test_keywords_tfidf = tfidf_vectorize.transform(df_test['keywords'].values.astype('U').tolist())
# print(train_keywords_tfidf.shape)
# print(test_keywords_tfidf.shape)

In [20]:
# tfidf_vectorize = TfidfVectorizer(preprocessor=prep,  
#                                   tokenizer=tokenize,
#                                   max_features = 1024,
#                                   dtype = np.float32)

# train_topic_tfidf = tfidf_vectorize.fit_transform(df_train['topic'].values.astype('U').tolist())
# test_topic_tfidf = tfidf_vectorize.transform(df_test['topic'].values.astype('U').tolist())
# print(train_topic_tfidf.shape)
# print(test_topic_tfidf.shape)

In [21]:
# vocab  = tfidf_vectorize.get_feature_names()
# print(vocab[:100])

In [22]:
# tfidf_vectorize = TfidfVectorizer(preprocessor=prep,  
#                                   tokenizer=tokenize,
#                                   ngram_range=(1,1), 
#                                   sublinear_tf = True,
# #                                   max_features = 2048,
#                                   dtype = np.float32)
# all_content_tfidf = tfidf_vectorize.fit_transform(all_contents)

In [23]:
# train_content_tfidf = tfidf_vectorize.transform(train_contents)
# test_content_tfidf = tfidf_vectorize.transform(test_contents)
# print(train_content_tfidf.shape)
# print(test_content_tfidf.shape)

In [24]:
# from sklearn.decomposition import TruncatedSVD, PCA
# svd = TruncatedSVD(n_components=100)
# svd.fit(all_content_tfidf)

# train_content_svd = svd.transform(train_content_tfidf)
# test_content_svd = svd.transform(test_content_tfidf)

# print(train_content_svd.shape)
# print(test_content_svd.shape)

In [25]:
# train_topic_tfidf = tfidf_vectorize.transform(df_train['topic'].values.astype('U').tolist())
# test_topic_tfidf = tfidf_vectorize.transform(df_test['topic'].values.astype('U').tolist())

# print(train_topic_tfidf.shape)
# print(test_topic_tfidf.shape)

In [26]:

# train_title_tfidf = tfidf_vectorize.transform(df_train['title'].values.astype('U').tolist())
# test_title_tfidf = tfidf_vectorize.transform(df_test['title'].values.astype('U').tolist())

# print(train_title_tfidf.shape)
# print(test_title_tfidf.shape)

In [27]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash the 'topic' text of each row into a 1024-column float32 matrix, using
# the notebook's own prep()/tokenize() for cleaning and token splitting.
hashvec = HashingVectorizer(
    n_features=1024,
    preprocessor=prep,
    tokenizer=tokenize,
    dtype=np.float32,
)

train_topic_text = df_train['topic'].values.astype('U').tolist()
test_topic_text = df_test['topic'].values.astype('U').tolist()

train_topic_hash = hashvec.fit_transform(train_topic_text)
test_topic_hash = hashvec.transform(test_topic_text)
for hashed in (train_topic_hash, test_topic_hash):
    print(hashed.shape)



(27643, 1024)
(11847, 1024)


In [28]:
# from sklearn.feature_extraction.text import HashingVectorizer

# hashvec = HashingVectorizer(n_features=1024,
#                            dtype = np.float32)

# train_keywords_hash = hashvec.fit_transform(df_train['keywords'].values.astype('U').tolist())
# test_keywords_hash = hashvec.transform(df_test['keywords'].values.astype('U').tolist())
# print(train_keywords_hash.shape)
# print(test_keywords_hash.shape)

In [29]:
# train_content_hash = hashvec.fit_transform(train_contents)
# test_content_hash = hashvec.transform(test_contents)
# print(train_content_hash.shape)
# print(test_content_hash.shape)

In [30]:
# train_title_hash = hashvec.transform(df_train['title'].values.astype('U').tolist())
# test_title_hash = hashvec.transform(df_test['title'].values.astype('U').tolist())
# print(train_title_hash.shape)
# print(test_title_hash.shape)

In [31]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [32]:
# One-hot encode 'channel'. The encoder is fitted on the training rows only;
# handle_unknown='ignore' lets transform() accept values it has not seen.
OHE_channel = OneHotEncoder(handle_unknown='ignore')
channel_train_col = df_train['channel'].values.reshape(-1, 1)
channel_test_col = df_test['channel'].values.reshape(-1, 1)

train_ohe_channel = OHE_channel.fit(channel_train_col).transform(channel_train_col).toarray()
test_ohe_channel = OHE_channel.transform(channel_test_col).toarray()
for encoded in (train_ohe_channel, test_ohe_channel):
    print(encoded.shape)


(27643, 33)
(11847, 33)


In [33]:
# One-hot encode 'weekday'. The encoder is fitted on the training rows only;
# the combined train+test list is built but not used for fitting.
OHE_weekday = OneHotEncoder(handle_unknown='ignore')
all_week_day = [*df_train['weekday'].values, *df_test['weekday'].values]

weekday_train_col = df_train['weekday'].values.reshape(-1, 1)
weekday_test_col = df_test['weekday'].values.reshape(-1, 1)

train_ohe_weekday = OHE_weekday.fit(weekday_train_col).transform(weekday_train_col).toarray()
test_ohe_weekday = OHE_weekday.transform(weekday_test_col).toarray()
for encoded in (train_ohe_weekday, test_ohe_weekday):
    print(encoded.shape)

(27643, 7)
(11847, 7)


In [34]:
# One-hot encode 'author'. The encoder is fitted on the training rows only;
# the combined train+test list is built but not used for fitting.
OHE_author = OneHotEncoder(handle_unknown='ignore')
all_author = [*df_train['author'].values, *df_test['author'].values]

author_train_col = df_train['author'].values.reshape(-1, 1)
author_test_col = df_test['author'].values.reshape(-1, 1)

train_ohe_author = OHE_author.fit(author_train_col).transform(author_train_col).toarray()
test_ohe_author = OHE_author.transform(author_test_col).toarray()

for encoded in (train_ohe_author, test_ohe_author):
    print(encoded.shape)

(27643, 428)
(11847, 428)


In [35]:
# # Normalize the numerical features
# from sklearn.preprocessing import MinMaxScaler , StandardScaler
# scaler = MinMaxScaler()
# # scaler = StandardScaler()
# need_to_normalize = [
#     'img count', 
#     'media count',
#     'n_tokens_content',
#     'n_tokens_title', 
#     'num_hrefs', 
#     'num_self_hrefs',
# #     'day_of_month',
# #     'month',
# #     'hour',
# #     'rate_positive_words',
# #     'rate_negative_words',
# #     'avg_positive_polarity',
# #     'avg_negative_polarity',
# #     'max_positive_polarity',
# #     'min_positive_polarity',
# #     'max_negative_polarity',
# #     'min_negative_polarity',
# #     'abs_title_subjectivity',
# #     'abs_title_sentiment_polarity',
# #     'popular_month',
# #     'popular_hour',
# #     'popular_channel',
# #     'popular_author',
#     ]

# df_train_normalize = df_train.copy(deep=True)
# df_test_normalize = df_test.copy(deep=True)

# # scaler.fit(df_all[numerical])
# df_train_normalize[need_to_normalize] = scaler.fit_transform(df_train[need_to_normalize])
# df_test_normalize[need_to_normalize] = scaler.transform(df_test[need_to_normalize])

In [36]:
# Re-display the training frame's column labels before selecting model features.
df_train.columns

Index(['Id', 'Popularity', 'topic', 'channel', 'weekday', 'pub_date', 'author',
       'img count', 'title', 'content', 'media count', 'n_tokens_title',
       'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words',
       'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs',
       'global_sentiment_polarity', 'global_subjectivity',
       'title_subjectivity', 'title_sentiment_polarity',
       'abs_title_subjectivity', 'abs_title_sentiment_polarity',
       'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity',
       'min_positive_polarity', 'max_positive_polarity',
       'avg_negative_polarity', 'min_negative_polarity',
       'max_negative_polarity', 'day_of_month', 'month', 'hour', 'is_weekend',
       'keywords', 'author_popularity'],
      dtype='object')

In [53]:
# Target plus the numeric / engineered columns taken directly from df_train.
train_base_cols = [
    'Popularity',
    'img count',
    'media count',
    'n_tokens_title',
    'n_tokens_content',
    'n_unique_tokens',
    'n_non_stop_words',
    'n_non_stop_unique_tokens',
    'num_hrefs',
    'num_self_hrefs',
    'global_sentiment_polarity',
    'global_subjectivity',
    'title_subjectivity',
    'title_sentiment_polarity',
    'abs_title_subjectivity',
    'abs_title_sentiment_polarity',
    'rate_positive_words',
    'rate_negative_words',
    'avg_positive_polarity',
    'min_positive_polarity',
    'max_positive_polarity',
    'avg_negative_polarity',
    'min_negative_polarity',
    'max_negative_polarity',
    'day_of_month',
    'month',
    'hour',
    'is_weekend',
    'author_popularity',
]

# One-hot blocks as DataFrames (channel and weekday only; the topic-hash and
# author one-hot matrices are not part of this feature set).
# * get_feature_names() was removed in scikit-learn 1.2, so prefer
#   get_feature_names_out() and fall back to the old name on older releases.
# * index=df_train.index keeps the column-wise concat row-aligned even if
#   df_train does not carry a default 0..n-1 index.
train_ohe_frames = [
    pd.DataFrame(
        matrix,
        columns=(getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names)(),
        index=df_train.index,
    )
    for encoder, matrix in (
        (OHE_channel, train_ohe_channel),
        (OHE_weekday, train_ohe_weekday),
    )
]

df_train_concat = pd.concat([df_train[train_base_cols], *train_ohe_frames], axis=1)

print(df_train.shape)
print(df_train_concat.shape)

(27643, 38)
(27643, 69)


In [54]:
# Numeric / engineered columns taken directly from df_test (no target column).
test_base_cols = [
    'img count',
    'media count',
    'n_tokens_title',
    'n_tokens_content',
    'n_unique_tokens',
    'n_non_stop_words',
    'n_non_stop_unique_tokens',
    'num_hrefs',
    'num_self_hrefs',
    'global_sentiment_polarity',
    'global_subjectivity',
    'title_subjectivity',
    'title_sentiment_polarity',
    'abs_title_subjectivity',
    'abs_title_sentiment_polarity',
    'rate_positive_words',
    'rate_negative_words',
    'avg_positive_polarity',
    'min_positive_polarity',
    'max_positive_polarity',
    'avg_negative_polarity',
    'min_negative_polarity',
    'max_negative_polarity',
    'day_of_month',
    'month',
    'hour',
    'is_weekend',
    'author_popularity',
]

# One-hot blocks as DataFrames (channel and weekday only; the topic-hash and
# author one-hot matrices are not part of this feature set).
# * get_feature_names() was removed in scikit-learn 1.2, so prefer
#   get_feature_names_out() and fall back to the old name on older releases.
# * index=df_test.index keeps the column-wise concat row-aligned even if
#   df_test does not carry a default 0..n-1 index.
test_ohe_frames = [
    pd.DataFrame(
        matrix,
        columns=(getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names)(),
        index=df_test.index,
    )
    for encoder, matrix in (
        (OHE_channel, test_ohe_channel),
        (OHE_weekday, test_ohe_weekday),
    )
]

df_test_concat = pd.concat([df_test[test_base_cols], *test_ohe_frames], axis=1)

print(df_test.shape)
print(df_test_concat.shape)

(11847, 37)
(11847, 68)


In [55]:
# Preview the first rows of the assembled training table (target, numeric
# features, then the one-hot columns).
df_train_concat.head()

Unnamed: 0,Popularity,img count,media count,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,...,x0_viral,x0_watercooler,x0_world,x0_ Fri,x0_ Mon,x0_ Sat,x0_ Sun,x0_ Thu,x0_ Tue,x0_ Wed
0,0,1,0,8,575,0.507826,0.615652,0.424348,22,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,2,0,12,305,0.478689,0.619672,0.357377,18,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,2,25,12,1119,0.488829,0.638963,0.421805,11,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,1,21,5,274,0.711679,0.733577,0.583942,13,6,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,52,1,10,1370,0.49708,0.756934,0.464964,16,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [56]:
# feats = abs(df_train_concat._get_numeric_data().corr()['Popularity']).sort_values(ascending=False)

In [57]:
# keep_cols = []
# th = 0.004
# for index, value in feats.items():
# #     print(f"Index : {index} \t\t, Value : {value}")
#     if value > th and index != 'Popularity':
#         keep_cols.append(index)
# print(len(keep_cols))

In [58]:
# keep_cols = list(feats.keys())
# keep_cols = keep_cols[:1001]
# keep_cols.pop(0)

In [59]:
# print(keep_cols[:100])

In [60]:
# Feature matrices: every assembled column except the target.
x_train = df_train_concat.drop(['Popularity'], axis=1).to_numpy()
x_test = df_test_concat.to_numpy()

# Binary target with -1 recoded as 0. np.where builds a new array, so the
# 'Popularity' column of df_train is left untouched. Assigning into the array
# returned by to_numpy() can write through to the DataFrame, which changes the
# per-author averages when the notebook is re-run, and it fails outright when
# pandas hands back a read-only array.
popularity = df_train['Popularity'].to_numpy()
y_train = np.where(popularity == -1, 0, popularity)

print(x_train.shape, y_train.shape)
print(x_test.shape)


(27643, 68) (27643,)
(11847, 68)


In [61]:
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)
# kfold = StratifiedKFold(n_splits = 5, random_state = 2021 ,shuffle=True)

In [62]:
# Wrap the arrays for the native xgboost API: labelled training data and
# unlabelled test data. No separate validation matrix is built.
d_train = xgb.DMatrix(data=x_train, label=y_train)
d_test = xgb.DMatrix(data=x_test)

In [63]:
# Estimator configured with the hand-picked hyper-parameters; 'lambda' and
# 'n_estimators' are deliberately left at their defaults here.
xgb_model = xgb.XGBClassifier(
    eta=0.05,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=4,
    objective='binary:logistic',
    eval_metric='auc',
    alpha=0.005,
)
# Parameter dict as reported by the estimator, reused for xgb.cv.
xgb_params = xgb_model.get_xgb_params()

In [64]:
# 5-fold stratified cross-validation on AUC: up to 1000 boosting rounds,
# stopping once the score has not improved for 50 rounds, logging every 10.
cvresult = xgb.cv(
    params=xgb_params,
    dtrain=d_train,
    num_boost_round=1000,
    nfold=5,
    stratified=True,
    metrics=['auc'],
    early_stopping_rounds=50,
    verbose_eval=10,
)

[0]	train-auc:0.56057+0.00340	test-auc:0.54795+0.00360
[10]	train-auc:0.61147+0.00341	test-auc:0.59486+0.00629
[20]	train-auc:0.61805+0.00436	test-auc:0.59756+0.00873
[30]	train-auc:0.62314+0.00212	test-auc:0.59886+0.00832
[40]	train-auc:0.62713+0.00204	test-auc:0.59974+0.00880
[50]	train-auc:0.63181+0.00160	test-auc:0.60025+0.00924
[60]	train-auc:0.63734+0.00168	test-auc:0.60058+0.00954
[70]	train-auc:0.64261+0.00189	test-auc:0.60055+0.00923
[80]	train-auc:0.64763+0.00171	test-auc:0.60063+0.00880
[90]	train-auc:0.65193+0.00161	test-auc:0.60050+0.00896
[100]	train-auc:0.65656+0.00188	test-auc:0.60039+0.00879
[110]	train-auc:0.66111+0.00187	test-auc:0.59982+0.00864
[120]	train-auc:0.66550+0.00195	test-auc:0.59986+0.00850
[126]	train-auc:0.66784+0.00208	test-auc:0.59972+0.00859


In [49]:
# Number of rows in the cross-validation result; the following cell uses it
# as n_estimators for the final model.
# NOTE(review): this cell's execution count (49) precedes the CV cell's (64),
# so the displayed value may come from an earlier CV run; re-run in order.
cvresult.shape[0]

50

In [50]:
# Use as many trees as there are rows in the cross-validation result.
xgb_model.set_params(n_estimators=len(cvresult))

XGBClassifier(alpha=0.005, base_score=None, booster=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, enable_categorical=False, eta=0.05,
              eval_metric='auc', gamma=None, gpu_id=None, importance_type=None,
              interaction_constraints=None, learning_rate=None,
              max_delta_step=None, max_depth=4, min_child_weight=4, missing=nan,
              monotone_constraints=None, n_estimators=50, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
              subsample=0.7, tree_method=None, validate_parameters=None, ...)

In [51]:
# Train the final model on the full training set. The evaluation metric is
# already configured on the estimator (eval_metric='auc'); passing it to
# fit() is deprecated in xgboost and rejected by newer releases, and without
# an eval_set it has no effect on the fitted trees.
xgb_model.fit(x_train, y_train, verbose=True)



XGBClassifier(alpha=0.005, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.05, eval_metric='auc', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.0500000007, max_delta_step=0, max_depth=4,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=36, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0.00499999989, reg_lambda=1,
              scale_pos_weight=1, subsample=0.7, tree_method='exact',
              validate_parameters=1, ...)

In [52]:
import os

file_name = './outputs/xgb_topic_hash_avg_author_score.csv'
# Second predict_proba column = predicted probability of class 1.
y_pred = xgb_model.predict_proba(x_test)[:, 1]
# Fill the sample submission's 'Popularity' column with the predictions.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# the test feature table — confirm.
df_submission = pd.read_csv('./sample_submission.csv')
df_submission['Popularity'] = y_pred
# Create the output folder on a fresh checkout so to_csv does not fail.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
df_submission.to_csv(file_name, index=False)

In [None]:
# x_train = np.concatenate([
# #                             train_topic_tfidf.toarray(), 
# #                             train_title_tfidf.toarray(), 
# #                           train_content_tfidf.toarray(),

                            
# #                         train_content_svd,
#                             train_topic_hash.toarray(), 
# #                             train_title_hash.toarray(),
# #                             train_content_hash.toarray(),

#                           train_ohe_channel,
#                           train_ohe_weekday,
#                           train_ohe_author, 
#                           np.expand_dims(df_train['is_weekend'].values, axis=-1),
#                           np.expand_dims(df_train['img count'].values, axis=-1),
#                           np.expand_dims(df_train['media count'].values, axis=-1),
#                             np.expand_dims(df_train['n_tokens_title'].values, axis=-1),
#                             np.expand_dims(df_train['n_tokens_content'].values, axis=-1),
#                             np.expand_dims(df_train['n_unique_tokens'].values, axis=-1),
#                             np.expand_dims(df_train['n_non_stop_words'].values, axis=-1),
#                             np.expand_dims(df_train['n_non_stop_unique_tokens'].values, axis=-1),
#                             np.expand_dims(df_train['num_hrefs'].values, axis=-1),
#                             np.expand_dims(df_train['num_self_hrefs'].values, axis=-1),
#                             np.expand_dims(df_train['day_of_month'].values, axis=-1),
#                             np.expand_dims(df_train['month'].values, axis=-1),
#                             np.expand_dims(df_train['hour'].values, axis=-1),
# #                             sentiment
#                             np.expand_dims(df_train['global_sentiment_polarity'].values, axis=-1),
#                             np.expand_dims(df_train['global_subjectivity'].values, axis=-1),
#                             np.expand_dims(df_train['title_subjectivity'].values, axis=-1),
#                             np.expand_dims(df_train['title_sentiment_polarity'].values, axis=-1),
#                             np.expand_dims(df_train['abs_title_subjectivity'].values, axis=-1),
#                             np.expand_dims(df_train['abs_title_sentiment_polarity'].values, axis=-1),
# #                                 word sentiment
#                             np.expand_dims(df_train['rate_positive_words'].values, axis=-1),
#                             np.expand_dims(df_train['rate_negative_words'].values, axis=-1),
#                             np.expand_dims(df_train['avg_positive_polarity'].values, axis=-1),
#                             np.expand_dims(df_train['min_positive_polarity'].values, axis=-1),
#                             np.expand_dims(df_train['max_positive_polarity'].values, axis=-1),
#                             np.expand_dims(df_train['avg_negative_polarity'].values, axis=-1),
#                             np.expand_dims(df_train['min_negative_polarity'].values, axis=-1),
#                             np.expand_dims(df_train['max_negative_polarity'].values, axis=-1),


#                             ], axis=1)

# x_test = np.concatenate([
# #                             test_topic_tfidf.toarray(), 
# #                             test_title_tfidf.toarray(), 
# #                           test_content_tfidf.toarray(),
    
# #                         test_content_svd,
#                             test_topic_hash.toarray(), 
# #                             test_title_hash.toarray(),
# #                             test_content_hash.toarray(),

#                           test_ohe_channel,
#                           test_ohe_weekday,
#                           test_ohe_author, 
#                           np.expand_dims(df_test['is_weekend'].values, axis=-1),
#                           np.expand_dims(df_test['img count'].values, axis=-1),
#                           np.expand_dims(df_test['media count'].values, axis=-1),
#                             np.expand_dims(df_test['n_tokens_title'].values, axis=-1),
#                             np.expand_dims(df_test['n_tokens_content'].values, axis=-1),
#                             np.expand_dims(df_test['n_unique_tokens'].values, axis=-1),
#                             np.expand_dims(df_test['n_non_stop_words'].values, axis=-1),
#                             np.expand_dims(df_test['n_non_stop_unique_tokens'].values, axis=-1),
#                             np.expand_dims(df_test['num_hrefs'].values, axis=-1),
#                             np.expand_dims(df_test['num_self_hrefs'].values, axis=-1),
#                             np.expand_dims(df_test['day_of_month'].values, axis=-1),
#                             np.expand_dims(df_test['month'].values, axis=-1),
#                             np.expand_dims(df_test['hour'].values, axis=-1),
# #                             sentiment
#                             np.expand_dims(df_test['global_sentiment_polarity'].values, axis=-1),
#                             np.expand_dims(df_test['global_subjectivity'].values, axis=-1),
#                             np.expand_dims(df_test['title_subjectivity'].values, axis=-1),
#                             np.expand_dims(df_test['title_sentiment_polarity'].values, axis=-1),
#                             np.expand_dims(df_test['abs_title_subjectivity'].values, axis=-1),
#                             np.expand_dims(df_test['abs_title_sentiment_polarity'].values, axis=-1),
# #                                 word sentiment
#                             np.expand_dims(df_test['rate_positive_words'].values, axis=-1),
#                             np.expand_dims(df_test['rate_negative_words'].values, axis=-1),
#                             np.expand_dims(df_test['avg_positive_polarity'].values, axis=-1),
#                             np.expand_dims(df_test['min_positive_polarity'].values, axis=-1),
#                             np.expand_dims(df_test['max_positive_polarity'].values, axis=-1),
#                             np.expand_dims(df_test['avg_negative_polarity'].values, axis=-1),
#                             np.expand_dims(df_test['min_negative_polarity'].values, axis=-1),
#                             np.expand_dims(df_test['max_negative_polarity'].values, axis=-1),
#                             ], axis=1)

# y_train = df_train['Popularity'].to_numpy()
# y_train[y_train==-1] = 0

# print(x_train.shape, y_train.shape)
# print(x_test.shape)

In [None]:
# param_dist = {'n_estimators': stats.randint(150, 500),
#               'learning_rate': stats.uniform(0.01, 0.07),
#               'subsample': stats.uniform(0.3, 0.7),
#               'max_depth': [3, 4, 5, 6, 7, 8, 9],
#               'colsample_bytree': stats.uniform(0.5, 0.45),
#               'min_child_weight': [1, 2, 3]
#              }

# fit_params = {
#     'eval_metric': "auc",
#     'early_stopping_rounds': 50,
#     'verbose' :  True,
#     'objective' :'binary:logistic'
# }

# xgb_model = xgb.XGBClassifier(**fit_params)
# clf = RandomizedSearchCV(xgb_model, 
#                          param_distributions = param_dist, 
#                          n_iter = 20, 
#                          scoring='roc_auc', 
#                          verbose = 3, 
#                          cv=kfold,
#                          n_jobs = 2, refit=True)
# clf.fit(x_train, y_train)

In [None]:
# Wrap each feature matrix in an xgboost DMatrix for the native training API.
# The train and validation matrices carry their labels; the test matrix is
# built from features only.
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_val, label=y_val)
d_test = xgb.DMatrix(x_test)

In [None]:
# Settings forwarded to the XGBClassifier constructor below.
# NOTE(review): despite the name, these are constructor arguments, not .fit()
# arguments. XGBClassifier documents `verbosity` rather than `verbose`, and
# whether `eval_metric` / `early_stopping_rounds` are accepted here depends on
# the installed xgboost version -- confirm against the environment.
fit_params = dict(
    eval_metric="auc",
    early_stopping_rounds=30,
    verbose=True,
    objective='binary:logistic',
)
xgb_model = xgb.XGBClassifier(**fit_params)

In [None]:
# Evaluation sets reported every 10 rounds while boosting. Per the xgboost
# docs, early stopping monitors the LAST entry in `evals`, so the validation
# split must stay last in this list.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# NOTE(review): `xgb_params` is not defined in this cell; it has to come from
# an earlier cell -- verify it survives Restart & Run All.
# Every argument is passed by keyword: recent xgboost releases make `evals`
# keyword-only, so the old positional form warns or fails there.
model = xgb.train(
    params=xgb_params,
    dtrain=d_train,
    num_boost_round=2000,        # upper bound; early stopping usually ends sooner
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
import os

# Write the booster's test-set predictions into the submission template.
file_name = './outputs/xgb.csv'
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before doing any work (safe to re-run).
os.makedirs(os.path.dirname(file_name), exist_ok=True)

y_pred = model.predict(d_test)
# The sample submission supplies the expected layout; only the 'Popularity'
# column is overwritten with the model output.
df_submission = pd.read_csv('./sample_submission.csv')
df_submission['Popularity'] = y_pred
df_submission.to_csv(file_name, index=False)

In [None]:
# Random forest fitted on the training matrix: 200 trees split on entropy,
# seeded for repeatability, trained with two parallel jobs.
forest = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    n_jobs=2,
    random_state=1,
)
forest.fit(x_train, y_train)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Per-column importance scores taken from the fitted forest.
importances = forest.feature_importances_
# Column positions ordered from highest to lowest importance.
indices = np.argsort(importances)[::-1]

# One line per column of x_train: 1-based rank, the column position (shown as
# a one-element list, left-aligned in a 30-character field), and its score.
n_features = x_train.shape[1]
for rank in range(1, n_features + 1):
    col = indices[rank - 1]
    print("%2d) %-*s %f" % (rank, 30, [col], importances[col]))