### Importovanje biblioteka i podataka

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
from scipy import stats
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator, clone

from sklearn import model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

In [2]:
pd.set_option("display.max_columns", 30)

In [3]:
train_df = pd.read_csv('input/train.tsv', delimiter='\t')
train_df.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1,1,24,0,5424,170,8,0.152941,0.07913,0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.20349,0.088652,1,1,40,0,4973,187,9,0.181818,0.125448,1
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1,1,55,0,2240,258,11,0.166667,0.057613,1
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1,0,24,0,2737,120,5,0.041667,0.100858,1
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1,1,14,0,12032,162,10,0.098765,0.082569,0


In [4]:
train_df.drop(['urlid','is_news','framebased'],axis=1, inplace=True)
#URL kolona je izbačena nešto kasnije, pre enkodiranja.  

### Priprema podataka

#### Boilerplate kolona

In [5]:
train_df['boilerplate'] = train_df['boilerplate'].replace(to_replace=':null', value=':""', regex=True)

In [6]:
train_df.boilerplate=train_df.boilerplate.map(eval)

In [7]:
train_df.boilerplate[1].keys()

dict_keys(['title', 'body', 'url'])

In [9]:
train_df.rename({'url':'full_url'},axis=1, inplace=True)

In [10]:
train_df=pd.concat([train_df.drop('boilerplate', axis=1),train_df['boilerplate'].apply(pd.Series)], axis=1)

In [12]:
train_df.rename({'url':'url_words'},axis=1, inplace=True)
train_df.rename({'full_url':'url'},axis=1, inplace=True)

#### Nedostajuće vrednosti kao np.nan

Nedostajuće vrednosti se pojavljuju u okviru znaka pitanja.

In [13]:
train_df.replace(to_replace='?', value=np.nan, inplace=True)

In [14]:
train_df.head()

Unnamed: 0,url,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,frameTagRatio,hasDomainLink,html_ratio,image_ratio,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body,url_words,related
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0.090774,0,0.245831,0.003883,1,24,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,
1,http://www.popsci.com/technology/article/2012-...,recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,0.468649,0.0,0.098707,0,0.20349,0.088652,1,40,0,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,
2,http://www.menshealth.com/health/flu-fighting-...,health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0.072448,0,0.226402,0.120536,1,55,0,2240,258,11,0.166667,0.057613,1,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,
3,http://www.dumblittleman.com/2007/12/10-foolpr...,health,0.801248,1.543103,0.4,0.1,0.016667,0.0,0.480725,0.0,0.095861,0,0.265656,0.035343,0,24,0,2737,120,5,0.041667,0.100858,1,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,
4,http://bleacherreport.com/articles/1205138-the...,sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,0.446143,0.0,0.024908,0,0.228887,0.050473,1,14,0,12032,162,10,0.098765,0.082569,0,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,


#### Ostale kolone- korekcija tipova

In [15]:
train_df['alchemy_category_score'] = train_df['alchemy_category_score'].astype(dtype='float')
train_df['news_front_page'] = train_df['news_front_page'].astype(dtype='float')

#### Podela podataka na X i y

In [16]:
X=train_df.loc[:, train_df.columns!= 'label']
y=train_df['label']

In [17]:
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)

#### alchemy_category kolona
Ređe kategorije postaju jedna

In [18]:
train_df.alchemy_category.value_counts()

recreation            1229
arts_entertainment     941
business               880
health                 506
sports                 380
culture_politics       343
computer_internet      296
science_technology     289
gaming                  76
religion                72
law_crime               31
unknown                  6
weather                  4
Name: alchemy_category, dtype: int64

In [19]:
X.alchemy_category[X.groupby('alchemy_category')['alchemy_category'].transform('count').lt(100)]='RARE'

In [20]:
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)

In [21]:
X.reset_index(inplace = True, drop=True) #proveriti da li treba
y.reset_index(inplace = True, drop=True)

### Podela podataka na train i test

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

### Imputacija nedostajućih vrednosti

#### fit i transform- train

In [23]:
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.missing_map = {}

    def fit(self, X, y=None):

        num_atts = X.select_dtypes(exclude='object').columns.tolist()

        for att in num_atts:
            self.missing_map.update({att: X[att].median()})

        cat_atts = X.select_dtypes(include='object').columns.tolist()

        for att in cat_atts:
            self.missing_map.update({att: X[att].value_counts().index[0]})

        return (self)

    def transform(self, X, y=None):

        for att, impute_value in self.missing_map.items():
            X[att] = X[att].fillna(impute_value)
        return (X)

In [24]:
custom_imputer=CustomImputer().fit(X_train)
X_train=custom_imputer.transform(X_train)

#### transform- test

In [25]:
X_test=custom_imputer.transform(X_test)

In [26]:
X_test=custom_imputer.transform(X_test) #test df ne sadrži target kolonu.

In [27]:
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numerical_cols = list(X_train.select_dtypes(exclude='object').columns)

In [28]:
categorical_cols

['url', 'alchemy_category', 'title', 'body', 'url_words', 'related']

In [29]:
X_train.reset_index(inplace = True, drop=True) #proveriti da li treba
y_train.reset_index(inplace = True, drop=True)

X_test.reset_index(inplace = True, drop=True) #proveriti da li treba
y_test.reset_index(inplace = True, drop=True)

### Izbacivanje kolona  i odvajanje za kasniju obradu

In [30]:
titles_train=X_train.title
titles_test=X_test.title
bodies_train=X_train.body
bodies_test=X_test.body
X_train.drop(['url','body','title','url_words'],inplace=True,axis=1)
X_test.drop(['url','body','title','url_words'],inplace=True,axis=1)

### Enkodiranje kategoričkih varijabli

In [31]:
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numerical_cols = list(X_train.select_dtypes(exclude='object').columns)

#### Enkodiranje- train

In [32]:
enc = OneHotEncoder(dtype=np.int8).fit(X_train[categorical_cols])

X_new = pd.DataFrame(enc.transform(X_train[categorical_cols]).toarray())
X_new.columns = enc.get_feature_names(categorical_cols)

X_new.head()

Unnamed: 0,alchemy_category_RARE,alchemy_category_arts_entertainment,alchemy_category_business,alchemy_category_computer_internet,alchemy_category_culture_politics,alchemy_category_health,alchemy_category_recreation,alchemy_category_science_technology,alchemy_category_sports,related_
0,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,0,1,0,0,1
4,0,1,0,0,0,0,0,0,0,1


In [33]:
X_train=pd.concat([X_train[numerical_cols],X_new], axis=1)

In [34]:
X_train.isna().sum().sum()

0

#### Enkodiranje- test

In [35]:
X_new = pd.DataFrame(enc.transform(X_test[categorical_cols]).toarray())
X_new.columns = enc.get_feature_names(categorical_cols)

In [36]:
X_test=pd.concat([X_test[numerical_cols],X_new], axis=1)

In [37]:
X_train.reset_index(inplace = True, drop=True) #proveriti da li treba
y_train.reset_index(inplace = True, drop=True)
#just in case
X_test.reset_index(inplace = True, drop=True) #proveriti da li treba
y_test.reset_index(inplace = True, drop=True)

###  CountVectorizer- titles and bodies

In [38]:
#!pip install nltk

In [39]:
import nltk
#nltk.download('stopwords') #odkomentarisati pri prvom izvršavanju

In [40]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [41]:
stop_words_en=set(stopwords.words('english'))
stop_words_sp=set(stopwords.words('spanish'))
stop_words_it=set(stopwords.words('italian'))
stop_words_all=stop_words_en|stop_words_sp|stop_words_it

#### CountVectorizer- train set - titles

In [42]:
cv2 = CountVectorizer( ngram_range=(1,2),lowercase=True,stop_words=stop_words_all,dtype=np.int8, min_df=0.01)
cv2.fit(titles_train)
transformed_titles_train =cv2.transform(titles_train)
df_cv=pd.DataFrame(transformed_titles_train.todense(), columns=cv2.get_feature_names())

In [43]:
X_train=pd.concat([X_train,df_cv], axis=1)

#### CountVectorizer- test set - titles

In [44]:
transformed_titles_test =cv2.transform(titles_test)
df_cv=pd.DataFrame(transformed_titles_test.todense(), columns=cv2.get_feature_names())

In [45]:
X_test=pd.concat([X_test,df_cv], axis=1)

#### CountVectorizer- train set - bodies

In [46]:
cv2 = CountVectorizer(ngram_range=(1,2),stop_words=stop_words_all,dtype=np.int8, min_df=0.08)
cv2.fit(bodies_train)
transformed_bodies_train =cv2.transform(bodies_train)
df_cv=pd.DataFrame(transformed_bodies_train.todense(), columns=cv2.get_feature_names())

In [47]:
X_train=pd.concat([X_train,df_cv], axis=1)

#### CountVectorizer- test set - bodies

In [48]:
transformed_bodies_test =cv2.transform(bodies_test)
df_cv=pd.DataFrame(transformed_bodies_test.todense(), columns=cv2.get_feature_names())

In [49]:
X_test=pd.concat([X_test,df_cv], axis=1)

### Skaliranje podataka

In [50]:
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numerical_cols = list(X_train.select_dtypes(exclude='object').columns)

#### Skaliranje- train

In [51]:
scaler = MinMaxScaler()
scaler.fit(X_train[numerical_cols])
X_train[numerical_cols]=scaler.transform(X_train[numerical_cols])

#### Skaliranje- test

In [52]:
X_test[numerical_cols]=scaler.transform(X_test[numerical_cols])

### Izbacivanje outliera

In [53]:
z = np.abs(stats.zscore(X_train[numerical_cols]))
X_train['is_outlier'] = (np.any(z > 3, axis = 1)) 

In [54]:
X_train['label'] = y_train
del X_train['is_outlier']

In [55]:
y_train=X_train['label']
del X_train['label']

In [56]:
X_train.reset_index(inplace = True, drop=True)
y_train.reset_index(inplace = True, drop=True)

In [57]:
X_train.shape,y_train.shape

((5546, 330), (5546,))

### Kreiranje modela

Metrika je area under the ROC curve. 

In [58]:
model = LogisticRegression()

In [59]:
model.fit(X_train,y_train)
roc_auc_score(y_train, model.predict(X_train))
#0.7941116122111008 odkomentarisano
# 0.7940873962358479 zakomentarisano

0.7939129370592953

In [60]:
roc_auc_score(y_test, model.predict(X_test))
#0.7868106988662291 odkomentarisano
# 0.786266629225315 zakomentarisano

0.7868106988662291

In [61]:
#body je bio više poboljšao model, ali oni zajedno rezultiraju sa dosta dobrim rezultatima, boljim nego oba pojedinačno.
# iako alchemy_category ne pravi neku preveliku razliku, odlučeno je se nova podela usvoji.
#krenuli smo od base modela koji je na testu ima auc 0.649, a uz malo truda doveli smo ga do 0.786