### Importovanje biblioteka i podataka

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
from scipy import stats
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator, clone

from sklearn import model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

In [2]:
train_df = pd.read_csv('input/train.tsv', delimiter='\t')
train_df.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,1,24,0,5424,170,8,0.152941,0.07913,0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,1,1,40,0,4973,187,9,0.181818,0.125448,1
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,1,1,55,0,2240,258,11,0.166667,0.057613,1
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,...,1,0,24,0,2737,120,5,0.041667,0.100858,1
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,...,1,1,14,0,12032,162,10,0.098765,0.082569,0


In [3]:
train_df.drop(['urlid','is_news','framebased'],axis=1, inplace=True)
#URL kolona je izbačena nešto kasnije, pre enkodiranja.  

### Priprema podataka

#### Boilerplate kolona

In [4]:
train_df['boilerplate'] = train_df['boilerplate'].replace(to_replace=':null', value=':""', regex=True)

In [5]:
train_df.boilerplate=train_df.boilerplate.map(eval)
train_df.boilerplate[1].keys()

dict_keys(['title', 'body', 'url'])

In [6]:
train_df.shape

(7395, 24)

In [7]:
train_df.rename({'url':'full_url'},axis=1, inplace=True)

In [8]:
train_df=pd.concat([train_df.drop('boilerplate', axis=1),train_df['boilerplate'].apply(pd.Series)], axis=1)

In [9]:
train_df.shape

(7395, 27)

In [10]:
train_df.rename({'url':'url_words'},axis=1, inplace=True)
train_df.rename({'full_url':'url'},axis=1, inplace=True)

Nedostajuće vrednosti se pojavljuju u okviru znaka pitanja.

#### Nedostajuće vrednosti kao np.nan

In [11]:
train_df.replace(to_replace='?', value=np.nan, inplace=True)

In [12]:
train_df.head()

Unnamed: 0,url,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,...,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body,url_words,related
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,...,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,
1,http://www.popsci.com/technology/article/2012-...,recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,0.468649,0.0,...,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,
2,http://www.menshealth.com/health/flu-fighting-...,health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,...,2240,258,11,0.166667,0.057613,1,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,
3,http://www.dumblittleman.com/2007/12/10-foolpr...,health,0.801248,1.543103,0.4,0.1,0.016667,0.0,0.480725,0.0,...,2737,120,5,0.041667,0.100858,1,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,
4,http://bleacherreport.com/articles/1205138-the...,sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,0.446143,0.0,...,12032,162,10,0.098765,0.082569,0,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,


#### Ostale kolone- korekcija tipova

In [13]:
train_df['alchemy_category_score'] = train_df['alchemy_category_score'].astype(dtype='float')
train_df['news_front_page'] = train_df['news_front_page'].astype(dtype='float')

#### Podela podataka na X i y

In [14]:
train_df.drop(['url','title','body','url_words'],inplace=True,axis=1)

In [15]:
X=train_df.loc[:, train_df.columns!= 'label']
y=train_df['label']

In [16]:
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)

In [17]:
X.reset_index(inplace = True, drop=True)
y.reset_index(inplace = True, drop=True)

### Podela podataka na train i test

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

### Imputacija nedostajućih vrednosti
#### fit i transform- train

In [19]:
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.missing_map = {}

    def fit(self, X, y=None):

        num_atts = X.select_dtypes(exclude='object').columns.tolist()

        for att in num_atts:
            self.missing_map.update({att: X[att].median()})

        cat_atts = X.select_dtypes(include='object').columns.tolist()

        for att in cat_atts:
            self.missing_map.update({att: X[att].value_counts().index[0]})

        return (self)

    def transform(self, X, y=None):

        for att, impute_value in self.missing_map.items():
            X[att] = X[att].fillna(impute_value)
        return (X)

In [20]:
custom_imputer=CustomImputer().fit(X_train)
X_train=custom_imputer.transform(X_train)

#### transform- test

In [21]:
X_test=custom_imputer.transform(X_test)

In [22]:
X_test=custom_imputer.transform(X_test) #test df ne sadrži target kolonu.

In [23]:
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numerical_cols = list(X_train.select_dtypes(exclude='object').columns)

In [24]:
X_train.reset_index(inplace = True, drop=True) #proveriti da li treba
y_train.reset_index(inplace = True, drop=True)

X_test.reset_index(inplace = True, drop=True) #proveriti da li treba
y_test.reset_index(inplace = True, drop=True)

### Enkodiranje kategoričkih varijabli

#### Enkodiranje- train

In [25]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(dtype=np.int8).fit(X_train[categorical_cols])

X_new = pd.DataFrame(enc.transform(X_train[categorical_cols]).toarray())
X_new.columns = enc.get_feature_names(categorical_cols)

X_new.head()

Unnamed: 0,alchemy_category_arts_entertainment,alchemy_category_business,alchemy_category_computer_internet,alchemy_category_culture_politics,alchemy_category_gaming,alchemy_category_health,alchemy_category_law_crime,alchemy_category_recreation,alchemy_category_religion,alchemy_category_science_technology,alchemy_category_sports,alchemy_category_unknown,alchemy_category_weather,related_
0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [26]:
X_train=pd.concat([X_train[numerical_cols],X_new], axis=1)

#### Enkodiranje- test

In [27]:
X_new = pd.DataFrame(enc.transform(X_test[categorical_cols]).toarray())
X_new.columns = enc.get_feature_names(categorical_cols)

In [28]:
X_test=pd.concat([X_test[numerical_cols],X_new], axis=1)

In [29]:
X_train.reset_index(inplace = True, drop=True) #proveriti da li treba
y_train.reset_index(inplace = True, drop=True)
#just in case
X_test.reset_index(inplace = True, drop=True) #proveriti da li treba
y_test.reset_index(inplace = True, drop=True)

In [30]:
X_train.isna().sum().sum()

0

Metrika je area under the ROC curve. 

### Skaliranje podataka

In [31]:
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numerical_cols = list(X_train.select_dtypes(exclude='object').columns)

#### Skaliranje- train

In [32]:
scaler = MinMaxScaler()
scaler.fit(X_train[numerical_cols])
X_train[numerical_cols]=scaler.transform(X_train[numerical_cols])

#### Skaliranje- test

In [33]:
X_test[numerical_cols]=scaler.transform(X_test[numerical_cols])

In [34]:
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)

### Izbacivanje outliera

In [35]:
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numerical_cols = list(X_train.select_dtypes(exclude='object').columns)

In [36]:
z = np.abs(stats.zscore(X_train[numerical_cols]))
X_train['is_outlier'] = (np.any(z > 3, axis = 1)) 

In [37]:
X_train['label'] = y_train
del X_train['is_outlier']

In [38]:
y_train=X_train['label']

In [39]:
del X_train['label']

In [40]:
X_train.reset_index(inplace = True, drop=True)
y_train.reset_index(inplace = True, drop=True)

In [41]:
X_train.shape,y_train.shape

((5546, 34), (5546,))

### Kreiranje osnovnog modela

In [42]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

In [43]:
model.fit(X_train,y_train)
roc_auc_score(y_train, model.predict(X_train))

0.6607287185843288

In [44]:
roc_auc_score(y_test, model.predict(X_test))

0.6491224683211063

In [45]:
#osnovni model, sa izbacenim tekstualnim kategorijama, svim osim alchemy_category, daju auc od 0.6 na trainu i na test setu.
#brzo izvrsavanje
#grupisanje retkih članaka kod alchemy_category dalo je nešto lošiji rezultat, 0.59, ali držaćemo se ovog sa rezultatom od 0.6
#skaliranjem smo sa 0.6 skočili na 0.66 na trening odnosno 0.65 na test setu.