# Feature Selection

In [78]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, SelectFromModel, VarianceThreshold, RFE, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [42]:
# Load the normalized news data and join title and body into one text field per article
all_news = pd.read_csv("normalized_texts_labels.csv")
all_text = all_news["normalized_title"].str.cat(all_news["normalized_text"], sep=' ')

# Boolean mask of rows whose combined text is present; the same mask filters
# the labels so texts and targets stay aligned
x = all_text.notna()
y = all_news["fake"][x]
all_text = all_text[x]

In [43]:
# Hold out 10% of the articles for validation, stratified on the label and
# seeded so the split is reproducible
xtrain, xvalid, ytrain, yvalid = train_test_split(
    all_text,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [44]:
# Raw term counts over word 1- to 3-grams, with English stop words removed
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is learned from the training and validation texts together,
# then each split is encoded with that shared vocabulary
ctv.fit([*xtrain, *xvalid])
xtrain_ctv, xvalid_ctv = ctv.transform(xtrain), ctv.transform(xvalid)

In [45]:
# TF-IDF over word 1- to 3-grams; terms that occur in fewer than 3 documents are dropped.
# The idf / sublinear-tf switches are passed as real booleans: recent scikit-learn
# releases validate constructor parameters and reject the integer 1 for boolean options.
tfv = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words = 'english')

# Vocabulary and idf weights are fit on the training and validation texts together.
# NOTE(review): this lets validation-set document frequencies influence the features;
# fit on xtrain only if a strictly held-out estimate is needed.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

## Univariate Statistics

### F-score

In [73]:
# Drop zero-variance columns first (features that are constant across the training
# rows, e.g. all zeros); the univariate statistics approach doesn't work otherwise
selector = VarianceThreshold()
s = selector.fit(xtrain_ctv)
xt, xv = s.transform(xtrain_ctv), s.transform(xvalid_ctv)

# Keep the top 50% of the remaining features, scored with f_classif (the default)
select = SelectPercentile(percentile=50).fit(xt, ytrain)

# Apply the same column selection to the training and validation matrices
xt_selected, xv_selected = select.transform(xt), select.transform(xv)

In [66]:
# Logistic regression on the currently selected features, scored on the validation split
clf = LogisticRegression(C=1.0).fit(xt_selected, ytrain)
y_pred = clf.predict(xv_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))



Accuracy: 0.9520573801434503
Precision: 0.9498580889309366
Recall: 0.9313543599257885


In [67]:
# Multinomial naive Bayes on the currently selected features, scored on the validation split
clf = MultinomialNB().fit(xt_selected, ytrain)
y_pred = clf.predict(xv_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))

Accuracy: 0.8267270668176671
Precision: 0.9936204146730463
Recall: 0.577922077922078


In [74]:
# Gradient-boosted trees on the currently selected features.
# Row and column subsampling (subsample / colsample_bytree = 0.8) are random, so the
# seed is handed to the model explicitly rather than left as an unused variable.
seed = 1234
clf = XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                    subsample=0.8, nthread=10, learning_rate=0.1,
                    random_state=seed)
clf.fit(xt_selected, ytrain)
y_pred = clf.predict(xv_selected)
# Validation-set metrics
print("Accuracy:",metrics.accuracy_score(yvalid, y_pred))
print("Precision:",metrics.precision_score(yvalid, y_pred))
print("Recall:",metrics.recall_score(yvalid, y_pred))

Accuracy: 0.9577198942997357
Precision: 0.9488847583643123
Recall: 0.9471243042671614


In [68]:
# Drop zero-variance columns first (features that are constant across the training
# rows); the univariate statistics approach doesn't work otherwise
selector = VarianceThreshold()
s = selector.fit(xtrain_tfv)
xt, xv = s.transform(xtrain_tfv), s.transform(xvalid_tfv)

# Keep the top 50% of the remaining TF-IDF features, scored with f_classif (the default)
select = SelectPercentile(percentile=50).fit(xt, ytrain)

# Apply the same column selection to the training and validation matrices
xt_selected, xv_selected = select.transform(xt), select.transform(xv)

In [69]:
# Logistic regression on the currently selected features, scored on the validation split
clf = LogisticRegression(C=1.0).fit(xt_selected, ytrain)
y_pred = clf.predict(xv_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))



Accuracy: 0.947527368818422
Precision: 0.9643916913946587
Recall: 0.9044526901669759


In [70]:
# Multinomial naive Bayes on the currently selected features, scored on the validation split
clf = MultinomialNB().fit(xt_selected, ytrain)
y_pred = clf.predict(xv_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))

Accuracy: 0.7557568893922235
Precision: 0.9976905311778291
Recall: 0.4007421150278293


In [72]:
# Gradient-boosted trees on the currently selected features.
# Row and column subsampling (subsample / colsample_bytree = 0.8) are random, so the
# seed is handed to the model explicitly rather than left as an unused variable.
seed = 1234
clf = XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                    subsample=0.8, nthread=10, learning_rate=0.1,
                    random_state=seed)
clf.fit(xt_selected, ytrain)
y_pred = clf.predict(xv_selected)
# Validation-set metrics
print("Accuracy:",metrics.accuracy_score(yvalid, y_pred))
print("Precision:",metrics.precision_score(yvalid, y_pred))
print("Recall:",metrics.recall_score(yvalid, y_pred))

Accuracy: 0.9528123820309551
Precision: 0.9440820130475303
Recall: 0.9397031539888683


### Mutual Information

In [None]:
# Drop zero-variance columns first (features that are constant across the training
# rows); the univariate statistics approach doesn't work otherwise
selector = VarianceThreshold()
s = selector.fit(xtrain_ctv)
xt, xv = s.transform(xtrain_ctv), s.transform(xvalid_ctv)

# Keep the top 50% of the remaining features, scored by mutual information with the
# label (mutual_info_classif) instead of the default f_classif
select = SelectPercentile(score_func=mutual_info_classif, percentile=50).fit(xt, ytrain)

# Apply the same column selection to the training and validation matrices
xt_selected, xv_selected = select.transform(xt), select.transform(xv)

## Model-Based Feature Selection

### Random Forest

In [33]:
# Rank the count features by random-forest importance and keep those whose
# importance is at or above the median
forest = RandomForestClassifier(n_estimators=100, random_state=42)
select = SelectFromModel(forest, threshold="median").fit(xtrain_ctv, ytrain)
xtrain_selected, xvalid_selected = select.transform(xtrain_ctv), select.transform(xvalid_ctv)

In [34]:
# Logistic regression on the model-selected features, scored on the validation split
clf = LogisticRegression(C=1.0).fit(xtrain_selected, ytrain)
y_pred = clf.predict(xvalid_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))



Accuracy: 0.9490373725934315
Precision: 0.9394221808014911
Recall: 0.935064935064935


In [35]:
# Multinomial naive Bayes on the model-selected features, scored on the validation split
clf = MultinomialNB().fit(xtrain_selected, ytrain)
y_pred = clf.predict(xvalid_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))

Accuracy: 0.922234805587014
Precision: 0.9007352941176471
Recall: 0.9090909090909091


In [36]:
# Rank the TF-IDF features by random-forest importance and keep those whose
# importance is at or above the median
forest = RandomForestClassifier(n_estimators=100, random_state=42)
select = SelectFromModel(forest, threshold="median").fit(xtrain_tfv, ytrain)
xtrain_selected, xvalid_selected = select.transform(xtrain_tfv), select.transform(xvalid_tfv)

In [37]:
# Logistic regression on the model-selected features, scored on the validation split
clf = LogisticRegression(C=1.0).fit(xtrain_selected, ytrain)
y_pred = clf.predict(xvalid_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))



Accuracy: 0.9497923744809362
Precision: 0.9664363277393879
Recall: 0.9081632653061225


In [38]:
# Multinomial naive Bayes on the model-selected features, scored on the validation split
clf = MultinomialNB().fit(xtrain_selected, ytrain)
y_pred = clf.predict(xvalid_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))

Accuracy: 0.748961872404681
Precision: 0.9975903614457832
Recall: 0.38404452690166974


### Extra Trees

In [None]:
# Rank the count features by extra-trees importance and keep those whose
# importance is at or above the median
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
select = SelectFromModel(extra_trees, threshold="median").fit(xtrain_ctv, ytrain)
xtrain_selected, xvalid_selected = select.transform(xtrain_ctv), select.transform(xvalid_ctv)

In [None]:
# Logistic regression on the model-selected features, scored on the validation split
clf = LogisticRegression(C=1.0).fit(xtrain_selected, ytrain)
y_pred = clf.predict(xvalid_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))

In [None]:
# Multinomial naive Bayes on the model-selected features, scored on the validation split
clf = MultinomialNB().fit(xtrain_selected, ytrain)
y_pred = clf.predict(xvalid_selected)
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(yvalid, y_pred))

## Iterative Feature Selection

In [None]:
# Recursive feature elimination: repeatedly fit the forest and discard the least
# important features until 40 remain.
# step=0.1 removes a tenth of the original feature count per round. The default
# step=1 discards one feature per refit, i.e. roughly one 100-tree forest fit per
# n-gram in the count vocabulary, which is impractical for a 1- to 3-gram vocabulary.
# NOTE(review): presumably why this cell was never executed -- confirm.
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
            n_features_to_select=40, step=0.1)
select.fit(xtrain_ctv, ytrain)
xtrain_rfe = select.transform(xtrain_ctv)
xvalid_rfe = select.transform(xvalid_ctv)