# Part 1: Feature Selection with `scikit-learn`

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
import pandas as pd
import numpy as np

## Data inspection

In [28]:
# Load the 160k-tweet sample into a DataFrame and preview its first two rows.
data = pd.read_csv(
    filepath_or_buffer='data/tweets.160k.random.csv',
    encoding='utf-8',
)
data.head(n=2)

Unnamed: 0,label,id,date,query,user,text
0,4,1985770747,Sun May 31 17:44:25 PDT 2009,NO_QUERY,vozabala,Getting ready for another week of fun and game...
1,0,2322735567,Wed Jun 24 23:10:08 PDT 2009,NO_QUERY,liannecab,"http://twitpic.com/8cp6u - I want it, sooo bad"


In [30]:
# Number of columns in the frame (second entry of the (rows, columns) shape).
print(data.shape[1])
# How many tweets carry each label value.
data.loc[:, 'label'].value_counts()

6


4    80259
0    79741
Name: label, dtype: int64

## Define Pipeline Components

In [12]:
# Fixed seed so the train/test split -- and therefore every accuracy printed
# by the cells below -- is reproducible after Restart Kernel -> Run All.
SEED = 42

# Split data 80:20. The whole DataFrame is passed as X; the first pipeline
# step (get_tweet_cols) narrows it to the 'text' column before vectorizing.
xtrain, xtest, ytrain, ytest = train_test_split(
    data, data['label'], test_size=0.2, random_state=SEED
)


def _select_text_column(frame):
    """Return the 'text' column of ``frame`` (the raw tweet strings)."""
    return frame['text']


# Pipeline step that extracts the 'text' column from the incoming DataFrame so
# that CountVectorizer receives the tweet strings. validate=False disables
# FunctionTransformer's input validation. A named function is used instead of
# a lambda because a transformer wrapping a lambda cannot be pickled.
get_tweet_cols = FunctionTransformer(_select_text_column, validate=False)

# Produce both unigrams and bigrams, lowercased, with English stop words removed
bow_vector = CountVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 2))

# Multinomial Naive Bayes classifier fitted on the token-count features
mnb_classifier = MultinomialNB()

## Training

In [13]:
# Baseline: text extraction -> bag of words -> Naive Bayes, no feature selection.
pipeline = Pipeline(steps=[
    ('get_tweets', get_tweet_cols),
    ('bow', bow_vector),
    ('mnb', mnb_classifier),
])
# fit() returns the fitted pipeline, so prediction can be chained onto it.
predicted = pipeline.fit(xtrain, ytrain).predict(xtest)
print(accuracy_score(y_true=ytest, y_pred=predicted))

0.7505625


## Feature Selection

### `SelectPercentile` vs. `SelectKBest`
* `SelectPercentile` scores the features using a function and keeps only those in the top percentile that you specify, excluding all features below it.
* `SelectKBest` scores the features using a function and then "removes all but the k highest scoring features."

### `f_classif`
Used only for categorical targets; it scores each feature with the F-statistic from the Analysis of Variance (ANOVA) statistical test.

### `chi2`
Computes the chi-square statistic between each feature and a categorical target; it is less sensitive to a nonlinear relationship between the predictive variable and its target. It requires non-negative feature values, such as the term counts used here.


In [14]:
# Keep only the top 5% of n-gram features, ranked by their chi-square score.
feature_select = SelectPercentile(score_func=chi2, percentile=5)

pipeline = Pipeline(steps=[
    ('get_tweets', get_tweet_cols),
    ('bow', bow_vector),
    ('feature_select', feature_select),
    ('mnb', mnb_classifier),
])
# fit() returns the fitted pipeline, so prediction can be chained onto it.
predicted = pipeline.fit(xtrain, ytrain).predict(xtest)
print(accuracy_score(y_true=ytest, y_pred=predicted))

0.7521875


In [26]:
# Keep only the top 5% of n-gram features, ranked by their ANOVA F-score.
feature_select = SelectPercentile(score_func=f_classif, percentile=5)

pipeline = Pipeline(steps=[
    ('get_tweets', get_tweet_cols),
    ('bow', bow_vector),
    ('feature_select', feature_select),
    ('mnb', mnb_classifier),
])
# fit() returns the fitted pipeline, so prediction can be chained onto it.
predicted = pipeline.fit(xtrain, ytrain).predict(xtest)
print(accuracy_score(y_true=ytest, y_pred=predicted))

0.75146875


In [34]:
# Keep only the 100 n-gram features with the highest chi-square score.
feature_select = SelectKBest(score_func=chi2, k=100)

pipeline = Pipeline(steps=[
    ('get_tweets', get_tweet_cols),
    ('bow', bow_vector),
    ('feature_select', feature_select),
    ('mnb', mnb_classifier),
])
# fit() returns the fitted pipeline, so prediction can be chained onto it.
predicted = pipeline.fit(xtrain, ytrain).predict(xtest)
print(accuracy_score(y_true=ytest, y_pred=predicted))

0.67709375


In [35]:
# Keep only the 100 n-gram features with the highest ANOVA F-score.
feature_select = SelectKBest(score_func=f_classif, k=100)

pipeline = Pipeline(steps=[
    ('get_tweets', get_tweet_cols),
    ('bow', bow_vector),
    ('feature_select', feature_select),
    ('mnb', mnb_classifier),
])
# fit() returns the fitted pipeline, so prediction can be chained onto it.
predicted = pipeline.fit(xtrain, ytrain).predict(xtest)
print(accuracy_score(y_true=ytest, y_pred=predicted))

0.67721875


# Part 2: Weka MAE/RMSE

## Evaluation on full training set vs. 20% of dataset

* MAE on the held-out 20% of the dataset is substantially higher than on the full training set (17.588 vs. 6.9732).
* RMSE on the held-out 20% of the dataset is also substantially higher than on the full training set (20.438 vs. 8.442).

```
=== Evaluation on training data ===
Target                      1-step-ahead
========================================
passenger_numbers
  N                                  103
  Mean absolute error             6.9732
  Root mean squared error          8.442

Total number of instances: 115

=== Evaluation on test data ===
Target                      1-step-ahead
========================================
passenger_numbers
  N                                   29
  Mean absolute error             17.588
  Root mean squared error         20.438

Total number of instances: 29

```

## Evaluation without feature selection

* Without feature selection, performance degrades only marginally on both the full training set (MAE 6.9732 → 7.0827, RMSE 8.442 → 8.5459) and the held-out 20% of the dataset (MAE 17.588 → 17.6701, RMSE 20.438 → 20.4839).

```
=== Evaluation on training data ===
Target                      1-step-ahead
========================================
passenger_numbers
  N                                  103
  Mean absolute error             7.0827
  Root mean squared error         8.5459

Total number of instances: 115

=== Evaluation on test data ===
Target                      1-step-ahead
========================================
passenger_numbers
  N                                   29
  Mean absolute error            17.6701
  Root mean squared error        20.4839

Total number of instances: 29

```