# 4 - Improving Model Performance

http://datascience.computingpatterns.com/

## Add better features

In [8]:
%run 4.py

In [9]:
train = load_feature_matrix("../train.csv")
mytrain, mytest = train_test_split(train, test_size = .4)

In [10]:
columns = ["ReputationAtPostCreation", "OwnerUndeletedAnswerCountAtPostTime", "BodyMarkdownLength", "TitleLength"]

In [11]:
features_train, features_test, new_columns = expand_with_bow_features(mytrain, mytest, ["Title"], max_features=200)

In [12]:
new_columns[:10]

[u'Titledevelopment',
 u'Titleservice',
 u'Titlethat',
 u'Titlehas',
 u'Titleinside',
 u'Titlevariable',
 u'Titleworking',
 u'Titleone',
 u'Titleopen',
 u'Titlewith']

In [13]:
features_train.shape

(84163, 219)

In [14]:
def get_score(train, test, model, columns):
    model.fit(X=np.asarray(train[columns]), y = np.asarray(train.OpenStatus).transpose())
    predictions = model.predict_proba(np.asarray(test[columns]))[:,1]
    return log_loss(test.OpenStatus.values, predictions)

In [15]:
get_score(mytrain, mytest, LogisticRegression(C=0.1),  ["BodyMarkdownLength", "TitleLength"])

0.68302754673851596

In [16]:
get_score(features_train, features_test,  LogisticRegression(C=0.1), new_columns + ["BodyMarkdownLength", "TitleLength"])

0.62007036008903393

### Suggestions

In [17]:
# Remove stopwords
from sklearn.feature_extraction import stop_words
stop_words.ENGLISH_STOP_WORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [18]:
# Use bigram
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
analyze = bigram_vectorizer.build_analyzer()

In [19]:
analyze('Bi-grams are cool!') == (['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])

True

## Optimize model parameters

In [25]:
param_grid = [{'C': [1, 10, 100, 1000]}, {'verbose': [0, 1]}]
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [26]:
print param_grid

[{'C': [1, 10, 100, 1000]}, {'verbose': [0, 1]}]


In [27]:
clf = GridSearchCV(LogisticRegression(), 
                   param_grid, 
                   cv=10,
                   scoring="log_loss")

In [28]:
clf.fit(np.array(features_train[columns]), features_train.OpenStatus.values)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000]}, {'verbose': [0, 1]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='log_loss', verbose=0)

In [29]:
clf.grid_scores_

[mean: -0.68090, std: 0.00267, params: {'C': 1},
 mean: -0.68090, std: 0.00267, params: {'C': 10},
 mean: -0.68090, std: 0.00267, params: {'C': 100},
 mean: -0.68090, std: 0.00267, params: {'C': 1000},
 mean: -0.68090, std: 0.00267, params: {'verbose': 0},
 mean: -0.68090, std: 0.00267, params: {'verbose': 1}]

## Use ensemble methods 

In [30]:
clf.get_params()

{'cv': 10,
 'error_score': 'raise',
 'estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr',
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0),
 'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'ovr',
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'liblinear',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'fit_params': {},
 'iid': True,
 'loss_func': None,
 'n_jobs': 1,
 'param_grid': [{'C': [1, 10, 100, 1000]}, {'verbose': [0, 1]}],
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'score_func': None,
 'scoring': 'log_loss',
 'verbose': 0}

Random forest: ensemble method bagging

Solution: Bagging (Bootstrap aggregating) is a method for creating multiple models (models ensemble) by making different random samples of the original data set. Each of the models is trained with one of those samples. The name bagging is short for bootstrap aggregating. A bootstrap sample is a random sample with replacement. The final result of the different models is done by voting in classification problems or by averaging in regression problems. Bagging works well for instable classifiers, e.g. decision trees and neural networks. A classifier is unstable if small changes in training data lead to significantly different models.

In [31]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators        = 32,     # Number of decision trees
                            min_samples_leaf    = 10,
                            min_samples_split   = 10,
                            n_jobs              = -1)     # Number of cores I'm using in parallel)
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=32, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
get_score(features_train, features_train,  make_submitter(columns, rf, mytrain, mytest), co)


TypeError: get_score() takes exactly 4 arguments (3 given)

In [11]:
get_score(features_tr, features_test, make_submitter(new_columns + ["BodyMarkdownLength", "TitleLength"], rf))

0.59781208661851404