# Part 6 -- Model Selection

Using pipelines to find the best predictive model for our data.

**Load lib codes**

In [1]:
from os import chdir
chdir('/home/jovyan/work/Analyzing_Unstructured_Data_for_Finance/Analyzing_Unstructured_Data_for_Finance/')

from lib import *
# suppress_warnings()

**Try a bunch of naive models to see which works best with our data. Then, we will GridSearch over our final model and tune its hyperparameters.**

In [2]:
!pip install pymongo

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
X = joblib.load('../Analyzing_Unstructured_Data_for_Finance/data/4.X.pickle')

In [4]:
y = joblib.load('../Analyzing_Unstructured_Data_for_Finance/data/5.y_le.pickle')

In [5]:
X = X['cleaned_text']

**Before transforming any data, make sure to train_test_split so your test data doesn't bleed into your train data**

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [8]:
X_train.shape

(61806,)

In [9]:
y_train.shape

(61806,)

**Transform X's (tweets) into vectors using TF-IDF**

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=20, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [12]:
joblib.dump(tfidf, '../Analyzing_Unstructured_Data_for_Finance/data/6.tfidf_transformer.pickle')

['../Analyzing_Unstructured_Data_for_Finance/data/6.tfidf_transformer.pickle']

In [14]:
joblib.dump(X_train_tfidf, '../Analyzing_Unstructured_Data_for_Finance/data/6.X_train.pickle')

['../Analyzing_Unstructured_Data_for_Finance/data/6.X_train.pickle']

In [15]:
joblib.dump(y_train, '../Analyzing_Unstructured_Data_for_Finance/data/6.y_train.pickle')

['../Analyzing_Unstructured_Data_for_Finance/data/6.y_train.pickle']

In [16]:
joblib.dump(X_test_tfidf, '../Analyzing_Unstructured_Data_for_Finance/data/6.X_test.pickle')

['../Analyzing_Unstructured_Data_for_Finance/data/6.X_test.pickle']

In [17]:
joblib.dump(y_test, '../Analyzing_Unstructured_Data_for_Finance/data/6.y_test.pickle')

['../Analyzing_Unstructured_Data_for_Finance/data/6.y_test.pickle']

In [51]:
# X_tfidf = tfidf.transform(X)

In [52]:
# joblib.dump(X_tfidf, '../Analyzing_Unstructured_Data_for_Finance/data/6.X_tfidf')

['../Analyzing_Unstructured_Data_for_Finance/data/6.X_tfidf']

In [18]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [16]:
param_dict = {}

In [17]:
model_dict = {
    'mnb':GridSearchCV(MultinomialNB(),
                             param_grid=param_dict,
                             cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'lr':GridSearchCV(LogisticRegression(),
                             param_grid=param_dict,
                             cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'rf':GridSearchCV(RandomForestClassifier(),
                      param_grid=param_dict,
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'svc':GridSearchCV(SVC(),
                      param_grid=param_dict,
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'knn':GridSearchCV(SVC(),
                      param_grid=param_dict,
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
}

In [13]:
def fit_all_models(X,y, model_dict):
    for model in model_dict.keys():
        model_dict[model].fit(X,y)
        print("{:5} best score: {}".format(model, model_dict[model].best_score_))

```
mnb   best score: 0.5758291538585989
knn   best score: 0.5617214043035109
rf    best score: 0.5690664940948067
lr    best score: 0.5679339912635496
svc   best score: 0.5617214043035109
1:36:53.311107
```

In [None]:
# start = datetime.now()

# fit_all_models(X_train, y_train, model_dict)

# end = datetime.now()
# print(end - start)

mnb   best score: 0.5758291538585989
knn   best score: 0.5617214043035109
rf    best score: 0.5690664940948067
lr    best score: 0.5679339912635496


In [None]:
# start = datetime.now()

# fit_all_models(X_train, y_train, model_dict)

# end = datetime.now()
# print(end - start)

knn   best score: 0.5617214043035109
lr    best score: 0.5679339912635496
mnb   best score: 0.5758291538585989


# RUN OVERNIGHT

In [None]:
param_dict = {
    'mnb':{
        'alpha': np.linspace(.1,1,5)},
    'lr':{
        'C': np.logspace(-3,3,7)},
    'rf':{
        'max_depth': [5,10,20,30,40],
        'min_samples_leaf': [1,2,5,8],
        'criterion': ['gini','entropy']},
    'svc':{
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']},
    'knn':{
        'n_neighbors': np.arange(5,50,3)}
}

In [None]:
model_dict = {
    'mnb':GridSearchCV(MultinomialNB(),
                             param_grid=param_dict['mnb'],
                             cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'lr':GridSearchCV(LogisticRegression(),
                             param_grid=param_dict['lr'],
                             cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'rf':GridSearchCV(RandomForestClassifier(),
                      param_grid=param_dict['rf'],
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'svc':GridSearchCV(SVC(),
                      param_grid=param_dict['svc'],
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
    'knn':GridSearchCV(KNeighborsClassifier(),
                      param_grid=param_dict['knn'],
                      cv=StratifiedShuffleSplit(n_splits=5, random_state=42)),
}

In [None]:
start = datetime.now()

fit_all_models(X_train, y_train, model_dict)

end = datetime.now()
print(end - start)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

In [20]:
tfidf_lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,3), min_df=20, stop_words='english')),
    ('lr', LogisticRegression(C=1E10))
])


In [24]:
tfidf_lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=20,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [25]:
tfidf_lr_pipe.score(X_test, y_test)

0.55080248511519547

**Logistic Regression on tfidf_svd data**

In [26]:
tfidf_svd_lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,3), min_df=20, stop_words='english')),
    ('svd', TruncatedSVD(n_components = 10, random_state=42)),
    ('lr', LogisticRegression(C=1E10))
])


In [27]:
tfidf_svd_lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=20,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [28]:
tfidf_svd_lr_pipe.score(X_test, y_test)

0.55966865130727417