# Sentiment Analysis On Airline-Related Twitter Data: Part II

In [1]:
import warnings
warnings.filterwarnings('ignore') # hide majority of Python warnings that may come up along the way

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import sqlite3

%matplotlib inline

In [2]:
# Retrieve the pickled train/test splits from the working directory.
# Each pickle is unpacked into four objects: train features, test features,
# train labels and test labels.
# NOTE(review): pickle.load can execute arbitrary code while deserializing --
# only load .pkl files that come from a trusted source.
with open("count_split.pkl", "rb") as count_split:
    X_train_count, X_test_count, y_train, y_test = pickle.load(count_split)
    
# y_train / y_test are rebound here, so the labels read from tfidf_split.pkl
# replace the ones read from count_split.pkl.
# NOTE(review): presumably both files carry identical labels -- confirm.
with open("tfidf_split.pkl", "rb") as tfidf_split:
    X_train_tfidf, X_test_tfidf, y_train, y_test = pickle.load(tfidf_split)
    
# Check the shapes of the resulting count feature matrices and label arrays
# (the bare tuple on the last line is what the cell displays).
X_train_count.shape, X_test_count.shape, y_train.shape, y_test.shape

((10863, 9371), (3622, 9371), (10863,), (3622,))

## Grid Search With Top-Performing Models

We'll now attempt some hyperparameter optimization through grid search with the 5 algorithms we chose to move forward with. We'll create parameter grids for each, use scikit-learn's `GridSearchCV` to determine the highest performing parameter combination for each, and then use each "best performing" model to make predictions on the test set.

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

# function to iterate through collection of models and param grids, find optimal parameter values for each
# outputs a dict of refit optimal param models, which we can use to make predictions with test data
def run_grid_search(models, param_grids, X, y, metric, n_jobs = None):
    """Grid-search every model and return the best estimator found for each.

    For each entry in ``models`` a ``GridSearchCV`` is built from that estimator
    and the grid stored under the same key in ``param_grids``, fitted on
    ``X`` / ``y``, and a short summary (best score, best parameters) is printed.

    Parameters
    ----------
    models : dict
        Maps a model name to the estimator instance to tune.
    param_grids : dict
        Maps each model name in ``models`` to the parameter grid handed to
        ``GridSearchCV``.
    X, y
        Training features and labels, forwarded unchanged to ``GridSearchCV.fit``.
    metric
        Forwarded to ``GridSearchCV`` as ``scoring``.
    n_jobs : optional
        Forwarded to ``GridSearchCV`` as ``n_jobs`` (default ``None``).

    Returns
    -------
    dict
        Maps each model name to the ``best_estimator_`` of its grid search.

    Raises
    ------
    KeyError
        If a name in ``models`` has no entry in ``param_grids``.
    """
    result = {}

    for model_name, model in models.items():
        search = GridSearchCV(model, param_grids[model_name], scoring = metric, n_jobs = n_jobs)
        search.fit(X, y)

        result[model_name] = search.best_estimator_

        print("Grid search run for model: {}".format(model_name))
        # The colon is required: "{:.2f}" is a format spec, whereas "{.2f}" is
        # parsed as an attribute lookup on the argument and raises AttributeError.
        print("Score for best estimator: {:.2f}".format(search.best_score_))
        print("Best parameters:")
        print(search.best_params_)

    return result

In [4]:
# build collection of parameter grids (for grid search) for each algorithm
# keys must match the keys of the `models` dict, since grids are looked up by model name
# NOTE(review): "penalty": "none", "loss": "log" and "normalize" are version-dependent
# in scikit-learn (deprecated/removed in later releases) -- confirm against the installed version.
param_grids = {
    "logistic": {
        "penalty": ["l2", "elasticnet", "none"],
        # l1_ratio is only read for penalty="elasticnet"; without it every
        # elasticnet candidate fails to fit and is never actually evaluated
        "l1_ratio": [0.5],
        "class_weight": [None, "balanced"],
        "random_state": [1],
        "tol": [0.0001, 0.001, 0.01],
        # not every solver supports every penalty; unsupported pairs fail at fit
        # time and are scored by GridSearchCV's error_score instead of aborting
        "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
    },
    "sgd": {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "penalty": ["l2", "l1", "elasticnet"],
        "alpha": [0.0001, 0.001, 0.01],
        "tol": [0.001, 0.01],
        "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
        # the "constant", "invscaling" and "adaptive" schedules need a positive
        # initial learning rate; with the 0.0 default those candidates cannot be fitted
        "eta0": [0.01],
        "class_weight": [None, "balanced"],
        # fixed seed so repeated searches are comparable (same seed as "logistic")
        "random_state": [1]
    },
    "forest": {
        "n_estimators": [50, 100, 200],
        "criterion": ["gini", "entropy"],
        "max_depth": [None, 20, 50, 100],
        # fixed seed so repeated searches are comparable (same seed as "logistic")
        "random_state": [1]
    },
    "mult_nb": {
        # NOTE(review): alpha=0 means no smoothing; scikit-learn may clip it to a
        # tiny positive value with a warning -- confirm for the installed version
        "alpha": [0, 0.1, 0.5, 1.0],
        "fit_prior": [False, True]
    },
    "ridge": {
        "alpha": [0.1, 0.5, 1.0],
        "normalize": [True, False],
        "max_iter": [None, 50, 100, 200],
        "tol": [0.0001, 0.001, 0.01],
        "class_weight": [None, "balanced"],
        # NOTE(review): some solvers may reject sparse feature matrices; such
        # candidates would simply fail and be skipped -- confirm
        "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
        # seeds the stochastic "sag"/"saga" solvers (same seed as "logistic")
        "random_state": [1]
    }
}

# estimators to tune, each created with its default hyperparameters;
# the names here are the same ones used as keys in `param_grids`
models = dict(
    logistic = LogisticRegression(),
    sgd = SGDClassifier(),
    forest = RandomForestClassifier(),
    mult_nb = MultinomialNB(),
    ridge = RidgeClassifier(),
)

In [None]:
# keep the returned best estimators in a named variable so later cells can use
# them for test-set predictions (otherwise they are only shown in the cell output)
best_models = run_grid_search(models, param_grids, X_train_count, y_train, metric = "accuracy")
best_models