In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('seaborn-v0_8') # pretty matplotlib plots

import seaborn as sns
sns.set_theme('notebook', style='whitegrid', font_scale=1.25)

# autoload changes in other files, so you don't have to restart the Jupyter kernel each time you make a change to the imported code.
%load_ext autoreload
%autoreload 2

In [3]:
from sklearn.dummy import DummyClassifier
import sklearn.linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [9]:
from experimentloop import load_data, pipeline, grid_search, output_grid_result, C_GRID

In [5]:
x_NC, y_N = load_data()

In [7]:
output_grid_result('baseline', grid_search(x_NC, y_N, {
    "classify": [DummyClassifier(strategy="constant", constant=0)]
}))

baseline
best score:  0.5
best params:  {'classify': DummyClassifier(constant=0, strategy='constant')}
vocab size:  4510
DummyClassifier(constant=0, strategy='constant')


In [13]:
output_grid_result('count_default', grid_search(x_NC, y_N, {
    "featurize": [CountVectorizer()],
    "classify__C": C_GRID,
}))

count_default
best score:  0.872
best params:  {'classify__C': 3.162, 'featurize': CountVectorizer()}
vocab size:  4510
LogisticRegression(C=3.1622776601683795, max_iter=400)


In [137]:
output_grid_result('count_binary', grid_search(x_NC, y_N, {
    "featurize": [CountVectorizer(binary=True)],
    "classify__C": C_GRID,
}))

count_binary
best score:  0.872
best params:  {'classify__C': 3.162, 'featurize': CountVectorizer(binary=True)}
vocab size:  4510
LogisticRegression(C=3.1622776601683795, max_iter=400)


In [157]:
output_grid_result('tfidf_3chars', grid_search(x_NC, y_N, {
    "featurize": [TfidfVectorizer(lowercase=True, token_pattern=r"(?u)\b\w\w\w+\b")],
    'classify': [LogisticRegression(solver='lbfgs', max_iter=600)],
    "classify__C": np.logspace(-9, 6, 36),
}))

tfidf_3chars
best score:  0.887
best params:  {'classify': LogisticRegression(C=5.623413251903491, max_iter=600), 'classify__C': 5.623, 'featurize': TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b')}
vocab size:  4412
LogisticRegression(C=5.623413251903491, max_iter=600)


In [None]:
output_grid_result('tfidf_3chars', grid_search(x_NC, y_N, {
    "featurize": [TfidfVectorizer(lowercase=True, token_pattern=r"(?u)\b\w\w\w+\b")],
    'classify': [LogisticRegression(solver='lbfgs', max_iter=600)],
    "classify__C": np.logspace(0, 1, 21),
}))

tfidf_3chars
best score:  0.887
best params:  {'classify': LogisticRegression(C=5.623413251903491, max_iter=600), 'classify__C': 5.623, 'featurize': TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b')}
vocab size:  4412
LogisticRegression(C=5.623413251903491, max_iter=600)


In [132]:
import re

In [134]:
words = {}
cv = CountVectorizer().fit(x_NC.loc[:, 'text'])
all_counts = cv.transform(x_NC.loc[:, 'text']).sum(axis=0).A1
ix_to_word = {v: k for k, v in cv.vocabulary_.items()}
count_word_pairs = [(c, ix_to_word[i]) for i, c in enumerate(all_counts)]
[(c, w) for c, w in count_word_pairs if re.match(r'[0-9]', w)]
#' '.join([w for c, w in sorted([(c, w) for c, w in count_word_pairs if c <= 1])])


[(1, '00'),
 (30, '10'),
 (3, '100'),
 (2, '11'),
 (4, '12'),
 (3, '13'),
 (2, '15'),
 (1, '15g'),
 (1, '15pm'),
 (2, '17'),
 (1, '18'),
 (1, '18th'),
 (1, '1928'),
 (1, '1948'),
 (1, '1971'),
 (1, '1973'),
 (1, '1979'),
 (1, '1980'),
 (1, '1986'),
 (1, '1995'),
 (1, '1998'),
 (6, '20'),
 (1, '2000'),
 (1, '2007'),
 (1, '20th'),
 (1, '2160'),
 (1, '24'),
 (2, '25'),
 (4, '30'),
 (1, '30s'),
 (1, '325'),
 (2, '35'),
 (1, '350'),
 (1, '375'),
 (1, '3o'),
 (5, '40'),
 (1, '40min'),
 (1, '42'),
 (1, '44'),
 (2, '45'),
 (1, '4s'),
 (1, '4ths'),
 (4, '50'),
 (1, '5020'),
 (3, '510'),
 (1, '5320'),
 (1, '54'),
 (1, '5lb'),
 (1, '680'),
 (2, '70'),
 (1, '70000'),
 (1, '700w'),
 (2, '80'),
 (1, '80s'),
 (1, '8125'),
 (1, '85'),
 (1, '8525'),
 (1, '8530'),
 (1, '8pm'),
 (6, '90'),
 (1, '95'),
 (1, '99')]

In [34]:
output_grid_result('count_max_90pct_docs', grid_search(x_NC, y_N, {
    "featurize": [CountVectorizer()],
    "featurize__max_df": np.linspace(0.0, 0.2, 50),
    'classify': [LogisticRegression(solver='liblinear', max_iter=400)],
    "classify__C": C_GRID,
}))

count_max_90pct_docs
best score:  0.873
best params:  {'classify': LogisticRegression(max_iter=400, solver='liblinear'), 'classify__C': 1.0, 'featurize': CountVectorizer(max_df=0.10612244897959185), 'featurize__max_df': 0.106}
vocab size:  4501
LogisticRegression(max_iter=400, solver='liblinear')


155 fits failed out of a total of 7750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
155 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/theo/opt/miniconda3/envs/cs135_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/theo/opt/miniconda3/envs/cs135_env/lib/python3.10/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/theo/opt/miniconda3/envs/cs135_env/lib/python3.10/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/theo/opt/miniconda3/envs/cs135_en

In [62]:
'website' + x_NC['website_name'] + ' ' + x_NC['text']

0       websiteamazon Oh and I forgot to also mention ...
1              websiteamazon THAT one didn't work either.
2                        websiteamazon Waste of 13 bucks.
3       websiteamazon Product is useless, since it doe...
4       websiteamazon None of the three sizes they sen...
                              ...                        
2395    websiteyelp The sweet potato fries were very n...
2396    websiteyelp I could eat their bruschetta all d...
2397                     websiteyelp Ambience is perfect.
2398    websiteyelp We ordered the duck rare and it wa...
2399    websiteyelp Service was nice and the company w...
Length: 2400, dtype: object

In [63]:
output_grid_result('count_with_website', grid_search(x_NC, y_N, {
    #"extract_text": [FunctionTransformer(lambda x_NC: x_NC.agg(' '.join, axis=1))],
    "extract_text": [FunctionTransformer(lambda x_NC: 'website' + x_NC['website_name'] + ' ' + x_NC['text'])],
    "featurize": [CountVectorizer()],
    "classify__C": C_GRID,
}))

count_with_website
best score:  0.872
best params:  {'classify__C': 3.162, 'extract_text': FunctionTransformer(func=<function <lambda> at 0x1799815a0>), 'featurize': CountVectorizer()}
vocab size:  4513
LogisticRegression(C=3.1622776601683795, max_iter=400)


In [17]:
output_grid_result('count_top_n_words', grid_search(x_NC, y_N, {
    "featurize": [CountVectorizer()],
    "featurize__max_features": np.linspace(2000, 4510, 10, dtype='int'),
    "classify__C": C_GRID,
}))

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


count_top_n_words
best score:  0.872
best params:  {'classify__C': 3.162, 'featurize': CountVectorizer(max_features=3952), 'featurize__max_features': 3952}
vocab size:  3952
LogisticRegression(C=3.1622776601683795, max_iter=400)


In [11]:
output_grid_result('tfidf_default', grid_search(x_NC, y_N, {
    "featurize": [TfidfVectorizer()],
    "classify__C": C_GRID,
}))

tfidf_default
best score:  0.885
best params:  {'classify__C': 10.0, 'featurize': TfidfVectorizer()}
vocab size:  4510
LogisticRegression(C=10.0, max_iter=400)


In [None]:
def clean_text(text):
    import re
    import string
    text = text.lower()
    text = re.sub('[.*?]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('[\d\n]', ' ', text)
    return text