In [2]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [4]:
dataset = pd.read_csv('train.csv')

In [5]:
# Get the dataset schema
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      30000 non-null  int64 
 1   text    30000 non-null  object
 2   label   30000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 703.2+ KB


In [6]:
# Analysis
# create a new column with the length of each review
dataset['text_length'] = dataset['text'].apply(len)
# calculate the mean of the new column
mean_length = dataset['text_length'].mean()
# print the mean length
print('Mean review length:', mean_length)


Mean review length: 1193.7237333333333


In [6]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['label'], test_size=0.2, random_state=42)

In [22]:
# Define the pipeline 1
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),  # Vectorizer
    ('clf', LogisticRegression(solver='saga')),  # Classifier with saga solver
])

# Define the parameter grid
parameters = {
    'clf__fit_intercept': [False],  # Whether to calculate the intercept for this model
    'clf__C': [10, 1000],  # Inverse of regularization strength; smaller values specify stronger regularization
    # 'clf__penalty': ['elasticnet'],  # Penalty norm used in the regularization
    # 'clf__l1_ratio': [ 0.5]  # Mixing parameter for elastic net regularization
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)

In [9]:
# Define the pipeline 2
# Define the range of degrees to test
degrees = [2]

# Define the hyperparameters to test
param_grid = {'polynomialfeatures__degree': degrees,
              'logisticregression__C': [10]}

# Create a pipeline that includes the TfidfVectorizer transformation, PolynomialFeatures transformation and the LogisticRegression model
pipe = make_pipeline(TfidfVectorizer(), PolynomialFeatures(include_bias=False), LogisticRegression(random_state=0))

# Create a GridSearchCV object with the pipeline and the hyperparameters to test
grid_search = GridSearchCV(pipe, param_grid, cv=5)

In [23]:
# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

Fitting 5 folds for each of 2 candidates, totalling 10 fits




Best parameters set:
	clf__C: 10
	clf__fit_intercept: False


In [24]:
# Evaluate the model
y_pred = grid_search.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

# Predict sentiment
new_text = ["I love this movie", "I hate this movie"]
print(grid_search.predict(new_text))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2957
           1       0.88      0.90      0.89      3043

    accuracy                           0.89      6000
   macro avg       0.89      0.89      0.89      6000
weighted avg       0.89      0.89      0.89      6000

[1 0]


In [13]:
# Dump the best model
joblib.dump(grid_search.best_estimator_, 'hugo_dfidf_logreg_saga.bin')


['hugo_dfidf_logreg_saga.bin']

In [15]:

grid_search.best_estimator_.get_params()


{'memory': None,
 'steps': [('vect', TfidfVectorizer()),
  ('clf', LogisticRegression(C=10, solver='saga'))],
 'verbose': False,
 'vect': TfidfVectorizer(),
 'clf': LogisticRegression(C=10, solver='saga'),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.float64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__norm': 'l2',
 'vect__preprocessor': None,
 'vect__smooth_idf': True,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__sublinear_tf': False,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__use_idf': True,
 'vect__vocabulary': None,
 'clf__C': 10,
 'clf__class_weight': None,
 'clf__dual': False,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__l1_ratio': None,
 'clf__max_iter': 100,
 'clf__multi_class': 'auto',
 'clf__n_job