# Q4 Parameter Tuning
### Isaac Tabb

Step 0: Read in the sets.

In [None]:
import pandas as pd
from google.colab import files
# Open the Colab upload widget; the result is indexed by uploaded filename.
uploaded = files.upload()

import io

# Parse both uploaded CSVs straight from their in-memory bytes, training set first.
train_df, valid_df = (
    pd.read_csv(io.BytesIO(uploaded[csv_name]))
    for csv_name in ('training_set.csv', 'validation_set.csv')
)

Saving training_set.csv to training_set.csv
Saving validation_set.csv to validation_set.csv


Create the labels set and texts set for both training and validation.

In [None]:
# Convert each dataframe into a list of per-row dicts (one dict per tweet).
train_dct = train_df.to_dict('records')
valid_dct = valid_df.to_dict('records')

# Split every record list into two parallel lists: the tweet text ('text' column)
# and its label ('team' column).
train_tweets = [row['text'] for row in train_dct]
train_labels = [row['team'] for row in train_dct]

valid_tweets = [row['text'] for row in valid_dct]
valid_labels = [row['team'] for row in valid_dct]

First, we will select the best Regularization C Parameter.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Exponents x for the regularization strength C = 10^x: -3 to 5 in steps of 1.
# .tolist() yields plain Python ints, so 10**x with a negative x evaluates to a float.
c_param_exponent = np.arange(-3,6,1).tolist()

# The TF-IDF features do not depend on C, so build them once instead of
# refitting the vectorizer on every iteration of the sweep.
vectorizer_lg_tfidf = TfidfVectorizer()
# fit to the training set only
vectorizer_lg_tfidf.fit(train_tweets)
# transform both training and validation with the training-set vocabulary
vectorized_train_lg_tfidf = vectorizer_lg_tfidf.transform(train_tweets)
vectorized_valid_lg_tfidf = vectorizer_lg_tfidf.transform(valid_tweets)

# iterate through each setting of the 10^x exponent
for val in c_param_exponent:
  # create logistic regression classifier with C = 10^val
  clf = LogisticRegression(C=(10**val)).fit(vectorized_train_lg_tfidf, train_labels)
  labels_predicted = clf.predict(vectorized_valid_lg_tfidf)

  # output the macro-averaged F1 score on the validation set for this C
  f1 = f1_score(valid_labels, labels_predicted, average='macro')
  print("C=10^"+str(val))
  print(f"{f1=:.3f}")

C=10^-3
f1=0.210
C=10^-2
f1=0.210
C=10^-1
f1=0.250
C=10^0
f1=0.473
C=10^1
f1=0.515
C=10^2
f1=0.500
C=10^3
f1=0.484
C=10^4
f1=0.475
C=10^5
f1=0.473


The top C values that yield the highest F1-scores are 10^1 (.515) and 10^2 (.500). We will run a parameter grid later on with these C values.

Now let's look at sublinear_tf.

In [None]:
# sublinear_tf is a boolean switch, so there are only two settings to compare
stf = [True, False]

for val in stf:
  # build a vectorizer with the current sublinear_tf setting and fit it on the
  # training tweets; fit() hands back the vectorizer itself, so both names refer to it
  vectorizer_lg_tfidf = X_lg_tfidf = TfidfVectorizer(sublinear_tf=val).fit(train_tweets)
  # vectorize training and validation tweets with the fitted vectorizer
  vectorized_train_lg_tfidf, vectorized_valid_lg_tfidf = (
      vectorizer_lg_tfidf.transform(tweets) for tweets in (train_tweets, valid_tweets)
  )
  # logistic regression with default settings, trained on the training matrix
  clf = LogisticRegression()
  clf.fit(vectorized_train_lg_tfidf, train_labels)
  labels_predicted = clf.predict(vectorized_valid_lg_tfidf)

  # report the macro-averaged F1 score on the validation set for this setting
  f1 = f1_score(valid_labels, labels_predicted, average='macro')
  print("sublinear_tf="+str(val))
  print(f"{f1=:.3f}")

sublinear_tf=True
f1=0.473
sublinear_tf=False
f1=0.473


True and False for sublinear_tf yield the same value. When we run our parameter grid, we will evaluate both.

We will now look at max_features.

Let's look at how many features we have in our dataset.

In [None]:
# build a default TF-IDF vectorizer and fit it on the training tweets;
# fit() hands back the vectorizer itself, so both names refer to it
vectorizer_lg_tfidf = X_lg_tfidf = TfidfVectorizer().fit(train_tweets)
# vectorize training and validation tweets with the fitted vectorizer
vectorized_train_lg_tfidf, vectorized_valid_lg_tfidf = (
    vectorizer_lg_tfidf.transform(tweets) for tweets in (train_tweets, valid_tweets)
)
# the second dimension of the training matrix is the number of features
vectorized_train_lg_tfidf.shape[1]

10395

We have 10395 features. We will sweep max_features from 315 to 10395 in steps of 315, which gives us an even 33 options.

In [None]:
# candidate vocabulary sizes: 315, 630, ..., 10395 (33 values in steps of 315)
max_features = list(range(315, 10396, 315))

for val in max_features:
  # build a vectorizer capped at the current number of features and fit it on the
  # training tweets; fit() hands back the vectorizer itself, so both names refer to it
  vectorizer_lg_tfidf = X_lg_tfidf = TfidfVectorizer(max_features=val).fit(train_tweets)
  # vectorize training and validation tweets with the fitted vectorizer
  vectorized_train_lg_tfidf, vectorized_valid_lg_tfidf = (
      vectorizer_lg_tfidf.transform(tweets) for tweets in (train_tweets, valid_tweets)
  )
  # logistic regression with default settings, trained on the training matrix
  clf = LogisticRegression()
  clf.fit(vectorized_train_lg_tfidf, train_labels)
  labels_predicted = clf.predict(vectorized_valid_lg_tfidf)

  # report the macro-averaged F1 score on the validation set for this cap
  f1 = f1_score(valid_labels, labels_predicted, average='macro')
  print("max_features="+str(val))
  print(f"{f1=:.3f}")

max_features=315
f1=0.477
max_features=630
f1=0.488
max_features=945
f1=0.484
max_features=1260
f1=0.493
max_features=1575
f1=0.491
max_features=1890
f1=0.485
max_features=2205
f1=0.483
max_features=2520
f1=0.483
max_features=2835
f1=0.478
max_features=3150
f1=0.475
max_features=3465
f1=0.473
max_features=3780
f1=0.478
max_features=4095
f1=0.476
max_features=4410
f1=0.475
max_features=4725
f1=0.472
max_features=5040
f1=0.473
max_features=5355
f1=0.471
max_features=5670
f1=0.473
max_features=5985
f1=0.474
max_features=6300
f1=0.477
max_features=6615
f1=0.477
max_features=6930
f1=0.477
max_features=7245
f1=0.473
max_features=7560
f1=0.473
max_features=7875
f1=0.473
max_features=8190
f1=0.472
max_features=8505
f1=0.470
max_features=8820
f1=0.467
max_features=9135
f1=0.469
max_features=9450
f1=0.467
max_features=9765
f1=0.469
max_features=10080
f1=0.473
max_features=10395
f1=0.473


The majority of the F1-scores fall within the 0.46–0.47 range, although a few reach the 0.48s and 0.49s. The values 630, 945, 1260, 1575, 1890, 2205, and 2520 yield the best F1-scores, so we will use these in the final parameter grid.

Finally, our parameter of choice will be Logistic Regression solver.

In [None]:
# Solvers to compare. NOTE(review): per the scikit-learn docs, 'liblinear' handles
# multiclass problems one-vs-rest rather than with a multinomial loss, so if 'team'
# has more than two classes its score is not directly comparable -- confirm.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']

# The TF-IDF features do not depend on the solver, so build them once instead of
# refitting the vectorizer on every iteration.
vectorizer_lg_tfidf = TfidfVectorizer()
# fit to the training set only
vectorizer_lg_tfidf.fit(train_tweets)
# transform both training and validation with the training-set vocabulary
vectorized_train_lg_tfidf = vectorizer_lg_tfidf.transform(train_tweets)
vectorized_valid_lg_tfidf = vectorizer_lg_tfidf.transform(valid_tweets)

# iterate through the solver options
for val in solvers:
  # define our classifier, with solver set to the current value
  clf = LogisticRegression(solver=val).fit(vectorized_train_lg_tfidf, train_labels)
  labels_predicted = clf.predict(vectorized_valid_lg_tfidf)

  # output the macro-averaged F1 score on the validation set for this solver
  f1 = f1_score(valid_labels, labels_predicted, average='macro')
  print("solver="+str(val))
  print(f"{f1=:.3f}")

solver=lbfgs
f1=0.473
solver=liblinear
f1=0.437
solver=newton-cg
f1=0.473
solver=sag
f1=0.473
solver=saga
f1=0.471


Every solver other than 'liblinear' works about as well as the others. Now let's run a parameter grid with all of our options.

In [None]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

# Parameter grid combining the best candidates from the single-parameter sweeps.
# NOTE: 'C' holds the exponent x, not the value itself; the classifier below is
# built with C = 10**x, so 'C': 1 in the results means C = 10.
param_grid = {'C': [1,2], 'sublinear_tf': [True, False], 
              'max_features': [630,945,1260,1575,1890,2205,2520],
              'solver':['lbfgs','newton-cg','sag','saga']}

# there are a lot of options so we will only save the top-5 F1-scores,
# each stored as [params, f1] and kept sorted from best to worst
maxes = [[0,0],[0,0],[0,0],[0,0],[0,0]]

# The TF-IDF matrices depend only on (sublinear_tf, max_features), not on C or
# solver, so cache them per setting instead of refitting the vectorizer for
# every one of the grid combinations.
tfidf_cache = {}

# iterate through the parameter sets in the parameter grid
for params in tqdm(ParameterGrid(param_grid)):
  tfidf_key = (params['sublinear_tf'], params['max_features'])
  if tfidf_key not in tfidf_cache:
    # first time we see this vectorizer setting: fit on train, transform both sets
    vectorizer_lg_tfidf = TfidfVectorizer(sublinear_tf=params['sublinear_tf'], max_features=params['max_features'])
    vectorizer_lg_tfidf.fit(train_tweets)
    tfidf_cache[tfidf_key] = (vectorizer_lg_tfidf.transform(train_tweets),
                              vectorizer_lg_tfidf.transform(valid_tweets))
  vectorized_train_lg_tfidf, vectorized_valid_lg_tfidf = tfidf_cache[tfidf_key]

  # define our classifier with C and solver set to values in the current param. set
  clf = LogisticRegression(C=(10**params['C']), solver=params['solver']).fit(vectorized_train_lg_tfidf, train_labels)
  labels_predicted = clf.predict(vectorized_valid_lg_tfidf)

  # macro-averaged F1 score on the validation set
  f1 = f1_score(valid_labels, labels_predicted, average='macro')
  # only keep the score if it beats the current 5th best (the last entry)
  if f1 > maxes[4][1]:
    maxes[4] = [params, f1]
    # re-sort so the list stays ordered best-first and the last entry is
    # always the weakest of the kept results
    maxes.sort(reverse=True, key=lambda x: x[1])
  

100%|██████████| 112/112 [01:21<00:00,  1.38it/s]


In [None]:
maxes

[[{'C': 1, 'max_features': 945, 'solver': 'lbfgs', 'sublinear_tf': True},
  0.5172317039592175],
 [{'C': 1, 'max_features': 945, 'solver': 'lbfgs', 'sublinear_tf': False},
  0.5169657658259035],
 [{'C': 1, 'max_features': 1260, 'solver': 'lbfgs', 'sublinear_tf': True},
  0.5161177249359677],
 [{'C': 1, 'max_features': 2520, 'solver': 'lbfgs', 'sublinear_tf': True},
  0.5160558147233408],
 [{'C': 1, 'max_features': 945, 'solver': 'newton-cg', 'sublinear_tf': False},
  0.5160116171829892]]

Our best result is C=10^1, max_features=945, solver='lbfgs', sublinear_tf=True, with an F1-score of .517.

Now let's look at all of the scoring metrics for the LogisticRegression with these tuned parameters.

In [None]:
# build the final vectorizer with the tuned settings and fit it on the training
# tweets; fit() hands back the vectorizer itself, so both names refer to it
vectorizer_lg_tfidf = X_lg_tfidf = TfidfVectorizer(sublinear_tf=True, max_features=945).fit(train_tweets)
# vectorize training and validation tweets with the fitted vectorizer
vectorized_train_lg_tfidf, vectorized_valid_lg_tfidf = (
    vectorizer_lg_tfidf.transform(tweets) for tweets in (train_tweets, valid_tweets)
)
# final classifier with the tuned C (10^1) and solver, trained on the training matrix
clf = LogisticRegression(C=(10**1), solver='lbfgs')
clf.fit(vectorized_train_lg_tfidf, train_labels)
# predictions for the validation set
labels_predicted = clf.predict(vectorized_valid_lg_tfidf)

And let's look at the resulting scores.

In [None]:
# compute every validation metric first; precision, recall and F1 are macro-averaged
accuracy = accuracy_score(y_true=valid_labels, y_pred=labels_predicted)
precision = precision_score(y_true=valid_labels, y_pred=labels_predicted, average='macro')
recall = recall_score(y_true=valid_labels, y_pred=labels_predicted, average='macro')
f1 = f1_score(y_true=valid_labels, y_pred=labels_predicted, average='macro')

# then report them, one per line, rounded to three decimals
print(f"{accuracy=:.3f}")
print(f"{precision=:.3f}")
print(f"{recall=:.3f}")
print(f"{f1=:.3f}")

accuracy=0.754
precision=0.599
recall=0.475
f1=0.517
