### Train Logistic Regression

- Joel Stremmel
- 04-09-23

##### About

Train an L1-regularized logistic regression on TF-IDF n-gram features of the formatted data using K-fold cross-validation, and save the scores.

##### Imports

In [22]:
import os
import re
import glob
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

##### Set Parameters

In [23]:
# Maximum sequence length setting. It is not referenced by the cells shown in this
# section. NOTE(review): the previous comment mentioned a distribution plot and a
# sentiment model, neither of which appears here -- confirm it is still needed.
dist_max_seq_len = 32768

# Inverse regularization strength passed to LogisticRegression (smaller = stronger penalty).
C = 0.1
# Seed passed to LogisticRegression as random_state.
seed = 42

##### Load Formatted Data

In [24]:
# Load the pre-split cross-validation folds. Both objects are dict-like (their
# .values() are consumed below): one entry per fold, X holding the inputs and y the
# binary labels. Presumably written by a separate data-formatting step -- verify.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data/X_folds.pkl', 'rb') as f:
    X_folds = pickle.load(f)

with open('data/y_folds.pkl', 'rb') as f:
    y_folds = pickle.load(f)

In [25]:
# Number of cross-validation folds.
len(X_folds)

10

In [26]:
# Show the labels in each fold to eyeball fold sizes and class balance.
y_folds.values()

dict_values([[1, 0, 0, 1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 1, 1, 1], [1, 1, 0, 1, 1, 1, 0], [1, 0, 0, 0, 1, 1, 0, 0], [1, 1, 1, 0, 1, 1, 0], [1, 1, 1, 0, 0, 1, 1, 1], [1, 1, 0, 0, 0, 1, 0], [0, 0, 0], [1, 1, 0, 0, 1, 1], [1, 1, 1, 1, 0, 1, 1]])

In [36]:

# K-fold cross-validation: each fold is held out once while a fresh TF-IDF
# vectorizer and an L1-penalized logistic regression are fit on the remaining folds.
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."
X = list(X_folds.values())
y = list(y_folds.values())

# Folds are paired by position, so every X fold must carry exactly one label per document.
assert all(
    len(fold_docs) == len(fold_labels) for fold_docs, fold_labels in zip(X, y)
), "Expected the same number of documents and labels in every fold."

# Dedicated, seeded generator so the per-fold shuffles (and therefore the order-sensitive
# saga fits) are reproducible on Restart & Run All instead of depending on global RNG state.
rng = np.random.default_rng(seed)

for i in range(len(X)):

    # Re-created for every fold so the vocabulary and IDF weights are learned from
    # the training folds only.
    tfidf_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents="unicode",
        analyzer="word",
        token_pattern=r"\w{1,}",
        stop_words="english",
        ngram_range=(2, 3),  # bigrams and trigrams only, no unigrams
        norm="l2",
        min_df=0.0001,  # floats are interpreted as a proportion of documents
        max_df=0.5,  # drop n-grams present in more than half of the training documents
        smooth_idf=False
    )

    # Training set = every fold except fold i.
    X_train, y_train = np.concatenate(X[0:i] + X[i+1:]), np.concatenate(y[0:i] + y[i+1:])

    # Shuffle documents and labels with the same permutation to keep them aligned.
    indices = rng.permutation(len(y_train))
    X_train, y_train = X_train[indices], y_train[indices]

    # Fold i is the held-out set.
    X_test, y_test = X[i], y[i]

    # Fit the vectorizer on the training documents only, then reuse it for the held-out fold.
    # Note: test_features and y_test are prepared here but not consumed within this cell.
    train_features = tfidf_vectorizer.fit_transform(X_train)
    test_features = tfidf_vectorizer.transform(X_test)

    lr = LogisticRegression(
        solver="saga",
        fit_intercept=True,
        max_iter=5000,
        penalty="l1",
        C=C,
        class_weight=None,
        random_state=seed
    )

    lr.fit(train_features, y_train)

    # Sanity check: the number of coefficients should equal the fold's vocabulary size.
    print(len(lr.coef_[0]))
    print(len(tfidf_vectorizer.get_feature_names_out()))

22311
22311
22092
22092
23734
23734
21764
21764
23155
23155
22533
22533
24001
24001
24831
24831
24276
24276
22476
22476
