## Logistic Regression

In [26]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.model_selection import GridSearchCV

### Data Prep for ML

In [27]:
train = pd.read_pickle("train.pkl").dropna()
test = pd.read_pickle("test.pkl").dropna()

In [28]:
#One hot encoding
encoded_train = pd.get_dummies(train, columns = ['weighted_parent_sentiment_score','weighted_comment_sentiment_score'], drop_first=True)
encoded_test = pd.get_dummies(test, columns = ['weighted_parent_sentiment_score','weighted_comment_sentiment_score'], drop_first=True)

In [29]:
train_parent_tdidf_csr = vstack(encoded_train["parent_comment_tdidf"])
test_parent_tdidf_csr = vstack(encoded_test["parent_comment_tdidf"])

train_tdidf_csr = vstack(encoded_train["comment_tdidf"])
test_tdidf_csr = vstack(encoded_test["comment_tdidf"])

train_parent_bow_csr = vstack(encoded_train["parent_comment_bow"])
test_parent_bow_csr = vstack(encoded_test["parent_comment_bow"])

train_bow_csr = vstack(encoded_train["comment_bow"])
test_bow_csr = vstack(encoded_test["comment_bow"])

In [30]:
encoded_train.columns

Index(['comment', 'author', 'subreddit', 'score', 'ups', 'downs', 'date',
       'created_utc', 'parent_comment', 'comment_tokens',
       'parent_comment_tokens', 'comment_score', 'parent_comment_score',
       'comment_word_count', 'parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count', 'comment_bow',
       'parent_comment_bow', 'comment_tdidf', 'paren

In [31]:
list_of_features = ['score','ups','downs',
       'comment_word_count','parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count',
       'weighted_parent_sentiment_score_neutral',
       'weighted_parent_sentiment_score_positive',
       'weighted_comment_sentiment_score_neutral',
       'weighted_comment_sentiment_score_positive']

In [32]:
X_train_gen_features = csr_matrix(encoded_train[list_of_features])
y_train_LR = encoded_train['label']

X_test_gen_features = csr_matrix(encoded_test[list_of_features])
y_test_LR = encoded_test['label']

### Baseline 1: General Features

In [33]:
#Data Specific Prep
X_train_LR = X_train_gen_features

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)


In [34]:
k = 5

model = LogisticRegression(max_iter = 500)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR): 
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index,], y_train_LR.iloc[val_index,]
    
    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val)
    
    # Calculate accuracy and store it in the list
    accuracy = accuracy_score(y_val, y_pred)
    cross_val_scores.append(accuracy)

mean_accuracy = sum(cross_val_scores) / k
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.549073778456603
Standard Deviation of Accuracy: 0.0011866425387839022


### Baseline 2: Comment BoW

In [35]:
#Data Specific Prep
X_train_LR = train_bow_csr

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)

In [36]:
k = 5

model = LogisticRegression(max_iter = 500)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR): 
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index,], y_train_LR.iloc[val_index,]
    
    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val)
    
    # Calculate accuracy and store it in the list
    accuracy = accuracy_score(y_val, y_pred)
    cross_val_scores.append(accuracy)

mean_accuracy = sum(cross_val_scores) / k
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6559869405583454
Standard Deviation of Accuracy: 0.002837919243242117


### Baseline 3: Comment TD-IDF

In [37]:
#Data Specific Prep
X_train_LR = train_tdidf_csr

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)

In [38]:
k = 5

model = LogisticRegression(max_iter = 500)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR): 
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index,], y_train_LR.iloc[val_index,]
    
    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val)
    
    # Calculate accuracy and store it in the list
    accuracy = accuracy_score(y_val, y_pred)
    cross_val_scores.append(accuracy)

mean_accuracy = sum(cross_val_scores) / k
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6562352661068654
Standard Deviation of Accuracy: 0.0017712936494375434


### Baseline 4: General Features + BoW

In [39]:
X_train_LR = hstack([X_train_gen_features,train_bow_csr])

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)

In [40]:
k = 5

model = LogisticRegression(max_iter = 500)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR): 
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index,], y_train_LR.iloc[val_index,]
    
    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val)
    
    # Calculate accuracy and store it in the list
    accuracy = accuracy_score(y_val, y_pred)
    cross_val_scores.append(accuracy)

mean_accuracy = sum(cross_val_scores) / k
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6607484256002715
Standard Deviation of Accuracy: 0.0023205634651901974


### Baseline 5: General Features + TD-IDF

In [41]:
X_train_LR = hstack([X_train_gen_features,train_tdidf_csr])

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)

In [42]:
k = 5

model = LogisticRegression(max_iter = 500)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR): 
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index,], y_train_LR.iloc[val_index,]
    
    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val)
    
    # Calculate accuracy and store it in the list
    accuracy = accuracy_score(y_val, y_pred)
    cross_val_scores.append(accuracy)

mean_accuracy = sum(cross_val_scores) / k
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6611209176806286
Standard Deviation of Accuracy: 0.0020015926105550713


### Baseline 6: Gen Features + Comment TDIDF + Parent TDIDF

In [43]:
X_train_LR = hstack([X_train_gen_features,train_tdidf_csr,train_parent_tdidf_csr])

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)

In [44]:
k = 5

model = LogisticRegression(max_iter = 500)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR): 
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index,], y_train_LR.iloc[val_index,]
    
    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val)
    
    # Calculate accuracy and store it in the list
    accuracy = accuracy_score(y_val, y_pred)
    cross_val_scores.append(accuracy)

mean_accuracy = sum(cross_val_scores) / k
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6544101625725147
Standard Deviation of Accuracy: 0.0036873570540180743


### Baseline 7: Gen + Parent TDIDF

In [45]:
X_train_LR = hstack([X_train_gen_features,train_parent_tdidf_csr])

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)

In [46]:
k = 5

model = LogisticRegression(max_iter = 500)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR): 
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index,], y_train_LR.iloc[val_index,]
    
    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val)
    
    # Calculate accuracy and store it in the list
    accuracy = accuracy_score(y_val, y_pred)
    cross_val_scores.append(accuracy)

mean_accuracy = sum(cross_val_scores) / k
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.568635008845028
Standard Deviation of Accuracy: 0.0013539634505594297


### Hyper Parameter Tuning with Grid Search CV

In [47]:
### Using gen features and tdidf
X_train_LR = hstack([X_train_gen_features,train_tdidf_csr])

scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_LR)

In [48]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [1000] #allow for convergence for all solvers
}

In [49]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', verbose=4)
grid_search.fit(X_train_LR, y_train_LR)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l1, solver=liblinear;, score=0.527 total time=   0.2s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l1, solver=liblinear;, score=0.522 total time=   0.1s
[CV 3/5] END C=0.001, max_iter=1000, penalty=l1, solver=liblinear;, score=0.523 total time=   0.1s
[CV 4/5] END C=0.001, max_iter=1000, penalty=l1, solver=liblinear;, score=0.526 total time=   0.1s
[CV 5/5] END C=0.001, max_iter=1000, penalty=l1, solver=liblinear;, score=0.521 total time=   0.1s
[CV 1/5] END C=0.001, max_iter=1000, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END C=0.001, max_iter=1000, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END C=0.001, max_iter=1000, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END C=0.001, max_iter=1000, penalty=l1, solver=lbfgs;, score=n

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=100, max_iter=1000, penalty=l2, solver=lbfgs;, score=0.642 total time=  24.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=100, max_iter=1000, penalty=l2, solver=lbfgs;, score=0.642 total time=  25.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=100, max_iter=1000, penalty=l2, solver=lbfgs;, score=0.643 total time=  25.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=100, max_iter=1000, penalty=l2, solver=lbfgs;, score=0.647 total time=  24.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dxcas\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dxcas\anaconda3\Lib\sit

[CV 5/5] END C=100, max_iter=1000, penalty=l2, solver=lbfgs;, score=0.647 total time=  24.6s


In [50]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Best score achieved during grid search
best_score = grid_search.best_score_
print("Best Score:", best_score)

Best Parameters: {'C': 1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.6653795446863355


In [51]:
# Best estimator (the fitted model with the best parameters)
best_estimator = grid_search.best_estimator_
print("Best Estimator:", best_estimator)

# Results for all parameter combinations
cv_results = grid_search.cv_results_

Best Estimator: LogisticRegression(C=1, max_iter=1000, penalty='l1', solver='liblinear')


In [52]:
mean_test_scores = cv_results['mean_test_score']
std_test_scores = cv_results['std_test_score']
params = cv_results['params']

pd.set_option('display.max_colwidth',None)
results_df = pd.DataFrame({'Params':params,'Mean Score':mean_test_scores,'STD':std_test_scores})
results_df["Params"] = results_df["Params"].apply(lambda x: ', '.join([f'{key}: {value}' for key, value in x.items()]))
results_df.sort_values('STD', inplace=True)
results_df.sort_values('Mean Score', ascending=False,inplace=True)
results_df


#for mean_score, std_score, param in zip(mean_test_scores, std_test_scores, params):
    #print(f"Mean Score: {mean_score:.3f} (±{std_score:.3f}) for params: {param}")

Unnamed: 0,Params,Mean Score,STD
12,"C: 1, max_iter: 1000, penalty: l1, solver: liblinear",0.66538,0.002526
15,"C: 1, max_iter: 1000, penalty: l2, solver: lbfgs",0.662114,0.001566
14,"C: 1, max_iter: 1000, penalty: l2, solver: liblinear",0.662102,0.001593
10,"C: 0.1, max_iter: 1000, penalty: l2, solver: liblinear",0.661363,0.002492
11,"C: 0.1, max_iter: 1000, penalty: l2, solver: lbfgs",0.661344,0.002461
18,"C: 10, max_iter: 1000, penalty: l2, solver: liblinear",0.649171,0.001682
19,"C: 10, max_iter: 1000, penalty: l2, solver: lbfgs",0.649121,0.001673
16,"C: 10, max_iter: 1000, penalty: l1, solver: liblinear",0.647594,0.002023
20,"C: 100, max_iter: 1000, penalty: l1, solver: liblinear",0.644111,0.001985
23,"C: 100, max_iter: 1000, penalty: l2, solver: lbfgs",0.643944,0.00212
