In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

from collections import Counter
from helpers import resample_data, svd_reduce_and_combine, get_submission_ready
# https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html

In [2]:
X_train_full = pd.read_csv("data/components/tfidf/X_train_full.csv")
y_train_full = pd.read_csv("data/components/y_train.csv")
X_test_full = pd.read_csv("data/components/tfidf/X_test_full.csv")
y_test = pd.read_csv("data/components/y_test.csv")
submission = pd.read_csv("data/components/tfidf/submission_full.csv")

In [62]:
X_train_full

Unnamed: 0,ability,able,absolutely,across,act,acted,acting,action,actor,actors,...,york,you,young,younger,Helpful,Unhelpful,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,-0.5719,-0.8781,3.200000,3.781513
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,-0.8151,-0.8834,2.500000,2.916667
2,0.0,0.0,0.117048,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0,17,0.0000,0.6808,3.750000,1.772727
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.181319,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,2,0,0.8271,0.9081,3.750000,3.888889
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.2732,-0.1027,3.690909,2.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188263,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.6597,0.8957,3.346154,3.833333
1188264,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,11,1,0.4404,0.9685,3.461538,3.000000
1188265,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.2498,0.8481,4.165680,4.400000
1188266,0.0,0.0,0.000000,0.0,0.0,0.0,0.017685,0.018773,0.0,0.0,...,0.0,0.0,0.0,0.0,3,0,0.8360,0.9966,4.132075,4.692308


In [7]:
from sklearn.decomposition import TruncatedSVD

def svd_reduce_and_combine(X_train, end=500, components=250):
    """ 
        end is number of columns that is tf-idf (500 by default)
        components is the svd deconstruction lim (250 by default)
    """
    X_train_tfidf = X_train.iloc[:, :end]
    X_train_non_tfidf = X_train.iloc[:, end:]

    svd = TruncatedSVD(n_components=components, random_state=123)
    X_train_tfidf_reduced = svd.fit_transform(X_train_tfidf)

    X_train_reduced = np.hstack((X_train_tfidf_reduced, X_train_non_tfidf.values))

    columns = [f'SVD_{i+1}' for i in range(components)] + list(X_train_non_tfidf.columns)
    X_train_reduced = pd.DataFrame(X_train_reduced, columns=columns)

    return X_train_reduced

In [8]:
X_train_svd = svd_reduce_and_combine(X_train_full, end=1251, components=600)
X_train_svd.head(1)

Unnamed: 0,SVD_1,SVD_2,SVD_3,SVD_4,SVD_5,SVD_6,SVD_7,SVD_8,SVD_9,SVD_10,...,SVD_597,SVD_598,SVD_599,SVD_600,Helpful,Unhelpful,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore
0,0.211409,-0.112633,-0.090526,-0.051251,0.156629,0.011992,0.011334,-0.02971,-0.043539,0.035935,...,-0.014844,0.025081,-0.042618,0.010993,0.0,0.0,-0.5719,-0.8781,3.2,3.781513


In [21]:
X_test_svd = svd_reduce_and_combine(X_test_full, end=1251, components=600)
X_test_svd.head(1)

Unnamed: 0,SVD_1,SVD_2,SVD_3,SVD_4,SVD_5,SVD_6,SVD_7,SVD_8,SVD_9,SVD_10,...,SVD_597,SVD_598,SVD_599,SVD_600,Helpful,Unhelpful,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore
0,0.07639,-0.013696,0.023456,-0.003444,0.001449,0.024261,-0.039077,0.011237,-0.006785,-0.016443,...,0.002287,0.003329,-0.004944,-0.006252,0.0,0.0,0.0,0.7269,3.448276,5.0


In [29]:
submission_svd = svd_reduce_and_combine(submission, end=1251, components=600)

In [55]:
def feature_eng(df):
    df['UserAndProduct'] = (df['ProductAvgScore'] + df['UserAvgScore']) / 2
    df['OverallSentiment'] = (df['SummarySentiment'] + df['CleanedTextSentiment']) / 2
    df['Helpfulness'] = df['Helpful'] / (df['Unhelpful'] + df['Helpful'])
    df['Helpfulness'] = df['Helpfulness'].fillna(0)

    df = df.drop(columns=['ProductAvgScore', 'UserAvgScore', 'SummarySentiment', 'CleanedTextSentiment', 'Helpful', 'Unhelpful'], axis=1)
    return df

In [9]:
X_train_svd, y_train_full = resample_data(0.5, X_train_svd,y_train_full)
X_train_svd.shape

new training size is (594132, 607)


(594132, 606)

In [56]:
X_train_svd_eng = feature_eng(X_train_svd)
X_test_svd_eng = feature_eng(X_test_svd)

In [57]:
# confirm no NaNs left
nan_rows = X_test_svd_eng[X_test_svd_eng.isnull().any(axis=1)]
(nan_rows)

Unnamed: 0,SVD_1,SVD_2,SVD_3,SVD_4,SVD_5,SVD_6,SVD_7,SVD_8,SVD_9,SVD_10,...,SVD_594,SVD_595,SVD_596,SVD_597,SVD_598,SVD_599,SVD_600,UserAndProduct,OverallSentiment,Helpfulness


In [14]:
def cv_logistic_regression(x_train, y_train):
    # Range of C values to test, from 0.01 to 10
    C_values = np.logspace(-2, 1, 10)
    scores = []
    
    for C in C_values: 
        print("Testing C value:", C)
        clf = LogisticRegression(C=C, max_iter=1000, solver='lbfgs') 

        print("Performing cross val...")

        score = cross_val_score(clf, x_train, y_train, cv=2, scoring='accuracy')
        print(f"Mean score: {score.mean()}")

        scores.append(score.mean())
    
    plt.plot(C_values, scores, marker='o')
    plt.xscale('log')  # Log scale for better visualization
    plt.xlabel('C Value for Logistic Regression')
    plt.ylabel('Cross-Validated Accuracy')
    plt.title('Logistic Regression Performance by Regularization Strength')
    plt.show()
    
    # Find the best C based on the highest accuracy
    max_value = max(scores)
    best_C = C_values[scores.index(max_value)]
    
    print(f"Best accuracy score: {max_value:.4f} with C: {best_C}")
    return best_C, max_value

In [15]:
mv = cv_logistic_regression(X_train_svd, y_train_full)
mv

Testing C value: 0.01
Performing cross val...
Mean score: 0.654689530272734
Testing C value: 0.021544346900318832
Performing cross val...
Mean score: 0.6652040287343554
Testing C value: 0.046415888336127774
Performing cross val...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean score: 0.6714097203988338
Testing C value: 0.1
Performing cross val...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean score: 0.6747507287942747
Testing C value: 0.21544346900318834
Performing cross val...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean score: 0.6763210868965146
Testing C value: 0.46415888336127775
Performing cross val...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KeyboardInterrupt: 



In [53]:
import pickle
with open('log_reg_model.pkl', 'wb') as file:
    pickle.dump(clf, file)

In [66]:
clf_2 = LogisticRegression(C=0.1, 
                            max_iter=1000, 
                            solver='lbfgs',
                            n_jobs = -1,
                            random_state=123) 

clf_2.fit(X_train_full, y_train_full)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
y_pred = clf_2.predict(X_test_svd_eng)
y_pred

array([5., 5., 5., ..., 5., 5., 3.])

In [36]:
submission_pred = clf_2.predict(submission_svd)
submission_pred

array([4., 5., 4., ..., 5., 5., 5.])

In [37]:
get_submission_ready(submission_pred)

Saved as submission.csv success!


In [27]:
# EVALUATION FUNCTION
def evaluate(y_pred):
    accuracy = accuracy_score(y_test, y_pred) * 100

    print(confusion_matrix(y_test, y_pred))
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"prediction set: {Counter(y_pred)}")

In [61]:
evaluate(y_pred)

[[  7587   2189   2487   1831   4144]
 [  3007   2395   4524   3687   4323]
 [  2131   2051   8364  11104  11566]
 [  1485   1214   6588  20653  37105]
 [  1876    881   4252  16223 135400]]
Accuracy: 58.71%
prediction set: Counter({np.float64(5.0): 192538, np.float64(4.0): 53498, np.float64(3.0): 26215, np.float64(1.0): 16086, np.float64(2.0): 8730})
