## Logistic Regression

In [188]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
from scipy.sparse import csr_matrix, vstack, hstack

### Data Prep for ML

In [189]:
# Load the pre-computed train/test frames and drop every row that has a
# missing value in any column.
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
train, test = (
    pd.read_pickle(pickle_path).dropna()
    for pickle_path in ("train.pkl", "test.pkl")
)

In [190]:
# One-hot encode the two categorical sentiment columns.
#
# Train and test are encoded against the SAME category list (taken from the
# training split). Encoding each frame independently lets the dummy columns
# diverge: a level missing from test removes its column, and drop_first then
# drops a different level than it did for train.
# Values that appear only in test become NaN under the categorical dtype and
# are therefore encoded as all-zero dummies.
sentiment_columns = ['weighted_parent_sentiment_score', 'weighted_comment_sentiment_score']
sentiment_dtypes = {
    # sorted() reproduces the level order get_dummies derives on its own, so
    # the training dummies (and which level drop_first removes) are unchanged.
    column: pd.CategoricalDtype(sorted(train[column].unique()))
    for column in sentiment_columns
}

encoded_train = pd.get_dummies(train.astype(sentiment_dtypes), columns=sentiment_columns, drop_first=True)
encoded_test = pd.get_dummies(test.astype(sentiment_dtypes), columns=sentiment_columns, drop_first=True)

In [191]:
# Stack the per-row sparse vectors stored in each text-feature column into one
# sparse matrix per column and per split.
# NOTE(review): assumes every cell of these columns holds a single sparse row
# vector with a shared vocabulary width — TODO confirm against the pickles.
_splits = (encoded_train, encoded_test)

# TF-IDF vectors (the source columns are spelled "tdidf").
train_parent_tdidf_csr, test_parent_tdidf_csr = (
    vstack(frame["parent_comment_tdidf"]) for frame in _splits
)
train_tdidf_csr, test_tdidf_csr = (
    vstack(frame["comment_tdidf"]) for frame in _splits
)

# Bag-of-words vectors.
train_parent_bow_csr, test_parent_bow_csr = (
    vstack(frame["parent_comment_bow"]) for frame in _splits
)
train_bow_csr, test_bow_csr = (
    vstack(frame["comment_bow"]) for frame in _splits
)

In [192]:
# Display the column index of the encoded training frame, e.g. to check the
# dummy column names that get_dummies produced.
encoded_train.columns

Index(['comment', 'author', 'subreddit', 'score', 'ups', 'downs', 'date',
       'created_utc', 'parent_comment', 'comment_tokens',
       'parent_comment_tokens', 'comment_score', 'parent_comment_score',
       'comment_word_count', 'parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count', 'comment_bow',
       'parent_comment_bow', 'comment_tdidf', 'paren

In [193]:
# Names of the non-text ("general") feature columns, in the order they are
# laid out in the feature matrix.
_vote_columns = ['score', 'ups', 'downs']

# Each statistic exists once for the comment and once for its parent comment;
# the comment column always precedes the parent column.
_paired_statistics = [
    'word_count',
    'token_count',
    'unique_word_count',
    'unique_token_count',
    'stopword_count',
    'mean_word_length',
    'mean_token_length',
    'char_count',
    'punctuation_count',
    'hashtag_count',
    'number_count',
]

# Dummy columns created by one-hot encoding the sentiment scores.
_sentiment_dummies = [
    'weighted_parent_sentiment_score_neutral',
    'weighted_parent_sentiment_score_positive',
    'weighted_comment_sentiment_score_neutral',
    'weighted_comment_sentiment_score_positive',
]

list_of_features = (
    _vote_columns
    + [
        f'{prefix}{statistic}'
        for statistic in _paired_statistics
        for prefix in ('comment_', 'parent_comment_')
    ]
    + _sentiment_dummies
)

In [194]:
# Build the general-feature block as a sparse matrix and pull out the labels,
# for both splits.
#
# The explicit float64 cast matters: on pandas >= 2.0 get_dummies emits bool
# columns, and selecting them together with numeric columns yields an
# object-dtype array, which scipy.sparse cannot store. Casting first gives a
# plain numeric matrix on every pandas version.
X_train_gen_features = csr_matrix(encoded_train[list_of_features].astype(np.float64))
y_train_LR = encoded_train['label']

X_test_gen_features = csr_matrix(encoded_test[list_of_features].astype(np.float64))
y_test_LR = encoded_test['label']

### Baseline 1: General Features

In [195]:
# Baseline-specific design matrix: general (non-text) features only, with
# each column scaled by its maximum absolute value.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split in the next cell, so validation folds contribute to the scaling.
scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(X_train_gen_features)


In [196]:
# k-fold cross-validated accuracy of logistic regression on the current
# X_train_LR / y_train_LR.
k = 5

# The saga solver shuffles the data internally, so the estimator needs its
# own seed; without it the fold scores change from run to run even though
# the fold assignment is seeded.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR):
    # Rows of the sparse matrix are selected positionally; the labels use
    # .iloc so they line up with those positions regardless of the index.
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index], y_train_LR.iloc[val_index]

    # Train on the k-1 training folds.
    model.fit(X_train, y_train)

    # Score the held-out fold.
    y_pred = model.predict(X_val)
    cross_val_scores.append(accuracy_score(y_val, y_pred))

mean_accuracy = np.mean(cross_val_scores)
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.5491668979599857
Standard Deviation of Accuracy: 0.0012069456637832066


### Baseline 2: Comment BoW

In [197]:
# Baseline-specific design matrix: comment bag-of-words only, with each
# column scaled by its maximum absolute value.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split in the next cell, so validation folds contribute to the scaling.
scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(train_bow_csr)

In [198]:
# k-fold cross-validated accuracy of logistic regression on the current
# X_train_LR / y_train_LR.
k = 5

# The saga solver shuffles the data internally, so the estimator needs its
# own seed; without it the fold scores change from run to run even though
# the fold assignment is seeded.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR):
    # Rows of the sparse matrix are selected positionally; the labels use
    # .iloc so they line up with those positions regardless of the index.
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index], y_train_LR.iloc[val_index]

    # Train on the k-1 training folds.
    model.fit(X_train, y_train)

    # Score the held-out fold.
    y_pred = model.predict(X_val)
    cross_val_scores.append(accuracy_score(y_val, y_pred))

mean_accuracy = np.mean(cross_val_scores)
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6559993561709694
Standard Deviation of Accuracy: 0.002844977393892126


### Baseline 3: Comment TF-IDF

In [199]:
# Baseline-specific design matrix: comment TF-IDF only, with each column
# scaled by its maximum absolute value.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split in the next cell, so validation folds contribute to the scaling.
scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(train_tdidf_csr)

In [200]:
# k-fold cross-validated accuracy of logistic regression on the current
# X_train_LR / y_train_LR.
k = 5

# The saga solver shuffles the data internally, so the estimator needs its
# own seed; without it the fold scores change from run to run even though
# the fold assignment is seeded.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR):
    # Rows of the sparse matrix are selected positionally; the labels use
    # .iloc so they line up with those positions regardless of the index.
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index], y_train_LR.iloc[val_index]

    # Train on the k-1 training folds.
    model.fit(X_train, y_train)

    # Score the held-out fold.
    y_pred = model.predict(X_val)
    cross_val_scores.append(accuracy_score(y_val, y_pred))

mean_accuracy = np.mean(cross_val_scores)
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6562290582042054
Standard Deviation of Accuracy: 0.0017472335992932341


### Baseline 4: General Features + BoW

In [201]:
# Design matrix for this baseline: general features side by side with the
# comment bag-of-words, each column scaled by its maximum absolute value.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split in the next cell, so validation folds contribute to the scaling.
scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(
    hstack([X_train_gen_features, train_bow_csr])
)

In [202]:
# k-fold cross-validated accuracy of logistic regression on the current
# X_train_LR / y_train_LR.
k = 5

# The saga solver shuffles the data internally, so the estimator needs its
# own seed; without it the fold scores change from run to run even though
# the fold assignment is seeded.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR):
    # Rows of the sparse matrix are selected positionally; the labels use
    # .iloc so they line up with those positions regardless of the index.
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index], y_train_LR.iloc[val_index]

    # Train on the k-1 training folds.
    model.fit(X_train, y_train)

    # Score the held-out fold.
    y_pred = model.predict(X_val)
    cross_val_scores.append(accuracy_score(y_val, y_pred))

mean_accuracy = np.mean(cross_val_scores)
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6607794658843569
Standard Deviation of Accuracy: 0.002248445257500324


### Baseline 5: General Features + TF-IDF

In [203]:
# Design matrix for this baseline: general features side by side with the
# comment TF-IDF, each column scaled by its maximum absolute value.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split in the next cell, so validation folds contribute to the scaling.
scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(
    hstack([X_train_gen_features, train_tdidf_csr])
)

In [204]:
# k-fold cross-validated accuracy of logistic regression on the current
# X_train_LR / y_train_LR.
k = 5

# The saga solver shuffles the data internally, so the estimator needs its
# own seed; without it the fold scores change from run to run even though
# the fold assignment is seeded.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR):
    # Rows of the sparse matrix are selected positionally; the labels use
    # .iloc so they line up with those positions regardless of the index.
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index], y_train_LR.iloc[val_index]

    # Train on the k-1 training folds.
    model.fit(X_train, y_train)

    # Score the held-out fold.
    y_pred = model.predict(X_val)
    cross_val_scores.append(accuracy_score(y_val, y_pred))

mean_accuracy = np.mean(cross_val_scores)
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.6610712542666517
Standard Deviation of Accuracy: 0.0020377135405951733


### Baseline 6: General Features + Comment TF-IDF + Parent TF-IDF

In [206]:
# Design matrix for this baseline: general features, comment TF-IDF and
# parent-comment TF-IDF side by side, each column scaled by its maximum
# absolute value.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split in the next cell, so validation folds contribute to the scaling.
scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(
    hstack([X_train_gen_features, train_tdidf_csr, train_parent_tdidf_csr])
)

In [207]:
# k-fold cross-validated accuracy of logistic regression on the current
# X_train_LR / y_train_LR.
k = 5

# The saga solver shuffles the data internally, so the estimator needs its
# own seed; without it the fold scores change from run to run even though
# the fold assignment is seeded.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR):
    # Rows of the sparse matrix are selected positionally; the labels use
    # .iloc so they line up with those positions regardless of the index.
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index], y_train_LR.iloc[val_index]

    # Train on the k-1 training folds.
    model.fit(X_train, y_train)

    # Score the held-out fold.
    y_pred = model.predict(X_val)
    cross_val_scores.append(accuracy_score(y_val, y_pred))

mean_accuracy = np.mean(cross_val_scores)
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.654434994761244
Standard Deviation of Accuracy: 0.003712193442411324


### Baseline 7: General Features + Parent TF-IDF

In [208]:
# Design matrix for this baseline: general features side by side with the
# parent-comment TF-IDF, each column scaled by its maximum absolute value.
# NOTE(review): the scaler is fitted on all training rows before the K-fold
# split in the next cell, so validation folds contribute to the scaling.
scaler = MaxAbsScaler()
X_train_LR = scaler.fit_transform(
    hstack([X_train_gen_features, train_parent_tdidf_csr])
)

In [209]:
# k-fold cross-validated accuracy of logistic regression on the current
# X_train_LR / y_train_LR.
k = 5

# The saga solver shuffles the data internally, so the estimator needs its
# own seed; without it the fold scores change from run to run even though
# the fold assignment is seeded.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

cross_val_scores = []
for train_index, val_index in kf.split(X_train_LR):
    # Rows of the sparse matrix are selected positionally; the labels use
    # .iloc so they line up with those positions regardless of the index.
    X_train, X_val = X_train_LR[train_index], X_train_LR[val_index]
    y_train, y_val = y_train_LR.iloc[train_index], y_train_LR.iloc[val_index]

    # Train on the k-1 training folds.
    model.fit(X_train, y_train)

    # Score the held-out fold.
    y_pred = model.predict(X_val)
    cross_val_scores.append(accuracy_score(y_val, y_pred))

mean_accuracy = np.mean(cross_val_scores)
std_accuracy = np.std(cross_val_scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")

Mean Accuracy: 0.5686163857151365
Standard Deviation of Accuracy: 0.001434463462199886
