In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import pandas as pd

In [14]:
train_data = pd.read_csv("data/baseline_processed_train_data.csv")
pretest_data = pd.read_csv("data/baseline_processed_pretest_data.csv")
test_data = pd.read_csv("data/baseline_processed_test_data.csv")

print(test_data.head())

sample_size = 10000

train_data = train_data.sample(n=sample_size, random_state=42)
pretest_data = pretest_data.sample(n=sample_size, random_state=42)
sampltest_dataed_test_data = test_data.sample(n=sample_size, random_state=42)

   Unnamed: 0                                             tokens  label
0           0  phones modern humans today are always on their...      0
1           1  this essay will explain if drivers should or s...      0
2           2  driving while the use of cellular devices toda...      0
3           3  phones & driving drivers should not be able to...      0
4           4  cell phone operation while driving the ability...      0


In [15]:
def get_text(row):
    return row['tokens']

X_train = train_data.apply(get_text, axis=1)
y_train = train_data['label']
X_pretest = pretest_data.apply(get_text, axis=1)
y_pretest = pretest_data['label']
X_test = test_data.apply(get_text, axis=1)
y_test = test_data['label']

In [16]:
# Standard logistic regression

standard_lr_model = LogisticRegression()

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('lr', standard_lr_model),      
])

pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_pretest_pred = pipeline.predict(X_pretest)
y_test_pred = pipeline.predict(X_test)
train_f1_score = f1_score(y_train, y_train_pred)
pretest_f1_score = f1_score(y_pretest, y_pretest_pred)
test_f1_score = f1_score(y_test, y_test_pred)

print("F1 score on training data:", train_f1_score)
print("F1 score on pretest data:", pretest_f1_score)
print("F1 score on test data:", test_f1_score)

F1 score on training data: 0.8957587845368686
F1 score on pretest data: 0.879761626548259
F1 score on test data: 0.8178383885884474


In [17]:
# Regularized logistic regression (lasso)

lasso_lr_model = LogisticRegression(penalty='l1', solver='liblinear')

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('lr', lasso_lr_model),      
])

pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_pretest_pred = pipeline.predict(X_pretest)
y_test_pred = pipeline.predict(X_test)
train_f1_score = f1_score(y_train, y_train_pred)
pretest_f1_score = f1_score(y_pretest, y_pretest_pred)
test_f1_score = f1_score(y_test, y_test_pred)

print("F1 score on training data:", train_f1_score)
print("F1 score on pretest data:", pretest_f1_score)
print("F1 score on test data:", test_f1_score)

F1 score on training data: 0.8855674366396881
F1 score on pretest data: 0.8744230086400757
F1 score on test data: 0.8226466360204098


In [18]:
# Regularized logistic regression (ridge)

ridge_lr_model = LogisticRegression(penalty='l2')

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('lr', ridge_lr_model),      
])

pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_pretest_pred = pipeline.predict(X_pretest)
y_test_pred = pipeline.predict(X_test)
train_f1_score = f1_score(y_train, y_train_pred)
pretest_f1_score = f1_score(y_pretest, y_pretest_pred)
test_f1_score = f1_score(y_test, y_test_pred)

print("F1 score on training data:", train_f1_score)
print("F1 score on pretest data:", pretest_f1_score)
print("F1 score on test data:", test_f1_score)

F1 score on training data: 0.8957587845368686
F1 score on pretest data: 0.879761626548259
F1 score on test data: 0.8178383885884474


In [19]:
# Regularized logistic regression (elastic net)

elastic_net_lr_model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  
    ('lr', elastic_net_lr_model),      
])

pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_pretest_pred = pipeline.predict(X_pretest)
y_test_pred = pipeline.predict(X_test)
train_f1_score = f1_score(y_train, y_train_pred)
pretest_f1_score = f1_score(y_pretest, y_pretest_pred)
test_f1_score = f1_score(y_test, y_test_pred)

print("F1 score on training data:", train_f1_score)
print("F1 score on pretest data:", pretest_f1_score)
print("F1 score on test data:", test_f1_score)

F1 score on training data: 0.8843505659936982
F1 score on pretest data: 0.8771909324608553
F1 score on test data: 0.8149814840856888
