In [24]:
import numpy as np
import pandas as pd
from collections import Counter
import requests
import os
import timeit

In [25]:
class LogisticRegression:
    def __init__(self, lr = 0.001, n_iters = 1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.random.uniform(-1, 1, n_features)
        self.bias = np.random.uniform(-1, 1, 1)[0]
        
        #gradient descent
        for i in range(self.n_iters):
            lin_model = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(lin_model)
            
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
            
        
    def predict(self, X, threshold):
        lin_model = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(lin_model)
        y_pred_cls = [1 if i > threshold else -1 for i in y_pred]
        return pd.DataFrame({'Probability': y_pred, 'Class': y_pred_cls})
    
    #helper sigmoid function
    def _sigmoid(self, linear):
        return 1 / (1 + np.exp(-linear))

In [26]:
def test_metrics(actual, pred):
    actual = pd.Series(actual)
    pred = pd.Series(pred)
    
    TP = ((actual == 1) & (pred == 1)).sum()
    FP = ((actual != 1) & (pred == 1)).sum()
    TN = ((actual != 1) & (pred != 1)).sum()
    FN = ((actual == 1) & (pred != 1)).sum()
    
    accuracy = (actual == pred).mean()
    precision = TP / (pred == 1).sum()
    recall = TP / (actual == 1).sum()
    f1 = (2 * precision * recall) / (precision + recall)   
    
    
    print({"accuracy":accuracy,"precision":precision,"recall":recall, "F1 Score":f1})
    confusion_mat = pd.DataFrame({'Actually Positive': [TP, FN], 
                                  'Actually Negative': [FP, TN]},
                                index = ['Predicted Positive', 'Predicted Negative'])    
    
    return confusion_mat

## Feature Set 1 - Positive and Negative Counts

In [27]:
train_df = pd.read_csv('train1.csv')
test_df = pd.read_csv('test1.csv')

In [28]:
X_train = train_df.drop(['Unnamed: 0', 'Label'], axis = 1)
X_test = test_df.drop(['Unnamed: 0', 'Label'], axis = 1)
y_train = train_df.Label
y_test = test_df.Label

In [31]:
start_time = timeit.default_timer()

lr = LogisticRegression(lr = 0.001, n_iters = 1000)
lr.fit(X_train, y_train)
pred = lr.predict(X_test, threshold = 0.5)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

test_metrics(y_test, pred['Class'])

Time (minutes) elapsed for this cell: 0.027248027916357387
{'accuracy': 0.5912, 'precision': 0.9247391952309985, 'recall': 0.19856, 'F1 Score': 0.32692307692307687}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,2482,202
Predicted Negative,10018,12298


## Adding Interactions

In [32]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()

In [33]:
X_train2["Interaction_posc_negc"] = X_train2.Positive_counts*X_train2.Negative_counts
X_test2["Interaction_posc_negc"] = X_test2.Positive_counts*X_test2.Negative_counts
X_train2 = X_train2[["Positive_counts","Negative_counts","Interaction_posc_negc"]]
X_test2 = X_test2[["Positive_counts","Negative_counts","Interaction_posc_negc"]]

In [34]:
start_time = timeit.default_timer()

lr2 = LogisticRegression(lr = 0.001, n_iters = 1000)
lr2.fit(X_train2, y_train)
pred2 = lr2.predict(X_test2, threshold = 0.5)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

test_metrics(y_test, pred2['Class'])

Time (minutes) elapsed for this cell: 0.028207414533244445
{'accuracy': 0.61684, 'precision': 0.8929244013989777, 'recall': 0.26552, 'F1 Score': 0.40932354936178084}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,3319,398
Predicted Negative,9181,12102


## Squared Terms

In [35]:
X_train3 = X_train2.copy()
X_test3 = X_test2.copy()

In [36]:
X_train3["Positive_counts2"] = X_train3.Positive_counts**2
X_train3["Negative_counts2"] = X_train3.Negative_counts**2
X_test3["Positive_counts2"] = X_test3.Positive_counts**2
X_test3["Negative_counts2"] = X_test3.Negative_counts**2

In [37]:
start_time = timeit.default_timer()

lr3 = LogisticRegression(lr = 0.001, n_iters = 1000)
lr3.fit(X_train3, y_train)
pred3 = lr3.predict(X_test3, threshold = 0.5)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

test_metrics(y_test, pred3['Class'])



Time (minutes) elapsed for this cell: 0.029927547866342743
{'accuracy': 0.66136, 'precision': 0.870635795663359, 'recall': 0.37904, 'F1 Score': 0.5281462490246349}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,4738,704
Predicted Negative,7762,11796


## Cubed Terms

In [38]:
X_train4 = X_train3.copy()
X_test4 = X_test3.copy()

In [39]:
X_train4["Positive_counts3"] = X_train3.Positive_counts**3
X_train4["Negative_counts3"] = X_train3.Negative_counts**3
X_test4["Positive_counts3"] = X_test3.Positive_counts**3
X_test4["Negative_counts3"] = X_test3.Negative_counts**3

In [40]:
start_time = timeit.default_timer()

lr4 = LogisticRegression(lr = 0.001, n_iters = 1000)
lr4.fit(X_train4, y_train)
pred4 = lr4.predict(X_test4, threshold = 0.5)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

test_metrics(y_test, pred4['Class'])



Time (minutes) elapsed for this cell: 0.03181885458325269
{'accuracy': 0.696, 'precision': 0.8333333333333334, 'recall': 0.49, 'F1 Score': 0.6171284634760705}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,6125,1225
Predicted Negative,6375,11275


## Probability Threshold Tuning

In [42]:
probs = [x * 0.1 for x in range(0, 10) if x != 0]
probs_df = pd.DataFrame({'Probability Threshold': [], 'Accuracy': []})
for i in range(len(probs)):
    pred_prob = lr4.predict(X_test4, threshold = probs[i])
    probs_df = pd.concat([probs_df,
                         pd.DataFrame({'Probability Threshold': [probs[i]], 'Accuracy': [(pred_prob['Class'] == y_test_stan).mean()]})])
    
probs_df.reset_index(inplace = True)
probs_df



Unnamed: 0,Probability Threshold,Accuracy
0,0.1,0.696
0,0.2,0.696
0,0.3,0.696
0,0.4,0.696
0,0.5,0.696
0,0.6,0.696
0,0.7,0.696
0,0.8,0.696
0,0.9,0.696


In [51]:
highest_acc_index = probs_df.idxmax(axis = 0)['Accuracy']
highest_acc_thresh = list(probs_df['Probability Threshold'])[highest_acc_index]
test_metrics(y_test, lr4.predict(X_test4, threshold = highest_acc_thresh)['Class'])

{'accuracy': 0.696, 'precision': 0.8328804347826086, 'recall': 0.4904, 'F1 Score': 0.6173212487411883}




Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,6130,1230
Predicted Negative,6370,11270


## Adjectives

In [52]:
train2 = pd.read_csv('train2.csv')
test2 = pd.read_csv('test2.csv')

In [53]:
train2 = train2.drop("Unnamed: 0",axis=1)
test2 = test2.drop("Unnamed: 0",axis=1)

In [54]:
X_train_adj = train2.drop('Label', axis = 1)
X_test_adj = test2.drop('Label', axis = 1)
y_train_adj = train2.Label
y_test_adj = test2.Label

In [56]:
start_time = timeit.default_timer()

lr_adj = LogisticRegression(lr = 0.001, n_iters = 1000)
lr_adj.fit(X_train_adj, y_train_adj)
pred_adj = lr_adj.predict(X_test_adj, threshold = 0.5)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

test_metrics(y_test_adj, pred_adj['Class'])

Time (minutes) elapsed for this cell: 10.78126423395006
{'accuracy': 0.51004, 'precision': 0.5147076057658502, 'recall': 0.35136, 'F1 Score': 0.41762943945228925}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,4392,4141
Predicted Negative,8108,8359


## Probability Threshold Tuning

In [57]:
probs = [x * 0.1 for x in range(0, 10) if x != 0]
probs_df = pd.DataFrame({'Probability Threshold': [], 'Accuracy': []})
for i in range(len(probs)):
    pred_prob = lr_adj.predict(X_test_adj, threshold = probs[i])
    probs_df = pd.concat([probs_df,
                         pd.DataFrame({'Probability Threshold': [probs[i]], 'Accuracy': [(pred_prob['Class'] == y_test_stan).mean()]})])
    
probs_df.reset_index(inplace = True)
probs_df

Unnamed: 0,Probability Threshold,Accuracy
0,0.1,0.50876
0,0.2,0.50896
0,0.3,0.51072
0,0.4,0.5112
0,0.5,0.51004
0,0.6,0.50844
0,0.7,0.506
0,0.8,0.5062
0,0.9,0.50548


In [64]:
highest_acc_index = probs_df.idxmax(axis = 0)['Accuracy']
highest_acc_thresh = list(probs_df['Probability Threshold'])[highest_acc_index]
test_metrics(y_test, lr_adj.predict(X_test_adj, threshold = highest_acc_thresh)['Class'])

{'accuracy': 0.5112, 'precision': 0.514297385620915, 'recall': 0.40288, 'F1 Score': 0.4518212811771039}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,5036,4756
Predicted Negative,7464,7744


## Stanford

In [65]:
stanford_train = pd.read_csv('standford_train.csv')
stanford_test = pd.read_csv('stanford_test.csv')

In [66]:
stanford_train = stanford_train.drop("Unnamed: 0",axis=1)
stanford_test = stanford_test.drop("Unnamed: 0",axis=1)

In [67]:
X_train_stan = stanford_train.drop('Label', axis = 1)
X_test_stan = stanford_test.drop('Label', axis = 1)
y_train_stan = stanford_train.Label
y_test_stan = stanford_test.Label

In [68]:
start_time = timeit.default_timer()

lr_stan = LogisticRegression(lr = 0.001, n_iters = 1000)
lr_stan.fit(X_train_stan, y_train_stan)
pred_stan = lr_stan.predict(X_test_stan, threshold = 0.5)

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

test_metrics(y_test_stan, pred_stan['Class'])

Time (minutes) elapsed for this cell: 0.8860726735331507
{'accuracy': 0.48968, 'precision': 0.48788277287244036, 'recall': 0.41552, 'F1 Score': 0.4488032489415018}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,5194,5452
Predicted Negative,7306,7048


## Probability Threshold Tuning

In [69]:
probs_df = pd.DataFrame({'Probability Threshold': [], 'Accuracy': []})
for i in range(len(probs)):
    pred_prob = lr_stan.predict(X_test_stan, threshold = probs[i])
    probs_df = pd.concat([probs_df,
                         pd.DataFrame({'Probability Threshold': [probs[i]], 'Accuracy': [(pred_prob['Class'] == y_test_stan).mean()]})])
    
probs_df.reset_index(inplace = True)
probs_df    

Unnamed: 0,index,Probability Threshold,Accuracy
0,0,0.1,0.49228
1,0,0.2,0.4898
2,0,0.3,0.48936
3,0,0.4,0.48912
4,0,0.5,0.48968
5,0,0.6,0.49196
6,0,0.7,0.49168
7,0,0.8,0.49324
8,0,0.9,0.4964


In [70]:
highest_acc_index = probs_df.idxmax(axis = 0)['Accuracy']
highest_acc_thresh = list(probs_df['Probability Threshold'])[highest_acc_index]
test_metrics(y_test, lr_stan.predict(X_test_stan, threshold = highest_acc_thresh)['Class'])

{'accuracy': 0.4964, 'precision': 0.4941588785046729, 'recall': 0.30456, 'F1 Score': 0.3768560681053257}


Unnamed: 0,Actually Positive,Actually Negative
Predicted Positive,3807,3897
Predicted Negative,8693,8603
