In [1]:
import pandas as pd
import numpy as np
import timeit

from numpy.linalg import inv

## LDA Classifier

In [2]:
# returns a dictionary of priors (one key, value pair for each category)
def compute_priors(df, response):
    y = df[response]
    priors_dict = {}
    priors = dict(y.value_counts(normalize=True))
    return priors

# returns a dictionary of mu vectors (one key, value pair for each category)
def compute_mu_vectors(df, response):
    y = df[response]
    mu_vectors = {}
    for category_k in y.unique():
        df_k = df[y == category_k] # df with y = category_k
        X_k = df_k.drop(response, axis=1)
        mu_vectors[category_k] = dict(X_k.mean())
    return mu_vectors

# returns the inverse of the covariance matrix
def compute_inv_sigma(df, response):
    X = df.drop(response, axis=1)
    return inv(X.cov())

# returns the classification of a single obs
def classify_obs(x_i, y, mu_vectors, priors, inv_sigma):
        prob_dict = {}
        for category_k in y.unique():
            mu_k = pd.Series(mu_vectors[category_k]).to_numpy()
            first_term = x_i.transpose().dot(inv_sigma).dot(mu_k)
            second_term = .5 * mu_k.transpose().dot(inv_sigma).dot(mu_k)
            third_term = np.log(priors[category_k])
            prob_k = first_term - second_term + third_term
            prob_dict[category_k] = prob_k

        best_class, max_prob = next(iter(prob_dict.items()))
        for class_k, prob_k in prob_dict.items():
            if max_prob < prob_k:
                max_prob = prob_k
                best_class = class_k
        return best_class

class LDA:
    
    def fit(self, X_train, y_train):
        df_train = X_train.copy()
        response = y_train.name
        df_train[response] = y_train
        self.y = df_train[response]
        self.priors = compute_priors(df_train, response)
        self.mu_vectors = compute_mu_vectors(df_train, response)
        self.inv_sigma = compute_inv_sigma(df_train, response)
        
    def predict(self, df_test):
        y_pred = {}
        for i in range(len(df_test)):
            x_i = df_test.loc[i, :].to_numpy()
            y_pred[i] = classify_obs(x_i, self.y, self.mu_vectors, self.priors, self.inv_sigma)
        return pd.Series(y_pred)
    

## Function to Compute Confusion Matrix

In [3]:
def produce_confusion_matrix(y_pred, y_test):
    res = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

    res_positives = res[res["Actual"] == 1]
    res_negatives = res[res["Actual"] == -1]

    positives_dict = dict(res_positives["Prediction"].value_counts(normalize=True))
    TPs = positives_dict[1]
    FNs = positives_dict[-1]

    negatives_dict = dict(res_negatives["Prediction"].value_counts(normalize=True))
    TNs = negatives_dict[-1]
    FPs = negatives_dict[1]

    positives = pd.Series([TPs, FNs])
    negatives = pd.Series([FPs, TNs])

    confusion_matrix = pd.DataFrame({"Actually Positive": positives, "Actually Negative": negatives})
    confusion_matrix.rename({0: "Prediced Positive", 1: "Predicted Negative"}, inplace=True)
    return confusion_matrix


# Train and Test (Feature Set #1)

In [48]:
df_train = pd.read_csv("train1.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test1.csv").drop("Unnamed: 0",axis=1)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

## LDA Classifier Results

In [None]:
model = LDA()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = (y_pred == y_test).mean()
print("Accuracy:", acc)

In [None]:
confusion_matrix = produce_confusion_matrix(y_pred, y_test)
confusion_matrix

## Adding Interactions

In [None]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()
y_train2 = y_train.copy()
y_test2 = y_test.copy()

X_train2["Interaction_posc_negc"] = X_train2.Positive_counts*X_train2.Negative_counts
X_test2["Interaction_posc_negc"] = X_test2.Positive_counts*X_test2.Negative_counts

In [21]:
model2 = LDA()
model2.fit(X_train2, y_train2)
y_pred2 = model2.predict(X_test2)
acc = (y_pred2 == y_test2).mean()
print("Accuracy:", acc)

0.73484

In [25]:
confusion_matrix = produce_confusion_matrix(y_pred, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,0.7208,0.25944
Predicted Negative,0.2792,0.74056


Squaring/Cubing Terms did not further improve the accuracy

# Train and Test (Feature Set #2)

In [4]:
start_time = timeit.default_timer()

df_train = pd.read_csv("train2.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test2.csv").drop("Unnamed: 0",axis=1)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

Time (minutes) elapsed for this cell: 0.23385001229999033


In [6]:
df_train.head()

Unnamed: 0,familiar,suspense,essential,delectable,fairy,majestic,gorgeous,gown,predictable,titular,...,raucous,limo,recognisable,relate,ease,penultimate,refused,cal,anythingbr,Label
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
df_train["shit"].value_counts()

KeyError: 'fuck'

## LDA Classifier Results

In [5]:
start_time = timeit.default_timer()

model = LDA()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = (y_pred == y_test).mean()
print("Accuracy:", acc)

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

LinAlgError: Singular matrix

# Train and Test (Stanford Feature Set)

In [27]:
start_time = timeit.default_timer()

df_train = pd.read_csv("stanford_train.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("stanford_test.csv").drop("Unnamed: 0",axis=1)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

## LDA Classifier Results

In [28]:
start_time = timeit.default_timer()

model = LDA()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = (y_pred == y_test).mean()
print("Accuracy:", acc)

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

0.49692

In [43]:
confusion_matrix = produce_confusion_matrix(y_pred, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,0.39904,0.4052
Predicted Negative,0.60096,0.5948
