## Important notes: 
Before running this code, please ensures that you have set up the following directory structure:
```
/root
  |-- baseline_logreg_model.ipynb
  |-- /dataset
    |-- train.csv
    |-- test.csv
    |-- /features
        |-- glove.6B.100d.txt
```

## Load Dataset

In [5]:
import pandas as pd

# read the data
train_data = pd.read_csv("data_code/dataset/fixed_data_2/train.csv")
test_data = pd.read_csv("data_code/dataset/fixed_data_2/test.csv")

train_data.shape, test_data.shape

((430563, 3), (101309, 3))

## Feature Engineering

In [2]:
# numerical features
from nltk.tokenize import word_tokenize

def extract_numerical_features(text: str):
    numerical_features = {}
    
    numerical_features["text_length"] = len(text)
    numerical_features["special_characters_count"] = sum([1 for char in text if not char.isalnum()])
    numerical_features["digit_count"] = sum([1 for char in text if char.isdigit()])
    
    word_tokens = word_tokenize(text)
    numerical_features["word_count"] = len(word_tokens)
    numerical_features["unique_word_count"] = len(set(word_tokens))
    
    return numerical_features


training_numerical_features = train_data["Document"].apply(extract_numerical_features).apply(pd.Series)
testing_numerical_features = test_data["Document"].apply(extract_numerical_features).apply(pd.Series)
    

In [3]:
# psycholinguistic features (polarity & subjectivity)
from textblob import TextBlob

def extract_psycholinguistic_features(text: str):
    psycho_features = {}
    
    text_blob = TextBlob(text)
    psycho_features["polarity"] = text_blob.sentiment.polarity
    psycho_features["subjectivity"] = text_blob.sentiment.subjectivity
    
    return psycho_features

training_psycholinguistic_features = train_data["Document"].apply(extract_psycholinguistic_features).apply(pd.Series)
testing_psycholinguistic_features = test_data["Document"].apply(extract_psycholinguistic_features).apply(pd.Series)

In [4]:
# readability features
import textstat

def extract_readability_features(text: str):
    readability_features = {}
    
    readability_features["gunning_fog_index"] = textstat.gunning_fog(text)
    readability_features["coleman_liau_index"] = textstat.coleman_liau_index(text)
    readability_features['linsear_write'] = textstat.linsear_write_formula(text)
    
    # Dale-Chall Readability Score
    readability_features['dale_chall_score'] = textstat.dale_chall_readability_score(text)
    
    # Flesch Reading Ease
    readability_features['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    
    # Spache Readability Score
    readability_features['spache'] = textstat.spache_readability(text)
    
    # Automated Readability Index
    readability_features['automated_readability_index'] = textstat.automated_readability_index(text)
    
    # Smog Index
    readability_features['smog_index'] = textstat.smog_index(text)
    
    # Flesch-Kincaid Grade Level
    readability_features['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    
    # RIX Readability
    readability_features['rix'] = textstat.rix(text)
    
    # McAlpine EFLAW Readability
    readability_features['mcalpine_eflaw'] = textstat.mcalpine_eflaw(text)
    
    # Gulpease Readability
    readability_features['gulpease'] = textstat.gulpease_index(text)
    
    return readability_features


training_readability_features = train_data["Document"].apply(extract_readability_features).apply(pd.Series)
testing_readability_features = test_data["Document"].apply(extract_readability_features).apply(pd.Series)

In [9]:
# NER features (count of named entities)
# TAKE NOTE THAT THIS PART IS EXTREMELY SLOW...
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_ner_features(text: str):
    doc = nlp(text)
    ner_features = {}
    
    ner_features["ner_count"] = len(doc.ents)
    
    return ner_features


training_ner_features = train_data["Document"].apply(extract_ner_features).apply(pd.Series)
testing_ner_features = test_data["Document"].apply(extract_ner_features).apply(pd.Series)

KeyboardInterrupt: 

In [5]:
# Empath features
from empath import Empath

lexicon = Empath()
desired_categories = ['economics',
 'business',
 'money',
 'government',
 'negotiate',
 'payment',
 'banking',
 'valuable',
 'gain',
 'leader']

def extract_empath_features(text: str):
    empath_features = lexicon.analyze(text, categories=desired_categories)
    return empath_features


training_empath_features = train_data["Document"].apply(extract_empath_features).apply(pd.Series).rename(columns=lambda x: x + "_empath")
testing_empath_features = test_data["Document"].apply(extract_empath_features).apply(pd.Series).rename(columns=lambda x: x + "_empath")

In [9]:
# only select the top 5 empath categories in training data that have the highest correlation with the target ("Label")
empath_correlation = training_empath_features.corrwith(train_data["Label"]).sort_values(ascending=False)
top_empath_categories = empath_correlation.head(10).index.tolist()

In [10]:
top_empath_categories

['economics',
 'business',
 'money',
 'government',
 'negotiate',
 'payment',
 'banking',
 'valuable',
 'gain',
 'leader']

In [6]:
# tf-idf features (unigram)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=10, stop_words="english")

training_uni_tf_idf_features = pd.DataFrame(tfidf_vectorizer.fit_transform(train_data["Document"]).toarray(), columns=[tf_idf_feature_name + "_uni" for tf_idf_feature_name in tfidf_vectorizer.get_feature_names_out()])
testing_uni_tf_idf_features = pd.DataFrame(tfidf_vectorizer.transform(test_data["Document"]).toarray(), columns=[tf_idf_feature_name + "_uni" for tf_idf_feature_name in tfidf_vectorizer.get_feature_names_out()])

In [7]:
# tf-idf features (bigram)
tfidf_vectorizer = TfidfVectorizer(max_features=10, stop_words="english", ngram_range=(2, 2))

training_bi_tf_idf_features = pd.DataFrame(tfidf_vectorizer.fit_transform(train_data["Document"]).toarray(), columns=tfidf_vectorizer.get_feature_names_out() + "_bi")
testing_bi_tf_idf_features = pd.DataFrame(tfidf_vectorizer.transform(test_data["Document"]).toarray(), columns=tfidf_vectorizer.get_feature_names_out() + "_bi")

In [8]:
# glove features (must use only glove 50 due to memory constraints)

import numpy as np

glove_dimensions = 100

def load_glove_embeddings():
    embedding_dict = dict()
    with open(f"./dataset/features/glove.6B.{glove_dimensions}d.txt", "r", encoding="utf-8") as file:
        for line in file:
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype="float32")
            embedding_dict[word] = vectors
            
    return embedding_dict

glove_embeddings = load_glove_embeddings()

In [16]:
def get_glove_word_embedding(word: str):
    return glove_embeddings.get(word, np.zeros(glove_dimensions))


def extract_glove_features(text: str):
    # only look for the average of the embeddings
    embeddings = [get_glove_word_embedding(word) for word in word_tokenize(text)]
    avg_embeddings = pd.Series(np.mean(embeddings, axis=0), index=[f"avg_glove_{i}" for i in range(glove_dimensions)])
    sum_embeddings = pd.Series(np.sum(embeddings, axis=0), index=[f"sum_glove_{i}" for i in range(glove_dimensions)])
    
    return pd.concat([avg_embeddings, sum_embeddings])


training_glove_features = train_data["Document"].apply(extract_glove_features)
testing_glove_features = test_data["Document"].apply(extract_glove_features)


## Scaling & Randomization

In [17]:
# combine all extracted features into a single dataframe

training_features = pd.concat([training_numerical_features, training_psycholinguistic_features, training_readability_features, training_empath_features, training_uni_tf_idf_features, training_bi_tf_idf_features, training_glove_features], axis=1)
testing_features = pd.concat([testing_numerical_features, testing_psycholinguistic_features, testing_readability_features, testing_empath_features, testing_uni_tf_idf_features, testing_bi_tf_idf_features, testing_glove_features], axis=1)

In [18]:
# standard normalization of the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

training_features = pd.DataFrame(scaler.fit_transform(training_features), columns=training_features.columns)
testing_features = pd.DataFrame(scaler.transform(testing_features), columns=testing_features.columns)

In [19]:
# randomize the training data
training_features = pd.concat([training_features, train_data["Label"]], axis=1).sample(frac=1).reset_index(drop=True)
training_labels = training_features["Label"]
training_features = training_features.drop("Label", axis=1)

# save the features
# training_features.to_csv("./dataset/features/training_features.csv", index=False)

## Model Training

In [20]:
# logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

logistic_regression = LogisticRegression(random_state=42, max_iter=1000)

logistic_regression.fit(training_features, training_labels)

# cross validation to evaluate the model
cv_score = cross_val_score(logistic_regression, training_features, training_labels, cv=5, scoring="f1_macro").mean()

print(f"Cross validation score: {cv_score}")

# print the classification report
print(classification_report(test_data["Label"], logistic_regression.predict(testing_features)))

Cross validation score: 0.7842524991915567
              precision    recall  f1-score   support

           0       0.75      0.62      0.68     29073
           1       0.86      0.92      0.89     72236

    accuracy                           0.83    101309
   macro avg       0.80      0.77      0.78    101309
weighted avg       0.83      0.83      0.83    101309



In [6]:
import lime 
from lime.lime_tabular import LimeTabularExplainer
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
vectorizer = CountVectorizer(max_features=100, stop_words="english")
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

c = make_pipeline(vectorizer, LogisticRegression(random_state=42, max_iter=1000))
c.fit(train_data["Document"], train_data["Label"])
print(c.predict_proba(test_data))


[[0.34012086 0.65987914]
 [0.34012086 0.65987914]
 [0.34012086 0.65987914]]


In [14]:
print(c.predict_proba(test_data["Document"]))

[[0.2342539  0.7657461 ]
 [0.34012086 0.65987914]
 [0.30858267 0.69141733]
 ...
 [0.34012086 0.65987914]
 [0.23085021 0.76914979]
 [0.34012086 0.65987914]]


In [16]:
explainer = LimeTextExplainer(class_names= ['0', '1'])
exp = explainer.explain_instance(test_data["Document"].loc[7], c.predict_proba, num_features=3)
rows = test_data["Document"].index.size
print(exp.as_list())
exp.show_in_notebook(text=False)


[('Obama', -0.18973470954958593), ('said', 0.18638138255318937), ('New', 0.07443816782748798)]


In [24]:
import csv
csv_file = "initial.csv"
with open(csv_file, 'a', newline='') as file:
    writer = csv.writer(file)
    print("Total: " + str(rows))
    for i in range(rows):
        exp = explainer.explain_instance(test_data["Document"].loc[i], c.predict_proba, num_features=3)
        writer.writerows(exp.as_list())
        print("Current: " + str(i))


Total: 101309
Current: 0
Current: 1
Current: 2
Current: 3
Current: 4
Current: 5
Current: 6
Current: 7
Current: 8
Current: 9
Current: 10
Current: 11
Current: 12
Current: 13
Current: 14
Current: 15
Current: 16
Current: 17
Current: 18
Current: 19
Current: 20
Current: 21
Current: 22
Current: 23
Current: 24
Current: 25
Current: 26
Current: 27
Current: 28
Current: 29
Current: 30
Current: 31
Current: 32
Current: 33
Current: 34
Current: 35
Current: 36
Current: 37
Current: 38
Current: 39
Current: 40
Current: 41
Current: 42
Current: 43
Current: 44
Current: 45
Current: 46
Current: 47
Current: 48
Current: 49
Current: 50
Current: 51
Current: 52
Current: 53
Current: 54
Current: 55
Current: 56
Current: 57
Current: 58
Current: 59
Current: 60
Current: 61
Current: 62
Current: 63
Current: 64
Current: 65
Current: 66
Current: 67
Current: 68
Current: 69
Current: 70
Current: 71
Current: 72
Current: 73
Current: 74
Current: 75
Current: 76
Current: 77
Current: 78
Current: 79
Current: 80
Current: 81
Current: 82
