# Install Required Python Modules

In [None]:
! pip install beautifulsoup4 lxml nltk spacy textblob emoji pandas openpyxl scikit-learn xgboost

In [None]:
! python -m spacy download en_core_web_sm

# Data Cleaning

## Load Modules and Config

In [34]:
import emoji
import nltk
import numpy
import pickle
import re
import spacy
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pandas import read_excel
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob

In [None]:
# Download the NLTK resources used by this notebook (stopword list and punkt)
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# Load the small English spaCy pipeline; clean_text() uses it for lemmatization
nlp = spacy.load("en_core_web_sm")

## Define Text Cleaning Functions

In [3]:
# Cached English stopword set; filled on first use so the NLTK corpus file is
# read once instead of on every clean_text() call.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stopwords as a set, loading them on first call."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Normalize one raw comment into a lower-case, lemmatized, stopword-free string.

    Steps, in order: strip HTML, drop URLs, drop @mentions and #hashtags,
    replace emoji with their names, remove punctuation and digits, lower-case,
    lemmatize with spaCy, remove English stopwords, and run TextBlob spelling
    correction.

    Parameters
    ----------
    text : str or any
        The raw comment. ``None`` and float NaN are treated as an empty
        comment; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text (``""`` for missing input).
    """
    # Missing values (None / NaN) would make BeautifulSoup raise; treat them as empty
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Numeric cells are not strings either; coerce so the parsers below accept them
    text = str(text)

    # Remove HTML tag
    text = BeautifulSoup(text, "lxml").text

    # Remove URL ("http\S+" already covers every "https..." match)
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove @user and #tag
    text = re.sub(r"@\w+|#\w+", "", text)

    # Replace emoji
    # NOTE: the punctuation step below also strips the ":" and "_" of the emoji
    # name, so each emoji ends up as one fused token.
    text = emoji.demojize(text)

    # Remove punctuations
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Lower the text
    text = text.lower()

    # Lemmatization
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])

    # Remove stopwords
    stop_words = _english_stop_words()
    text = " ".join([word for word in text.split() if word not in stop_words])

    # Correct
    text = str(TextBlob(text).correct())

    return text
 
# Test: run one sample containing an emoji, an HTML tag, a URL, a hashtag and
# a mention through clean_text() and print the text before and after.
raw_text = "Hello! 😊 This is a <b>test</b> message. Visit: https://example.com #AI @user123"
cleaned_text = clean_text(raw_text)
print("Raw:", raw_text)
print("Clean:", cleaned_text)

Raw: Hello! 😊 This is a <b>test</b> message. Visit: https://example.com #AI @user123
Clean: hello smilingfacewithsmilingeye test message visit


## Load Datasets and Clean Fields

In [84]:
# Read the coded comments workbook
dataset = read_excel("./merged_codes.xlsx")

# Store the cleaned version of every comment in a new "CleanText" column
dataset["CleanText"] = dataset["comment"].apply(clean_text)

In [85]:
# Count the rows where score equals upvotes
(dataset["score"] == dataset["upvotes"]).sum()

400

In [86]:
# Shift the scores so that the smallest one becomes 0
dataset["score_no_negative"] = dataset["score"] - dataset["score"].min()

# Integer-encode the two categorical columns, each with its own encoder
dataset["subreddit_label"] = LabelEncoder().fit_transform(dataset["subreddit"])
dataset["search_term_label"] = LabelEncoder().fit_transform(dataset["search_term"])

In [87]:
# Persist the cleaned dataset so the feature-building step can reload it
with open("dataset.pickle", mode = "wb") as f:
    pickle.dump(dataset, file = f)

## Create unigrams and TF-IDF

In [88]:
# Reload the cleaned dataset written by the previous section
with open("dataset.pickle", "rb") as f:
    dataset = pickle.load(f)

# Unigram counts of the cleaned comments
# NOTE(review): the vocabulary and the IDF weights are fit on every row, before
# the train/test split done later in the notebook, so test-set text influences
# the features.
count_vect = CountVectorizer(stop_words = "english", decode_error = "ignore")
text_counts = count_vect.fit_transform(dataset["CleanText"])

# TF-IDF weighting of the same count matrix
tfidf_transformer = TfidfTransformer()
text_tfidf = tfidf_transformer.fit_transform(text_counts)

# Downvotes are constant
# The three non-text columns are placed in front of each text matrix.
meta_features = dataset[["score_no_negative", "subreddit_label", "search_term_label"]].values
x_counts = hstack((meta_features, text_counts))
x_tfidf = hstack((meta_features, text_tfidf))

## Split Train and Test Sets

In [89]:
# Split both feature matrices and the outcome with the same 80/20 partition
(
    x_train_counts, x_test_counts,
    x_train_tfidf, x_test_tfidf,
    y_train, y_test,
) = train_test_split(
    x_counts,
    x_tfidf,
    dataset["outcome"],
    test_size = 0.2,
    random_state = 1,
)

# Train and Test Model

## Load Modules and Config

In [90]:
import os

from pandas import DataFrame
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier

## Define Batch Functions

### Functions: Transform Labels for `XGBClassifier`

In [91]:
def transform_to_int(y):
    """Map a stance label to its integer code.

    "oppose" -> 0, "favor" -> 1, "neutral" -> 2. Any other value returns None.
    """
    for code, label in enumerate(("oppose", "favor", "neutral")):
        if y == label:
            return code
    return None

def transform_to_label(y):
    """Map an integer code back to its stance label.

    0 -> "oppose", 1 -> "favor", 2 -> "neutral". Any other value returns None.
    """
    for code, label in enumerate(("oppose", "favor", "neutral")):
        if y == code:
            return label
    return None

def transform_y(y, model_tag):
    """Return the targets in the form the tagged model is trained on.

    For "xgboost.XGBClassifier" each label is converted with
    `transform_to_int` via `y.apply`; for every other tag `y` is returned
    unchanged.
    """
    needs_int_labels = model_tag == "xgboost.XGBClassifier"
    return y.apply(transform_to_int) if needs_int_labels else y

### Functions: Train and Evaluate

In [92]:
def train(model, x, y):
    """Fit `model` on features `x` and targets `y`; return what `model.fit` returns."""
    fitted = model.fit(x, y)
    return fitted

def predict(fit, x):
    """Return the predictions of the fitted model `fit` for features `x`."""
    predictions = fit.predict(x)
    return predictions

def train_and_evaluate(model, x, y):
    """Fit `model` on (`x`, `y`) and score it on that same data.

    Returns a tuple of the fitted model and the `classification_report`
    dictionary computed from `y` and the model's predictions for `x`.
    """
    fitted_model = train(model, x, y)
    report = classification_report(y, predict(fitted_model, x), output_dict = True)
    return fitted_model, report

# Test: fit XGBoost on the count features with integer-coded labels and show
# the in-sample report
fit, gof = train_and_evaluate(
    XGBClassifier(),
    x_train_counts,
    y_train.apply(transform_to_int),
)
gof

{'0': {'precision': 0.9378531073446328,
  'recall': 0.9485714285714286,
  'f1-score': 0.9431818181818182,
  'support': 175},
 '1': {'precision': 0.925531914893617,
  'recall': 0.9560439560439561,
  'f1-score': 0.9405405405405406,
  'support': 91},
 '2': {'precision': 0.9183673469387755,
  'recall': 0.8333333333333334,
  'f1-score': 0.8737864077669903,
  'support': 54},
 'accuracy': 0.93125,
 'macro avg': {'precision': 0.9272507897256751,
  'recall': 0.9126495726495727,
  'f1-score': 0.919169588829783,
  'support': 320},
 'weighted avg': {'precision': 0.9310610461728868,
  'recall': 0.93125,
  'f1-score': 0.9307202293450777,
  'support': 320}}

### Function: Test and Evaluate

In [93]:
def test_and_evaluate(fit, x, y):
    """Score the fitted model `fit` on features `x` against targets `y`.

    Returns the `classification_report` dictionary for the predictions.
    """
    return classification_report(y, predict(fit, x), output_dict = True)

# Test: score the XGBoost model fitted above on the held-out count features
gof = test_and_evaluate(
    fit,
    x_test_counts,
    y_test.apply(transform_to_int),
)
gof

{'0': {'precision': 0.5192307692307693,
  'recall': 0.6428571428571429,
  'f1-score': 0.574468085106383,
  'support': 42},
 '1': {'precision': 0.2857142857142857,
  'recall': 0.3157894736842105,
  'f1-score': 0.3,
  'support': 19},
 '2': {'precision': 0.42857142857142855,
  'recall': 0.15789473684210525,
  'f1-score': 0.23076923076923078,
  'support': 19},
 'accuracy': 0.45,
 'macro avg': {'precision': 0.4111721611721612,
  'recall': 0.37218045112781956,
  'f1-score': 0.3684124386252046,
  'support': 80},
 'weighted avg': {'precision': 0.44223901098901097,
  'recall': 0.45,
  'f1-score': 0.4276534369885434,
  'support': 80}}

### Functions: Format Goodness-of-fit

In [94]:
def transform_gof(gof, model_tag):
    """Rename the per-class keys of an XGBoost report back to stance labels.

    For "xgboost.XGBClassifier" a new dictionary is returned in which the
    entries "0", "1" and "2" are keyed by `transform_to_label` of the code,
    followed by "accuracy", "macro avg" and "weighted avg". For any other
    tag `gof` is returned as is.
    """
    if model_tag != "xgboost.XGBClassifier":
        return gof

    relabeled = {transform_to_label(code): gof[str(code)] for code in (0, 1, 2)}
    for summary_key in ("accuracy", "macro avg", "weighted avg"):
        relabeled[summary_key] = gof[summary_key]
    return relabeled

def format_gof_long(gof, model_tag, data_tag, set_tag):
    """Flatten a classification report into long-format records.

    Each record is a dict with the keys "data", "model", "set", "class",
    "metric" and "value". The "accuracy" entry of `gof` produces one record;
    every other entry produces exactly one record per metric (precision,
    recall, f1-score, support).

    Parameters
    ----------
    gof : dict
        Report as returned by `classification_report(..., output_dict = True)`.
    model_tag, data_tag, set_tag
        Values copied into the "model", "data" and "set" fields of every record.

    Returns
    -------
    list of dict
    """
    records = []
    for item in gof:
        if item == "accuracy":
            # Accuracy is a single number, not a per-metric dictionary
            records.append({
                "data": data_tag,
                "model": model_tag,
                "set": set_tag,
                "class": "accuracy",
                "metric": "accuracy",
                "value": gof[item]
            })
        else:
            # One record per metric; the records are added once per class so
            # that earlier metrics are not emitted again for each later one.
            records.extend(
                {
                    "data": data_tag,
                    "model": model_tag,
                    "set": set_tag,
                    "class": item,
                    "metric": metric,
                    "value": gof[item][metric]
                }
                for metric in ["precision", "recall", "f1-score", "support"]
            )
    return records

def format_gof_wide(gof, model_tag, data_tag, set_tag):
    """Turn a classification report into wide-format records, one per entry.

    Every record starts with "data", "model", "set" and "class". The
    "accuracy" entry adds a single "accuracy" field; every other entry adds
    "precision", "recall", "f1-score" and "support".
    """
    metric_names = ("precision", "recall", "f1-score", "support")
    rows = []
    for class_name in gof:
        scores = gof[class_name]
        row = {
            "data": data_tag,
            "model": model_tag,
            "set": set_tag,
            "class": class_name,
        }
        if class_name == "accuracy":
            row["accuracy"] = scores
        else:
            for name in metric_names:
                row[name] = scores[name]
        rows.append(row)
    return rows

# Test: relabel the XGBoost test report and display it in wide form
DataFrame(
    format_gof_wide(
        transform_gof(gof, "xgboost.XGBClassifier"),
        "xgboost.XGBClassifier",
        "counts",
        "test",
    )
)

Unnamed: 0,data,model,set,class,precision,recall,f1-score,support,accuracy
0,counts,xgboost.XGBClassifier,test,oppose,0.519231,0.642857,0.574468,42.0,
1,counts,xgboost.XGBClassifier,test,favor,0.285714,0.315789,0.3,19.0,
2,counts,xgboost.XGBClassifier,test,neutral,0.428571,0.157895,0.230769,19.0,
3,counts,xgboost.XGBClassifier,test,accuracy,,,,,0.45
4,counts,xgboost.XGBClassifier,test,macro avg,0.411172,0.37218,0.368412,80.0,
5,counts,xgboost.XGBClassifier,test,weighted avg,0.442239,0.45,0.427653,80.0,


### Functions: Batch Works

In [95]:
# Model tag -> zero-argument factory. Lambdas defer the class lookup to call
# time and guarantee a fresh, unfitted estimator on every call.
_MODEL_FACTORIES = {
    "sklearn.ensemble.ExtraTreesClassifier": lambda: ExtraTreesClassifier(),
    "sklearn.ensemble.RandomForestClassifier": lambda: RandomForestClassifier(),
    "sklearn.linear_model.LogisticRegression": lambda: LogisticRegression(),
    "sklearn.linear_model.RidgeClassifier": lambda: RidgeClassifier(),
    "sklearn.neighbors.KNeighborsClassifier": lambda: KNeighborsClassifier(),
    "sklearn.neighbors.NearestCentroid": lambda: NearestCentroid(),
    "sklearn.neural_network.MLPClassifier": lambda: MLPClassifier(),
    "sklearn.naive_bayes.BernoulliNB": lambda: BernoulliNB(),
    "sklearn.naive_bayes.MultinomialNB": lambda: MultinomialNB(),
    "sklearn.svm.LinearSVC": lambda: LinearSVC(),
    "sklearn.tree.DecisionTreeClassifier": lambda: DecisionTreeClassifier(),
    "sklearn.tree.ExtraTreeClassifier": lambda: ExtraTreeClassifier(),
    "xgboost.XGBClassifier": lambda: XGBClassifier(),
}


def create_model(model_tag):
    """Create a new estimator with default settings for `model_tag`.

    Parameters
    ----------
    model_tag : str
        Dotted name of the classifier, e.g. "sklearn.svm.LinearSVC".

    Returns
    -------
    A new, unfitted instance of the requested classifier.

    Raises
    ------
    ValueError
        If `model_tag` is not one of the supported tags. This replaces a
        silent `None` return, which only failed later when `.fit` was called.
    """
    try:
        factory = _MODEL_FACTORIES[model_tag]
    except (KeyError, TypeError):
        raise ValueError(f"Unknown model tag: {model_tag!r}") from None
    return factory()

def train_test_evaluate(model_tag, data, data_tag):
    """Fit one model on one feature set and report train and test scores.

    Parameters
    ----------
    model_tag : str
        Tag passed to `create_model` and recorded in every result row.
    data : tuple
        ``(x_train, y_train, x_test, y_test)``, in that order.
    data_tag : str
        Name of the feature set, recorded in every result row.

    Returns
    -------
    tuple
        ``(fit, train_gof_long, train_gof_wide, test_gof_long, test_gof_wide)``
        where `fit` is the fitted model and the rest are lists of records from
        `format_gof_long` / `format_gof_wide`.
    """
    x_train, y_train, x_test, y_test = data
    # XGBoost is trained on integer codes; other models keep the string labels
    y_train, y_test = transform_y(y_train, model_tag), transform_y(y_test, model_tag)

    model = create_model(model_tag)

    fit, train_gof = train_and_evaluate(model, x_train, y_train)
    # Map XGBoost's "0"/"1"/"2" report keys back to the stance labels so the
    # "class" column is comparable across models (no-op for other tags).
    train_gof = transform_gof(train_gof, model_tag)
    train_gof_long = format_gof_long(train_gof, model_tag, data_tag, "train")
    train_gof_wide = format_gof_wide(train_gof, model_tag, data_tag, "train")

    test_gof = transform_gof(test_and_evaluate(fit, x_test, y_test), model_tag)
    test_gof_long = format_gof_long(test_gof, model_tag, data_tag, "test")
    test_gof_wide = format_gof_wide(test_gof, model_tag, data_tag, "test")

    return fit, train_gof_long, train_gof_wide, test_gof_long, test_gof_wide

## Fit Models

### Define All Models and Datasets

In [96]:
# Tags of every classifier to fit; each one is a tag handled by create_model()
model_tags = [
    "sklearn.ensemble.ExtraTreesClassifier",
    "sklearn.ensemble.RandomForestClassifier",
    "sklearn.linear_model.LogisticRegression",
    "sklearn.linear_model.RidgeClassifier",
    "sklearn.neighbors.KNeighborsClassifier",
    "sklearn.neighbors.NearestCentroid",
    "sklearn.neural_network.MLPClassifier",
    "sklearn.naive_bayes.BernoulliNB",
    "sklearn.naive_bayes.MultinomialNB",
    "sklearn.svm.LinearSVC",
    "sklearn.tree.DecisionTreeClassifier",
    "sklearn.tree.ExtraTreeClassifier",
    "xgboost.XGBClassifier"
]

# Feature sets to fit each model on: raw unigram counts and their TF-IDF weighting
data_tags = ["counts", "tfidf"]

### Run!

In [97]:
# Create the output folder if it is missing (no error when it already exists)
os.makedirs("./Results", exist_ok = True)

# Feature sets keyed by data tag, each as (x_train, y_train, x_test, y_test)
datasets = {
    "counts": (x_train_counts, y_train, x_test_counts, y_test),
    "tfidf": (x_train_tfidf, y_train, x_test_tfidf, y_test),
}

for data_tag in data_tags:
    # An unrecognised tag raises KeyError here instead of silently reusing
    # the data of the previous tag.
    data = datasets[data_tag]
    for model_tag in model_tags:
        pickle_file = f"./Results/{data_tag}+{model_tag}.pickle"
        # Results are cached per (data, model) pair; delete the pickle to refit
        if os.path.exists(pickle_file):
            print(f"Skipping {data_tag} using {model_tag}: already exists.")
            continue
        print(f"Processing {data_tag} using {model_tag}: ", end = "")
        result = train_test_evaluate(model_tag, data, data_tag)
        with open(pickle_file, "wb") as f:
            pickle.dump(result, f)
        print("done.")

Processing counts using sklearn.ensemble.ExtraTreesClassifier: done.
Processing counts using sklearn.ensemble.RandomForestClassifier: done.
Processing counts using sklearn.linear_model.LogisticRegression: done.
Processing counts using sklearn.linear_model.RidgeClassifier: done.
Processing counts using sklearn.neighbors.KNeighborsClassifier: done.
Processing counts using sklearn.neighbors.NearestCentroid: done.
Processing counts using sklearn.neural_network.MLPClassifier: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


done.
Processing counts using sklearn.naive_bayes.BernoulliNB: done.
Processing counts using sklearn.naive_bayes.MultinomialNB: done.
Processing counts using sklearn.svm.LinearSVC: done.
Processing counts using sklearn.tree.DecisionTreeClassifier: done.
Processing counts using sklearn.tree.ExtraTreeClassifier: done.
Processing counts using xgboost.XGBClassifier: 



done.
Processing tfidf using sklearn.ensemble.ExtraTreesClassifier: done.
Processing tfidf using sklearn.ensemble.RandomForestClassifier: done.
Processing tfidf using sklearn.linear_model.LogisticRegression: done.
Processing tfidf using sklearn.linear_model.RidgeClassifier: done.
Processing tfidf using sklearn.neighbors.KNeighborsClassifier: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


done.
Processing tfidf using sklearn.neighbors.NearestCentroid: done.
Processing tfidf using sklearn.neural_network.MLPClassifier: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


done.
Processing tfidf using sklearn.naive_bayes.BernoulliNB: done.
Processing tfidf using sklearn.naive_bayes.MultinomialNB: done.
Processing tfidf using sklearn.svm.LinearSVC: done.
Processing tfidf using sklearn.tree.DecisionTreeClassifier: done.
Processing tfidf using sklearn.tree.ExtraTreeClassifier: done.
Processing tfidf using xgboost.XGBClassifier: done.


### Check Results

In [98]:
# Gather the stored records of every (data, model) run into two flat lists
results_long = []
results_wide = []

for data_tag in data_tags:
    for model_tag in model_tags:
        pickle_file = f"./Results/{data_tag}+{model_tag}.pickle"
        # These pickles were written by the run cell of this notebook
        with open(pickle_file, "rb") as f:
            fit, train_gof_long, train_gof_wide, test_gof_long, test_gof_wide = pickle.load(f)
        # Train records first, then test records, for each run
        results_long += train_gof_long
        results_long += test_gof_long
        results_wide += train_gof_wide
        results_wide += test_gof_wide

# Export both layouts without the DataFrame index
DataFrame(results_wide).to_csv("results_wide.csv", index = False)
DataFrame(results_long).to_csv("results_long.csv", index = False)