In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from parseit.data import load_pickle
import pandas as pd

pd.options.display.min_rows = 100
#pd.options.display.max_rows = set larger than dataset
pd.options.display.max_columns = 20
%matplotlib inline
mpl.rcParams["figure.figsize"] = 15,8

pickle_df = load_pickle("comments.p")
#pickle_df = load_pickle("okt-5-10728c.p")
#pickle_df = load_pickle("okt-5-43896c.p")
#pickle_df = load_pickle("okt-11-4788c.p")

In [2]:
# Make a copy to work from original
df = pickle_df.copy()

# remove noise
df = df[df["body"] != "[removed]"]
df = df.reset_index()

# Feature extraction
from modules.bag_of_words import BagOfWords
from modules.bigram_of_words import BigramOfWords
from modules.tfidf import TfIdf

df = BigramOfWords.score(df)
#df = BagOfWords.score(df)
#df = TfIdf.score(df)
display(df)

# benfords law
# https://www.youtube.com/watch?v=etx0k1nLn78

Unnamed: 0,index,body,subreddit,submission,label,! !,! &,! 'm,! ),! 2,...,” teenager,” thing,” type,” unnerved,” without,” ―,” ’,” “,♥️ lonely,🖕trump supporter
0,0,This is the canonical explanation for the TP h...,AskReddit,You go to sleep on the 31st of December 2020 a...,20399,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Sell all the stocks in February and buy Tesla ...,AskReddit,You go to sleep on the 31st of December 2020 a...,37791,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,"idk, cry, invest, reset, repeat.",AskReddit,You go to sleep on the 31st of December 2020 a...,43314,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,Don't have sex with so-and-so because they hav...,AskReddit,You go to sleep on the 31st of December 2020 a...,9216,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Start a twitch/youtube channel that predicts 2...,AskReddit,You go to sleep on the 31st of December 2020 a...,45018,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,I would go to the doctor in early January to c...,AskReddit,You go to sleep on the 31st of December 2020 a...,10035,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,I'm gonna be real sad because I just spent the...,AskReddit,You go to sleep on the 31st of December 2020 a...,31027,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7,Get a haircut in February.,AskReddit,You go to sleep on the 31st of December 2020 a...,63821,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8,Tell Nevada to start counting mail in ballots ...,AskReddit,You go to sleep on the 31st of December 2020 a...,12816,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,Call my brother since he died in the first wee...,AskReddit,You go to sleep on the 31st of December 2020 a...,8362,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
import numpy as np

# Drop indexs more from lower score and less from higher score
# https://stackoverflow.com/questions/28556942/pandas-remove-rows-at-random-without-shuffling-dataset/28557333#28557333
# np.random.seed(10)


y = df[["label"]]
X = df.drop(["body", "submission", "subreddit", "label"], axis=1)


# Bin all of y first
# uniform = All bins in each feature have identical widths.
# quantile = All bins in each feature have the same number of points.
# kmeans = Values in each bin have the same nearest center of a 1D k-means cluster.
total_bins = 2
for bins in range(2, total_bins+1, 2):
    # kmeans seems right: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_discretization_strategies.html
    est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy="kmeans") #
    est.fit(y)
    yt = pd.DataFrame(data=est.transform(y), columns=["y"])

    # Drop some data
    to_drop = yt[yt.y == 0][:10000]
    yt_filtered = yt.drop(index=to_drop.index)
    X_filtered = X.drop(index=to_drop.index)


    # Bad idea to do random split with so skewed data?
    #data = scaler.transform(data)
    #X_train, X_validate, y_train, y_validate = train_test_split(X, yt, test_size=0.7) 

    # https://stats.stackexchange.com/questions/131255/class-imbalance-in-supervised-machine-learning
    # https://datascience.stackexchange.com/questions/32818/train-test-split-of-unbalanced-dataset-classification/32820#32820
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
    skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
    # skf.get_n_splits(X, yt)
    balanced_accuracies = []
    accuracies = []
    predictions = np.array([])
    true_values = np.array([])

    for train_index, test_index in skf.split(X_filtered, yt_filtered):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_validate = X_filtered.iloc[train_index], X_filtered.iloc[test_index]
        y_train, y_true = yt_filtered.iloc[train_index], yt_filtered.iloc[test_index]

        #scaler = MinMaxScaler(min=0, max=1)
        scaler = MinMaxScaler()
        scaler.fit(X_train, y_train) # Don't cheat: fit only training data. y is ignored, but needs same shape[rows]

        model = MultinomialNB()

        # Train the model using the training sets - scale only features (?)
        model.fit(scaler.transform(X_train), y_train)

        # Predic the model
        y_pred = model.predict(scaler.transform(X_validate))

        accuracy = metrics.accuracy_score(y_true, y_pred, normalize=True)
        # TODO: balanced - https://stackoverflow.com/questions/55838262/why-does-cross-validation-give-consistently-higher-scores-than-normal-fitting-an
        balanced_accuracy = metrics.balanced_accuracy_score(y_true, y_pred, adjusted=False)
        # TODO: https://stats.stackexchange.com/questions/99667/naive-bayes-with-unbalanced-classes

        accuracies.append(accuracy)
        balanced_accuracies.append(balanced_accuracy)
        predictions = np.append(predictions, y_pred)
        true_values = np.append(true_values, y_true)

        # TODO: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html

    results = pd.DataFrame(data={"pred": predictions, "true": true_values})
    counts = pd.DataFrame(data={
        f"true-{bins}": results["true"].value_counts(dropna=False),
        f"pred-{bins}": results["pred"].value_counts(normalize=False, dropna=False),
        f"percent-{bins}": results["pred"].value_counts(normalize=True, dropna=False)
    })
    counts = pd.concat([pd.DataFrame(data=[{
        f"true-{bins}": len(df) - results["true"].value_counts(dropna=False).sum(),
        f"pred-{bins}": len(df) - results["pred"].value_counts(dropna=False).sum(),
        f"percent-{bins}": (len(df) - results["pred"].value_counts(dropna=False).sum()) / (len(df) - results["true"].value_counts(dropna=False).sum())
    }]), counts], ignore_index=True)

    print(f"\n\nMean: {np.mean(accuracies)} \n Mean balanced: {np.mean(balanced_accuracies)}")
    display(counts)




Mean: 0.7659698025551686 
 Mean balanced: 0.6220527556734453


Unnamed: 0,true-2,pred-2,percent-2
0,500,500,1.0
1,87,112,0.903226
2,37,12,0.096774




Mean: 0.42702284165698795 
 Mean balanced: 0.35428849902534115


Unnamed: 0,true-4,pred-4,percent-4
0,500,500,1.0
1,49,110,0.887097
2,57,4,0.032258
3,17,8,0.064516
4,1,2,0.016129




Mean: 0.3790166473093302 
 Mean balanced: 0.2343137254901961


Unnamed: 0,true-6,pred-6,percent-6
0,500,500,1.0
1,41,106,0.854839
2,53,12,0.096774
3,14,2,0.016129
4,8,2,0.016129
5,7,1,0.008065
6,1,1,0.008065




Mean: 0.2177700348432056 
 Mean balanced: 0.09939713064713064


Unnamed: 0,true-8,pred-8,percent-8
0,500,500.0,1.0
1,4,5.0,0.040323
2,45,71.0,0.572581
3,38,8.0,0.064516
4,13,12.0,0.096774
5,11,20.0,0.16129
6,5,3.0,0.024194
7,7,5.0,0.040323
8,1,,




Mean: 0.23965419501133786 
 Mean balanced: 0.1052498019409784


Unnamed: 0,true-10,pred-10,percent-10
0,478,478.0,1.0
1,52,77.0,0.527397
2,31,29.0,0.19863
3,33,3.0,0.020548
4,12,27.0,0.184932
5,5,3.0,0.020548
6,4,2.0,0.013699
7,3,2.0,0.013699
8,5,3.0,0.020548
9,1,,
