In [50]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from modules.tfidf_custom import TfIdfCustom
from modules.tfidf import TfIdf
from modules.word_stats import WordStats
from modules.similarity import Similarity
from models.knnreg import KnnReg
from modules.data import load_pickle
from modules.stats import score_distribution_plot, words_count_plot, tfidf_custom_score_plot, topic_similarity_score_plot
import pandas as pd
# from scipy.stats import binned_statistic
import sys
import argparse

pd.options.display.min_rows = 100
#pd.options.display.max_rows = set larger than dataset
pd.options.display.max_columns = 20
%matplotlib inline
mpl.rcParams["figure.figsize"] = 15,8

pickle_df = load_pickle("okt-5-43896c.p")

In [29]:
# Work on a copy so the loaded pickle_df stays untouched; re-running the notebook
# from this cell starts again from the loaded data without reloading the pickle.
df = pickle_df.copy()

In [30]:
# Remove noise: discard rows whose body is the "[removed]" placeholder, then
# renumber the remaining rows. reset_index() is called without drop=True, so the
# previous index is kept as a regular column.
df = df.loc[df["body"] != "[removed]"].reset_index()

In [31]:
# Topic-similarity feature (column "topic_similarity", per the inline note).
# NOTE(review): unlike the other scorer cells, the return value of score() is
# discarded here. That only works if Similarity.score adds the column to df in
# place - confirm, or assign the result back to df.
similarity = Similarity()
similarity.score(df) # "topic_similarity"

In [None]:
# Word-statistics features. df is rebound to whatever score() returns; the
# trailing note lists the columns this step is expected to provide.
word_stats = WordStats()
df = word_stats.score(df) # "words_count", "stop_words_count", "bad_words_count", "bad_words"

In [None]:
# Custom TF-IDF feature. df is rebound to whatever score() returns; the trailing
# note names the column this step is expected to provide.
tfidf_custom = TfIdfCustom()
df = tfidf_custom.score(df) # "tfidf_custom_score"

In [None]:
# TF-IDF feature. df is rebound to whatever score() returns; the trailing note
# names the column this step is expected to provide ("tfidf_score" is used as a
# model feature further down).
tfidf = TfIdf()
df = tfidf.score(df) # "tfidf_score"

In [32]:

# Spot-check: print one comment in full, then show it and its neighbours next to
# the computed feature columns and the label.
score_columns = ["topic_similarity", "tfidf_score", "words_count", "stop_words_count", "bad_words_count", "bad_words", "label"]

# Single row selected by index label; kept as a one-row frame for display.
inspect = df.loc[[1201]]
for text_column in ("submission", "body"):
    # to_string() avoids the truncation that the rich display applies to long text.
    print(inspect[[text_column]].to_string())
display(inspect[score_columns])

# A 16-row slice starting at idxr, with the text columns in front.
idxr = 1200
display(df[["body", "submission", *score_columns]][idxr:idxr + 16])

                                        submission
1201  What's the dumbest way you've gotten a scar?
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

Unnamed: 0,topic_similarity,tfidf_score,words_count,stop_words_count,bad_words_count,bad_words,label
1201,0.010885,8.612723,131.0,106.0,0.0,,1082.0


Unnamed: 0,body,submission,topic_similarity,tfidf_score,words_count,stop_words_count,bad_words_count,bad_words,label
1200,I have a white dot from where a wasp stung me ...,What's the dumbest way you've gotten a scar?,0.0,2.792818,8.0,7.0,0.0,,428.0
1201,I punched a pint glass one night when I was dr...,What's the dumbest way you've gotten a scar?,0.010885,8.612723,131.0,106.0,0.0,,1082.0
1202,My government stored 2700 tons of ammonium nit...,What's the dumbest way you've gotten a scar?,0.020518,5.322606,32.0,23.0,0.0,,461.0
1203,"I'm late to the party, but I have an actual st...",What's the dumbest way you've gotten a scar?,0.068577,12.46515,358.0,296.0,2.0,"ass,shit",89.0
1204,Bit myself. I saw something I didn't want to f...,What's the dumbest way you've gotten a scar?,0.048192,3.71511,35.0,19.0,0.0,,516.0
1205,Was probably 10 when I fought with my younger ...,What's the dumbest way you've gotten a scar?,0.025059,4.630866,26.0,25.0,1.0,snatch,240.0
1206,I always see landscapers using weed-whackers w...,What's the dumbest way you've gotten a scar?,0.018832,5.818649,47.0,28.0,1.0,bullshit,81.0
1207,Reaching into the oven to pull out a pie. My f...,What's the dumbest way you've gotten a scar?,0.044559,4.795856,31.0,33.0,0.0,,71.0
1208,I didn’t want to go to work one day when I was...,What's the dumbest way you've gotten a scar?,0.068785,4.627119,29.0,30.0,0.0,,232.0
1209,Tripped over a fucking football and destroyed ...,What's the dumbest way you've gotten a scar?,0.02863,4.066667,24.0,19.0,1.0,fuck,179.0


In [49]:
# Baseline classifier: predict the binned "label" from the engineered text
# features with Multinomial Naive Bayes, evaluated with stratified 5-fold
# cross-validation.
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
import numpy as np

feature_list = ["topic_similarity", "tfidf_score", "words_count", "stop_words_count", "bad_words_count"]

X = df[feature_list]
y = df[["label"]]  # 2-D on purpose: KBinsDiscretizer expects (n_samples, n_features)

# Bin all of y first. Available strategies:
# uniform  = All bins in each feature have identical widths.
# quantile = All bins in each feature have the same number of points.
# kmeans   = Values in each bin have the same nearest center of a 1D k-means cluster.
bins = 4
est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy="quantile")
est.fit(y)
# Reuse y's index so the row drops below remove the same rows from X and yt even
# when df does not carry a plain 0..n-1 index.
yt = pd.DataFrame(data=est.transform(y), columns=["y"], index=y.index)

# Undersample the lowest bin: drop its first `n_lowest_bin_dropped` rows (in row
# order, not at random).
# https://stackoverflow.com/questions/28556942/pandas-remove-rows-at-random-without-shuffling-dataset/28557333#28557333
# NOTE(review): in the recorded run this removed every bin-0 row (only bins 1-3
# remain in the value counts) - confirm that dropping the whole class is intended.
n_lowest_bin_dropped = 41000
y_filtered_drop = yt[yt["y"] == 0].iloc[:n_lowest_bin_dropped]
yt_filtered = yt.drop(index=y_filtered_drop.index)
X_filtered = X.drop(index=y_filtered_drop.index)

display(yt_filtered["y"].value_counts())

# A plain random split is a bad idea with data this skewed, so use stratified folds.
# https://stats.stackexchange.com/questions/131255/class-imbalance-in-supervised-machine-learning
# https://datascience.stackexchange.com/questions/32818/train-test-split-of-unbalanced-dataset-classification/32820#32820
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
# random_state is only meaningful with shuffle=True; recent scikit-learn raises a
# ValueError when it is set together with shuffle=False, so it is omitted here.
skf = StratifiedKFold(n_splits=5, shuffle=False)
balanced_accuracies = []
accuracies = []
fold_predictions = []
fold_true_values = []

# 1-D target: estimators and metrics expect shape (n_samples,), not a column vector.
y_filtered_1d = yt_filtered["y"]

for train_index, test_index in skf.split(X_filtered, y_filtered_1d):
    X_train, X_validate = X_filtered.iloc[train_index], X_filtered.iloc[test_index]
    y_train, y_true = y_filtered_1d.iloc[train_index], y_filtered_1d.iloc[test_index]

    # Don't cheat: fit the scaler on the training fold only.
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    model = MultinomialNB()

    # Train the model on the scaled training features (targets are left unscaled).
    model.fit(scaler.transform(X_train), y_train)

    # Validation values below the training minimum scale to < 0, which
    # MultinomialNB rejects; clamp them to 0 before predicting.
    X_validate_scaled = np.clip(scaler.transform(X_validate), 0.0, None)

    # Predict the validation fold.
    y_pred = model.predict(X_validate_scaled)

    # Original author's note: "At this point, the model can classify all 0
    # positively, but no more". In the recorded run nearly every prediction fell
    # into a single class.
    accuracy = metrics.accuracy_score(y_true, y_pred, normalize=True)
    # TODO: balanced - https://stackoverflow.com/questions/55838262/why-does-cross-validation-give-consistently-higher-scores-than-normal-fitting-an
    balanced_accuracy = metrics.balanced_accuracy_score(y_true, y_pred, adjusted=False)
    # TODO: https://stats.stackexchange.com/questions/99667/naive-bayes-with-unbalanced-classes

    accuracies.append(accuracy)
    balanced_accuracies.append(balanced_accuracy)
    fold_predictions.append(y_pred)
    fold_true_values.append(y_true.to_numpy())

# TODO: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html

# Join the per-fold arrays once instead of re-allocating with np.append per fold.
predictions = np.concatenate(fold_predictions)
true_values = np.concatenate(fold_true_values)

print("\n\nMean:", np.mean(accuracies))
print("\n\nMean balanced:", np.mean(balanced_accuracies))
results = pd.DataFrame(data={"pred": predictions, "true": true_values})
# Per-class counts of predicted vs. true labels; a class that never occurs on one
# side is reported as 0 rather than NaN.
counts = (
    pd.DataFrame(data={"pred": results["pred"].value_counts(), "true": results["true"].value_counts()})
    .fillna(0)
    .astype(int)
)
display(counts)


1.0    13372
2.0    11525
3.0    10620
Name: y, dtype: int64



Mean: 0.37691808466921095


Mean balanced: 0.33384902171611464


Unnamed: 0,pred,true
1.0,35441,13372
2.0,38,11525
3.0,38,10620
