In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from modules.tfidf_custom import TfIdfCustom
from modules.tfidf import TfIdf
from modules.word_stats import WordStats
from modules.similarity import Similarity
from modules.bag_of_words import BagOfWords
from modules.bigram_of_words import BigramOfWords
from models.knnreg import KnnReg
from modules.data import load_pickle
from modules.stats import score_distribution_plot, words_count_plot, tfidf_custom_score_plot, topic_similarity_score_plot
import pandas as pd
# from scipy.stats import binned_statistic
import sys
import argparse

pd.options.display.min_rows = 100
#pd.options.display.max_rows = set larger than dataset
pd.options.display.max_columns = 20
%matplotlib inline
mpl.rcParams["figure.figsize"] = 15,8

pickle_df = load_pickle("okt-5-10728c.p")
#pickle_df = load_pickle("okt-5-43896c.p")

In [4]:
# Columns carried by the pickle that this notebook never reads.
unused_columns = [
    "reichenbach_tense",
    "emoticons",
    "aspectual_class",
    "common_word",
    "letter_count",
    "tfidf_custom_score",
    "length",
    "sentences",
]

# Build the working frame without touching pickle_df:
#   1. drop the unused columns (returns a new frame),
#   2. discard noise rows whose body is the "[removed]" placeholder,
#   3. renumber the rows; reset_index() keeps the previous index as an
#      "index" column.
df = (
    pickle_df
    .drop(columns=unused_columns)
    .loc[lambda frame: frame["body"] != "[removed]"]
    .reset_index()
)

In [5]:
# Run the project's WordStats scorer over the frame; the frame returned by
# score() replaces df.
# NOTE(review): the trailing comment lists the columns score() is expected to
# add — confirm against modules.word_stats.
word_stats = WordStats()
df = word_stats.score(df) # "words_count", "stop_words_count", "bad_words_count", "bad_words"

In [6]:
# Run the project's Similarity scorer. The returned frame is assigned back to
# df, exactly like the other scorer cells, so the "topic_similarity" feature
# used by the model does not depend on score() mutating its argument in place.
similarity = Similarity()
df = similarity.score(df) # "topic_similarity"

# Show a small preview rather than the whole frame.
df.head(10)

Unnamed: 0,index,body,subreddit,submission,tfidf_score,words_count,stop_words_count,bad_words_count,bad_words,topic_similarity,label
0,0.0,i guess he did outlive his life expectancy by ...,science,Physicist Stephen Hawking dies aged 76,0.0,7.0,4.0,0.0,,0.000000,8256.0
1,1.0,Woah. One of those deaths you always saw comin...,science,Physicist Stephen Hawking dies aged 76,0.0,10.0,5.0,0.0,,0.000000,9470.0
2,2.0,“We have this one life to appreciate the grand...,science,Physicist Stephen Hawking dies aged 76,0.0,37.0,30.0,0.0,,0.065305,12774.0
3,3.0,“I have lived with the prospect of an early de...,science,Physicist Stephen Hawking dies aged 76,0.0,22.0,24.0,0.0,,0.000000,27266.0
4,4.0,Remember to look up at the stars and not down ...,science,Physicist Stephen Hawking dies aged 76,0.0,30.0,32.0,0.0,,0.000000,9766.0
5,5.0,An absolutely monumental mind. The world mourn...,science,Physicist Stephen Hawking dies aged 76,0.0,9.0,2.0,0.0,,0.000000,4173.0
6,6.0,A sad day for scientists and science enthusias...,science,Physicist Stephen Hawking dies aged 76,0.0,39.0,25.0,0.0,,0.026171,12466.0
7,7.0,"Very sad. Let's celebrate his life though, he ...",science,Physicist Stephen Hawking dies aged 76,0.0,16.0,5.0,0.0,,0.000000,4184.0
8,9.0,"pour one out for the OG wheelin and dealin, ru...",science,Physicist Stephen Hawking dies aged 76,0.0,19.0,9.0,0.0,,0.092231,3717.0
9,10.0,"""The greatest enemy of knowledge is not ignora...",science,Physicist Stephen Hawking dies aged 76,0.0,12.0,8.0,0.0,,0.000000,10231.0


In [7]:
# Run the project's TfIdf scorer over the frame; the frame returned by score()
# replaces df.
# NOTE(review): expected to (re)compute the "tfidf_score" column — confirm
# against modules.tfidf.
tfidf = TfIdf()
df = tfidf.score(df) # "tfidf_score"

In [8]:
# Run the project's BigramOfWords scorer over the frame; the frame returned by
# score() replaces df.
# NOTE(review): expected to add a "bigram_of_words" column — confirm against
# modules.bigram_of_words. That column is not part of feature_list below.
bigram_of_words = BigramOfWords()
df = bigram_of_words.score(df) # "bigram_of_words"

In [9]:

#inspect = df.loc[[1201]]
#print(inspect[["submission"]].to_string())
#print(inspect[["body"]].to_string())
#display(inspect[["topic_similarity", "tfidf_score", "words_count", "stop_words_count", "bad_words_count", "bad_words", "label"]])

#idxr = 1200
#display(df[["body", "submission", "topic_similarity", "tfidf_score", "words_count", "stop_words_count", "bad_words_count", "bad_words", "label"]][idxr:idxr+16])

In [26]:
# Import the Naive Bayes models, preprocessing helpers, CV splitters and metrics
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
import numpy as np

# Rebalance the data by removing rows from the lowest score bin, which dominates
# the dataset, while keeping every row of the higher bins.
# The removal is deterministic: the FIRST rows of bin 0 (in frame order) are
# dropped, not a random sample.
# https://stackoverflow.com/questions/28556942/pandas-remove-rows-at-random-without-shuffling-dataset/28557333#28557333

feature_list = ["topic_similarity", "tfidf_score", "words_count", "stop_words_count", "bad_words_count"]

X = df[feature_list]
y = df[["label"]]  # kept 2-D: KBinsDiscretizer expects (n_samples, n_features)

# Bin all of y first.
# uniform  = all bins in each feature have identical widths.
# quantile = all bins in each feature have the same number of points.
# kmeans   = values in each bin have the same nearest center of a 1D k-means cluster.
bins = 5
est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy="uniform")
y_binned = est.fit_transform(y)

# Reuse y's index so that yt and X share row labels; the index-based drops
# below then remove the same rows from both frames whatever df's index is.
yt = pd.DataFrame(data=y_binned, columns=["y"], index=y.index)

display(y_binned)

# Upper bound on how many bin-0 rows are removed. If bin 0 holds fewer rows than
# this, the whole bin disappears from the training data.
max_lowest_bin_drops = 9900
y_filtered_drop = yt[yt.y == 0][:max_lowest_bin_drops]
yt_filtered = yt.drop(index=y_filtered_drop.index)
X_filtered = X.drop(index=y_filtered_drop.index)

# Class distribution that the cross-validation below will see.
display(yt_filtered["y"].value_counts())


#df["est_label"] = yt
#display(pd.DataFrame(data=yt, columns=["y"])["y"].value_counts())
#display(df[[*feature_list, "label", "est_label"]])
#display(df["label"])


# Bad idea to do random split with so skewed data   
#data = scaler.transform(data)
#X_train, X_validate, y_train, y_validate = train_test_split(X, yt, test_size=0.7) 


# https://stats.stackexchange.com/questions/131255/class-imbalance-in-supervised-machine-learning
# https://datascience.stackexchange.com/questions/32818/train-test-split-of-unbalanced-dataset-classification/32820#32820
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
# Stratified K-fold keeps the skewed class proportions similar in every fold.
# NOTE(review): in the recorded run the rarest classes have only 4 and 6 rows,
# far fewer than n_splits=50, so most validation folds contain none of them —
# consider a smaller n_splits.
skf = StratifiedKFold(n_splits=50, random_state=42, shuffle=True)

# 1-D label vector: scikit-learn estimators and metrics expect shape
# (n_samples,); a single-column DataFrame triggers DataConversionWarning in fit().
y_all = yt_filtered.to_numpy().ravel()

balanced_accuracies = []
accuracies = []
fold_predictions = []
fold_true_values = []

for train_index, test_index in skf.split(X_filtered, y_all):
    X_train, X_validate = X_filtered.iloc[train_index], X_filtered.iloc[test_index]
    y_train, y_true = y_all[train_index], y_all[test_index]

    # Don't cheat: fit the scaler on the training fold only, so no statistics
    # of the validation fold leak into the model.
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    # Train on the scaled features; labels are left untouched.
    model = MultinomialNB()
    model.fit(scaler.transform(X_train), y_train)

    # Validation rows below the training minimum scale to negative values,
    # which MultinomialNB is not defined for (fit() rejects them), so clamp
    # them to 0 before predicting.
    X_validate_scaled = np.clip(scaler.transform(X_validate), 0, None)
    y_pred = model.predict(X_validate_scaled)

    accuracy = metrics.accuracy_score(y_true, y_pred, normalize=True)
    # TODO: balanced - https://stackoverflow.com/questions/55838262/why-does-cross-validation-give-consistently-higher-scores-than-normal-fitting-an
    balanced_accuracy = metrics.balanced_accuracy_score(y_true, y_pred, adjusted=False)
    # TODO: https://stats.stackexchange.com/questions/99667/naive-bayes-with-unbalanced-classes

    accuracies.append(accuracy)
    balanced_accuracies.append(balanced_accuracy)
    fold_predictions.append(y_pred)
    fold_true_values.append(y_true)

# Join the out-of-fold results once instead of growing arrays inside the loop.
predictions = np.concatenate(fold_predictions)
true_values = np.concatenate(fold_true_values)

# TODO: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html

print("\n\nMean:", np.mean(accuracies))
print("\n\nMean balanced:", np.mean(balanced_accuracies))
# Per-fold balanced accuracy only averages recall over the classes that happen
# to be present in that fold, so its mean is optimistic when folds are tiny.
# The pooled figure averages recall over every class in the filtered data.
print("\n\nPooled balanced:", metrics.balanced_accuracy_score(true_values, predictions))

# Predicted versus true class frequencies over all out-of-fold predictions.
results = pd.DataFrame(data={"pred": predictions, "true": true_values})
counts = pd.DataFrame(data={"pred": results["pred"].value_counts(), "true": results["true"].value_counts()})
display(counts)


array([[0.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

1.0    171
2.0     49
3.0      6
4.0      4
Name: y, dtype: int64



Mean: 0.7440000000000001


Mean balanced: 0.47


Unnamed: 0,pred,true
1.0,230.0,171
2.0,,49
3.0,,6
4.0,,4
