In [1]:
# Import libs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from numpy import mean
from numpy import std
from math import sqrt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import math

# Load pickle
from parseit.data import load_pickle
pickle_with_other_features = load_pickle(f"data-16k-dec-3-other-features.p")

In [3]:
df = pickle_with_other_features.copy()

# Define parameters
k = 50
bins = 10
test_size = 0.3
X = df[["top-cos-sim", "cos-sim", "tfidf-mean", "wc", "sw", "bw", "smil+", "smil-", "smil&", "nam", "lnk"]]
#X = dfwf[["top-cos-sim", "wc", "bw", "smil+"]]
y = df[["label"]]


# Bin all of y first
# uniform = All bins in each feature have identical widths.
# quantile = All bins in each feature have the same number of points.
# kmeans = Values in each bin have the same nearest center of a 1D k-means cluster.
#est = KBinsDiscretizer(n_bins=bins, encode="ordinal", strategy="uniform") #
#est.fit(y)
#y = pd.DataFrame(data=est.transform(y), index=y.index)

#drop = y[y == 0][100:]
#y = y.drop(index=drop.index)
#X = X.drop(index=drop.index)
#display(y)

X_unscaled_train, X_unscaled_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

# All variables must be in the same scale: normalization or min-max-scaler
# How to scale so that each distance is meaningful: https://medium.com/analytics-vidhya/why-is-scaling-required-in-knn-and-k-means-8129e4d88ed7
scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler.fit(X_unscaled_train)
X_train = scaler.transform(X_unscaled_train)


# Try with SKF
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
knn = KNeighborsClassifier(n_neighbors=k)
n_scores = cross_val_score(knn, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score="raise")
# report model performance
print("Accuracy (SFK): %.2f%% (%.2f%%)\n" % (mean(n_scores)*100, std(n_scores)*100))


# Try without SKF
knn = KNeighborsClassifier(n_neighbors=k)
clf = knn.fit(X_train, y_train)
y_pred = clf.predict(scaler.transform(X_unscaled_valid))
#print(y_pred)
accuracy = metrics.accuracy_score(y_valid, y_pred, normalize=False)
accuracy_norm = metrics.accuracy_score(y_valid, y_pred, normalize=True)
print(f'Accuracy: {"%.2f" % (accuracy_norm * 100)}% ({accuracy} comments of {math.floor(len(y.index)*test_size)}) for k={k}\n')

# Try with LinearSVC
svc = LinearSVC(random_state=0, tol=1e-05)
clf = svc.fit(X_train, y_train)
# Scale X_valid after fitting the model: https://datascience.stackexchange.com/questions/38395/standardscaler-before-and-after-splitting-data
y_pred = clf.predict(scaler.transform(X_unscaled_valid))
accuracy = metrics.accuracy_score(y_valid, y_pred, normalize=False)
accuracy_norm = metrics.accuracy_score(y_valid, y_pred, normalize=True)
print(f'Linear SVC Accuracy: {"%.2f" % (accuracy_norm * 100)}% ({accuracy} comments of {math.floor(len(y.index)*test_size)})')

# Try different ranges for k
k_range = range(1, k, math.floor(k/10))
for kk in k_range:
    knn = KNeighborsClassifier(n_neighbors=kk)
    clf = knn.fit(X_train, y_train)
    y_pred = clf.predict(scaler.transform(X_unscaled_valid))
    accuracy = metrics.accuracy_score(y_valid, y_pred, normalize=False)
    accuracy_norm = metrics.accuracy_score(y_valid, y_pred, normalize=True)
    print(f'Accuracy: {"%.2f" % (accuracy_norm * 100)}% ({accuracy} comments of {math.floor(len(y.index)*test_size)}) for k={kk}')
    


display(y.value_counts())

Accuracy (SFK): 14.56% (0.36%)

Accuracy: 15.24% (1991 comments of 13066) for k=50

[   1    2 1257 ...    1    1    1]
Linear SVC Accuracy: 16.07% (2100 comments of 13066)
Accuracy: 6.30% (823 comments of 13066) for k=1
Accuracy: 12.44% (1626 comments of 13066) for k=6
Accuracy: 13.95% (1823 comments of 13066) for k=11
Accuracy: 14.31% (1870 comments of 13066) for k=16
Accuracy: 15.01% (1961 comments of 13066) for k=21
Accuracy: 15.00% (1960 comments of 13066) for k=26
Accuracy: 15.13% (1977 comments of 13066) for k=31
Accuracy: 14.97% (1956 comments of 13066) for k=36
Accuracy: 14.85% (1940 comments of 13066) for k=41
Accuracy: 15.08% (1970 comments of 13066) for k=46


label
 1       6451
 2       6362
 3       3959
 4       1672
 5       1615
         ... 
 7375       1
 7379       1
 7381       1
 7396       1
-92         1
Length: 4838, dtype: int64