In [93]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pickle
import joblib
import numpy as np

In [94]:
# import data
with open("./data/content_features_cv.pkl","rb") as f:
    content_features_df = pickle.load(f)
    
# drop nan values 
content_features_df = content_features_df.drop("smog", axis=1) # due to alot of NaN
content_features_df = content_features_df.drop("dale_chall", axis=1) # due to alot of NaN
content_features_df = content_features_df.dropna() # due to alot of NaN

# drop values with 0 words per sentence
print(content_features_df.shape)
content_features_df = content_features_df[content_features_df["words_per_sentence_median"] != 0]
print(content_features_df.shape)

content_features_df.head()


(64482, 11)
(64482, 11)


Unnamed: 0,label,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate,automatic_readability
0,1,85,38,13.0,60,0.3,3.633333,1.666667,1.416667,0.633333,8.968798
3,0,137,35,19.0,62,0.451613,6.080645,2.403226,2.209677,0.564516,13.495336
4,1,27,4,25.0,13,0.384615,7.846154,3.538462,2.076923,0.307692,17.923524
5,1,16,13,14.0,15,0.333333,5.533333,2.266667,1.066667,0.866667,15.275581
8,1,22,16,18.0,15,0.133333,5.2,1.533333,1.466667,1.066667,8.987693


In [95]:
train_dataset, test_dataset = train_test_split(content_features_df,test_size=0.2,train_size=0.8)
print(train_dataset.shape)
print(test_dataset.shape)


train_dataset_x = train_dataset.drop("label",axis=1).values
train_dataset_y = train_dataset["label"].values

test_dataset_x = test_dataset.drop("label",axis=1).values
test_dataset_y = test_dataset["label"].values


(51585, 11)
(12897, 11)


In [96]:
# output of best model
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000, weights='distance',metric="l2")) # best
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)
joblib.dump(knn_pipeline, './data/knn_pipeline_ari.joblib')
    

0.797859967434287


['./data/knn_pipeline_ari.joblib']

In [56]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf"))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7738233697759169


In [57]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="sigmoid"))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.5540823447313329


In [58]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="poly"))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7301698069318446


In [59]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=1.1))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7735132201287122


In [60]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=1.5))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7739784445995193


In [61]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=0.5))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7715747848336822


In [62]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=0.1))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7669225401256106


In [63]:
nb_pipeline = make_pipeline(StandardScaler(), GaussianNB())
nb_pipeline.fit(train_dataset_x,train_dataset_y)
score = nb_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.6033961386368923


In [64]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7475381871753121


In [65]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000,weights='distance'))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.8059238582616113


In [66]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(2000, weights='distance',metric="l1"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.8046057222609909


In [67]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(3000, weights='distance',metric="l1"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.8008063890827324


In [68]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000, weights='distance',metric="l2"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.8059238582616113


In [69]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(2000, weights='distance',metric="l2")) # best
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7997208653175157


In [70]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000, weights='distance',metric="cosine"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7992556408467085
