In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Load the pre-computed content-feature table from its pickle.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a file from a trusted source.
with open("./data/content_features_cv.pkl", "rb") as f:
    content_features_df = pickle.load(f)

# Remove the unusable feature columns in one pass (reason noted per column).
content_features_df = content_features_df.drop(
    columns=[
        "automatic_readability",  # due to NaN
        "proper_nouns",           # due to 0
        "conjunctions",           # due to 0
        "proper_nouns_rate",      # due to 0
        "conjunctions_rate",      # due to 0
    ]
)

# Discard rows whose median words-per-sentence is 0, printing the shape
# before and after so the number of removed rows is visible.
print(content_features_df.shape)
content_features_df = content_features_df.loc[
    content_features_df["words_per_sentence_median"] != 0
]
print(content_features_df.shape)

content_features_df.head()


(72134, 11)
(72122, 11)


Unnamed: 0,label,verbs_third_person,verbs_others,words_per_sentence_variance,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate
0,1,85,38,77.94887,13.0,60,0.3,3.633333,1.666667,1.416667,0.633333
1,1,0,1,24.5,4.5,2,0.5,1.0,0.0,0.0,0.5
2,1,3,4,124.333333,13.5,4,0.25,4.75,2.0,0.75,1.0
3,0,137,35,213.760973,19.0,62,0.451613,6.080645,2.403226,2.209677,0.564516
4,1,27,4,341.935897,25.0,13,0.384615,7.846154,3.538462,2.076923,0.307692


In [3]:
# Hold out 20% of the rows for evaluation.
# random_state is pinned so the split -- and therefore every score computed
# from it in the cells below -- is reproducible across kernel restarts and
# comparable between runs.
train_dataset, test_dataset = train_test_split(
    content_features_df,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)
print(train_dataset.shape)
print(test_dataset.shape)

# Split each partition into a feature matrix (every column except "label")
# and the target vector, both as plain NumPy arrays.
train_dataset_x = train_dataset.drop("label", axis=1).values
train_dataset_y = train_dataset["label"].values

test_dataset_x = test_dataset.drop("label", axis=1).values
test_dataset_y = test_dataset["label"].values


(57697, 11)
(14425, 11)


In [37]:
# SVM with an RBF kernel (default C) on standardized features.
svc_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", gamma="auto"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = svc_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7302599653379549


In [38]:
# SVM with a sigmoid kernel on standardized features.
svc_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="sigmoid", gamma="auto"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = svc_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.5566724436741768


In [39]:
# SVM with a polynomial kernel on standardized features.
svc_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="poly", gamma="auto"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = svc_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7020450606585789


In [4]:
# RBF-kernel SVM with the regularization parameter C set to 1.1.
svc_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", gamma="auto", C=1.1),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = svc_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7350433275563258


In [6]:
# RBF-kernel SVM with the regularization parameter C set to 1.5.
svc_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", gamma="auto", C=1.5),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = svc_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7353899480069324


In [5]:
# RBF-kernel SVM with the regularization parameter C set to 0.5.
svc_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", gamma="auto", C=0.5),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = svc_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7324090121317157


In [8]:
# RBF-kernel SVM with the regularization parameter C set to 0.1.
svc_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", gamma="auto", C=0.1),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = svc_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7247833622183709


In [40]:
# Gaussian naive Bayes baseline on standardized features.
nb_pipeline = make_pipeline(
    StandardScaler(),
    GaussianNB(),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = nb_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.5417677642980936


In [43]:
# k-nearest-neighbours with the library's default settings, on standardized features.
knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = knn_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7008665511265164


In [48]:
# Distance-weighted kNN over 1000 neighbours (default metric).
knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=1000, weights="distance"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = knn_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7773310225303293


In [51]:
# Distance-weighted kNN over 2000 neighbours using the "l1" metric.
knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=2000, weights="distance", metric="l1"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = knn_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7826689774696707


In [9]:
# Distance-weighted kNN over 3000 neighbours using the "l1" metric.
knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3000, weights="distance", metric="l1"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = knn_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.785580589254766


In [52]:
# Distance-weighted kNN over 1000 neighbours using the "l2" metric.
knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=1000, weights="distance", metric="l2"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = knn_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7773310225303293


In [10]:
# Distance-weighted kNN over 2000 neighbours using the "l2" metric.
# Tagged "best" by the notebook author.
# NOTE(review): the k=3000 / "l1" cell in this notebook recorded a marginally
# higher score -- confirm which configuration is actually intended as best.
knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=2000, weights="distance", metric="l2"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = knn_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7830155979202773


In [53]:
# Distance-weighted kNN over 1000 neighbours using the cosine metric.
knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=1000, weights="distance", metric="cosine"),
)
# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
score = knn_pipeline.fit(train_dataset_x, train_dataset_y).score(
    test_dataset_x, test_dataset_y
)
print(score)

0.7719237435008666
