In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pickle
import joblib

In [2]:
# import data
with open("./data/content_features_cv.pkl","rb") as f:
    content_features_df = pickle.load(f)
    
content_features_df = content_features_df.drop("smog", axis=1) # due to NaN
content_features_df = content_features_df.drop("dale_chall", axis=1) # due to NaN
content_features_df = content_features_df.drop("automatic_readability", axis=1) # due to NaN


# drop values with 0 words per sentence
print(content_features_df.shape)
content_features_df = content_features_df[content_features_df["words_per_sentence_median"] != 0]
print(content_features_df.shape)

content_features_df.head()


(72134, 10)
(72122, 10)


Unnamed: 0,label,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate
0,1,85,38,13.0,60,0.3,3.633333,1.666667,1.416667,0.633333
1,1,0,1,4.5,2,0.5,1.0,0.0,0.0,0.5
2,1,3,4,13.5,4,0.25,4.75,2.0,0.75,1.0
3,0,137,35,19.0,62,0.451613,6.080645,2.403226,2.209677,0.564516
4,1,27,4,25.0,13,0.384615,7.846154,3.538462,2.076923,0.307692


In [3]:
train_dataset, test_dataset = train_test_split(content_features_df,test_size=0.2,train_size=0.8)
print(train_dataset.shape)
print(test_dataset.shape)
    


train_dataset_x = train_dataset.drop("label",axis=1).values
train_dataset_y = train_dataset["label"].values

test_dataset_x = test_dataset.drop("label",axis=1).values
test_dataset_y = test_dataset["label"].values

with open("./data/knn_train.pkl","wb") as f:
    pickle.dump(train_dataset,f)
with open("./data/knn_test.pkl","wb") as f:
    pickle.dump(test_dataset,f)

(57697, 10)
(14425, 10)


In [4]:
# output of best model
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000, weights='distance',metric="l2")) # best
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)
joblib.dump(knn_pipeline, './data/knn_pipeline.joblib')
    

0.7771923743500867


['./data/knn_pipeline.joblib']

In [5]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf"))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7115424610051994


In [6]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="sigmoid"))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.5448180242634315


In [7]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="poly"))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.6802772963604853


In [8]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=1.1))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.711473136915078


In [9]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=1.5))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7118890814558059


In [10]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=0.5))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7100173310225303


In [11]:
svc_pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto",kernel="rbf",C=0.1))
svc_pipeline.fit(train_dataset_x,train_dataset_y)
score = svc_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7083535528596188


In [12]:
nb_pipeline = make_pipeline(StandardScaler(), GaussianNB())
nb_pipeline.fit(train_dataset_x,train_dataset_y)
score = nb_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.5807279029462739


In [13]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.6955979202772964


In [14]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000,weights='distance'))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7771923743500867


In [15]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(2000, weights='distance',metric="l1"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7744887348353553


In [16]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(3000, weights='distance',metric="l1"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7718544194107453


In [17]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000, weights='distance',metric="l2"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7771923743500867


In [18]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(2000, weights='distance',metric="l2")) # best
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7728249566724437


In [19]:
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(1000, weights='distance',metric="cosine"))
knn_pipeline.fit(train_dataset_x,train_dataset_y)
score = knn_pipeline.score(test_dataset_x, test_dataset_y)
print(score)

0.7688734835355286
