In [3]:
%matplotlib inline 
import pandas as pd
import numpy as np
from keras.preprocessing import text, sequence
from sklearn import preprocessing , metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
np.random.seed(0)

In [20]:
df = pd.read_csv("../data/normalized_texts_labels.csv",encoding="utf-8")
df = df[["normalized_title","normalized_text","fake"]]
df.columns = ["titles","texts","labels"]
print("# of NaN of texts:" + str(df["texts"].isnull().sum()))
print("# of NaN of labels:" + str(df["labels"].isnull().sum()))
print("# of NaN of titles:" + str(df["titles"].isnull().sum()))
df = df.dropna()
print("dataset size:" + str(df.shape))

# of NaN of texts:109
# of NaN of labels:0
# of NaN of titles:7
dataset size:(26484, 3)


In [8]:
label_encoder = preprocessing.LabelBinarizer()
label_encoder.fit(df["labels"])
labels_encoded = label_encoder.transform(df["labels"])
y = df["labels"].values
X = df["texts"].values

# TFIDF

In [6]:
skf = StratifiedKFold(n_splits=5,random_state=42)

In [9]:
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    feature_extractor_tfv.fit(X_train)
    X_train, X_test = feature_extractor_tfv.transform(X_train), \
                      feature_extractor_tfv.transform(X_test)
    clf = LogisticRegression(C=4, dual=True)
    clf.fit(X=X_train,y=y_train)
    val_pred_prob = clf.predict(X=X_test)
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = clf.predict(X=X_train)
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,0.99797,0.998159,0.997923,0.997499,0.997782,0.997867,0.00022
val_acc,0.929219,0.900887,0.929772,0.936178,0.933157,0.925843,0.012727
train_auc,0.997743,0.997938,0.997612,0.997181,0.997474,0.99759,0.000255
val_auc,0.919292,0.899691,0.92484,0.923617,0.928062,0.919101,0.010104


# TFIDF_NR

In [10]:
def pr(x, y_i, y):
    p = x[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

def get_mdl(x, y):
    #y = y.values
    r = np.log(pr(x, 1,y) / pr(x, 0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [11]:
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    ###
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    feature_extractor_tfv.fit(X_train)
    X_train, X_test = feature_extractor_tfv.transform(X_train), \
                      feature_extractor_tfv.transform(X_test)
    ###
    m, r = get_mdl(X_train, y_train)
    val_pred_prob = m.predict_proba(X_test.multiply(r))[:, 1]
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = m.predict_proba(X_train.multiply(r))[:, 1]
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,0.98296,0.983103,0.984377,0.978714,0.982679,0.982367,0.001917
val_acc,0.939034,0.919766,0.935813,0.931835,0.939577,0.933205,0.007264
train_auc,0.998756,0.998583,0.998688,0.998119,0.998543,0.998538,0.000222
val_auc,0.988479,0.972682,0.9836,0.993213,0.985393,0.984674,0.006825


# Count Vector

In [12]:
skf = StratifiedKFold(n_splits=5,random_state=42)

In [14]:
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    ###
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=None,
            ngram_range=(1, 2), stop_words = 'english')
    feature_extractor_ctv.fit(X_train)
    X_train, X_test = feature_extractor_ctv.transform(X_train), \
                      feature_extractor_ctv.transform(X_test)
    ###
    clf = LogisticRegression(C=4, dual=True)
    clf.fit(X=X_train,y=y_train)
    val_pred_prob = clf.predict(X=X_test)
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = clf.predict(X=X_train)
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,0.999953,0.999953,0.999953,0.999953,1.0,0.999962,1.9e-05
val_acc,0.921857,0.870493,0.899188,0.939955,0.917107,0.90972,0.023519
train_auc,0.99996,0.99996,0.99996,0.99996,1.0,0.999968,1.6e-05
val_auc,0.91949,0.876694,0.903798,0.933793,0.918319,0.910419,0.019351


# Count vector + NB

In [16]:
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    ###
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=None,
            ngram_range=(1, 2), stop_words = 'english')
    feature_extractor_ctv.fit(X_train)
    X_train, X_test = feature_extractor_ctv.transform(X_train), \
                      feature_extractor_ctv.transform(X_test)
    ###
    m, r = get_mdl(X_train, y_train)
    val_pred_prob = m.predict_proba(X_test.multiply(r))[:, 1]
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = m.predict_proba(X_train.multiply(r))[:, 1]
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,0.999953,0.999953,0.999906,0.999906,0.999953,0.999934,2.312148e-05
val_acc,0.939034,0.901265,0.930149,0.942787,0.938822,0.930411,0.01515138
train_auc,1.0,1.0,1.0,1.0,1.0,1.0,4.522032e-09
val_auc,0.978838,0.963263,0.977127,0.985753,0.980592,0.977115,0.007503807


# texts = body + titles

In [23]:
df = pd.read_csv("../data/normalized_texts_labels.csv",encoding="utf-8")
df = df[["normalized_title","normalized_text","fake"]]
df.columns = ["titles","texts","labels"]
df["texts"] = df["texts"] + df["titles"]
df = df[["texts","labels"]]
print("# of NaN of texts:" + str(df["texts"].isnull().sum()))
print("# of NaN of labels:" + str(df["labels"].isnull().sum()))
df = df.dropna()
print("dataset size:" + str(df.shape))
label_encoder = preprocessing.LabelBinarizer()
label_encoder.fit(df["labels"])
labels_encoded = label_encoder.transform(df["labels"])
y = df["labels"].values
X = df["texts"].values

# of NaN of texts:116
# of NaN of labels:0
dataset size:(26484, 2)


In [24]:
# TFIDF
skf = StratifiedKFold(n_splits=5,random_state=42)
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    feature_extractor_tfv.fit(X_train)
    X_train, X_test = feature_extractor_tfv.transform(X_train), \
                      feature_extractor_tfv.transform(X_test)
    clf = LogisticRegression(C=4, dual=True)
    clf.fit(X=X_train,y=y_train)
    val_pred_prob = clf.predict(X=X_test)
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = clf.predict(X=X_train)
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,0.998018,0.998395,0.998159,0.997499,0.997687,0.997952,0.000322
val_acc,0.931106,0.898999,0.926185,0.932402,0.930891,0.923917,0.012636
train_auc,0.99771,0.998137,0.997866,0.997163,0.997395,0.997654,0.000343
val_auc,0.921539,0.896496,0.921088,0.919195,0.925351,0.916734,0.010314


In [25]:
# TFIDF + PR
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    ###
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    feature_extractor_tfv.fit(X_train)
    X_train, X_test = feature_extractor_tfv.transform(X_train), \
                      feature_extractor_tfv.transform(X_test)
    ###
    m, r = get_mdl(X_train, y_train)
    val_pred_prob = m.predict_proba(X_test.multiply(r))[:, 1]
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = m.predict_proba(X_train.multiply(r))[:, 1]
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,0.981922,0.983056,0.983764,0.977865,0.981924,0.981706,0.002045
val_acc,0.932427,0.909194,0.932603,0.92636,0.93429,0.926975,0.009289
train_auc,0.998521,0.998524,0.998582,0.997956,0.998406,0.998398,0.000228
val_auc,0.987736,0.968155,0.982117,0.992946,0.984003,0.982991,0.008289


In [26]:
# Count Vector
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    ###
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=None,
            ngram_range=(1, 2), stop_words = 'english')
    feature_extractor_ctv.fit(X_train)
    X_train, X_test = feature_extractor_ctv.transform(X_train), \
                      feature_extractor_ctv.transform(X_test)
    ###
    clf = LogisticRegression(C=4, dual=True)
    clf.fit(X=X_train,y=y_train)
    val_pred_prob = clf.predict(X=X_test)
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = clf.predict(X=X_train)
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,1.0,1.0,1.0,1.0,1.0,1.0,0.0
val_acc,0.926954,0.871437,0.8973,0.942598,0.919562,0.91157,0.02481
train_auc,1.0,1.0,1.0,1.0,1.0,1.0,0.0
val_auc,0.924151,0.877636,0.902134,0.937114,0.920388,0.912284,0.020623


In [27]:
# Count Vector + NB
scores = {"train_acc":[],"val_acc":[],"train_auc":[],"val_auc":[]}
i = 0
for train_index, test_index in skf.split(X, y):
    print("CV round %d..." % i)
    i += 1
    ###
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    feature_extractor_ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=None,
            ngram_range=(1, 2), stop_words = 'english')
    feature_extractor_ctv.fit(X_train)
    X_train, X_test = feature_extractor_ctv.transform(X_train), \
                      feature_extractor_ctv.transform(X_test)
    ###
    m, r = get_mdl(X_train, y_train)
    val_pred_prob = m.predict_proba(X_test.multiply(r))[:, 1]
    scores["val_acc"].append(metrics.accuracy_score(y_true=y_test,y_pred=(val_pred_prob>0.5)))
    scores["val_auc"].append(metrics.roc_auc_score(y_test,val_pred_prob))
    train_pred_prob = m.predict_proba(X_train.multiply(r))[:, 1]
    scores["train_acc"].append(metrics.accuracy_score(y_true=y_train,y_pred=(train_pred_prob>0.5)))
    scores["train_auc"].append(metrics.roc_auc_score(y_train,train_pred_prob))
df_scores = pd.DataFrame(scores)
df_scores.index.name = "CV round"
df_scores = df_scores.T
df_scores["mean"] = df_scores.mean(axis=1)
df_scores["std"] = df_scores.std(axis=1)
df_scores

CV round 0...
CV round 1...
CV round 2...
CV round 3...
CV round 4...


CV round,0,1,2,3,4,mean,std
train_acc,1.0,1.0,1.0,1.0,1.0,1.0,0.0
val_acc,0.937901,0.8973,0.929583,0.943165,0.936934,0.928976,0.01642
train_auc,1.0,1.0,1.0,1.0,1.0,1.0,0.0
val_auc,0.97853,0.95979,0.97682,0.985844,0.979595,0.976116,0.008713


In [None]:
# Winner : TFIDF + NR