In [67]:
import pandas as pd

Read the processed data from the txt file

In [61]:
real_title = pd.read_csv("data/clean_real.txt", header=None)
real_title.columns = ["title"]
real_title["is real"] = 1
fake_title = pd.read_csv("data/clean_fake.txt", header=None)
fake_title.columns = ["title"]
fake_title["is real"] = 0

title = pd.concat([real_title, fake_title])

In [None]:
from sklearn.model_selection import train_test_split

In [79]:
# seperate the document and the label
X = title['title']
y = title['is real']

# split the training set and test set

# first split to 70% training set, 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# then split to 15% test set, 15% validation set
X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [80]:
X_train.shape

(2286,)

In [81]:
X_test.shape

(490,)

### Setup pipeline
- Vectorize document with tf-idf score
- training with decision tree model

In [148]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid

In [149]:
clf_pipe = Pipeline([('cv', CountVectorizer()), ('clf', DecisionTreeClassifier())])

### Find the estimated parameters which yield the best accuracy

In [154]:
def validation_score(test, predict):
    values = test.values
    correct = 0
    for i in range(len(values)):
        if(values[i] == predict[i]):
            correct += 1
    return correct/len(values)

param_grid = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': range(1, 1000)
}

accuracy = 0
for param in ParameterGrid(param_grid):
    clf_pipe.set_params(**param)
    clf_pipe.fit(X_train, y_train)
    y_pred = clf_pipe.predict(X_validate)
    # save if best
    current_score = validation_score(y_validate, y_pred)
    if current_score > accuracy:
        accuracy = current_score
        best_grid = param

clf_pipe.set_params(**best_grid)
clf_pipe.fit(X_train, y_train)
validation_score(y_test, clf_pipe.predict(X_test))

0.7551020408163265

In [166]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

ModuleNotFoundError: No module named 'graphviz'

In [155]:
clf_pipe['clf']

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [167]:
export_graphviz(clf_pipe['clf'], out_file="tree.dot", max_depth=2)

In [169]:
system("dot -Tpng tree.dot -o tree.png")

['/bin/bash: dot -Tpng tree.dot -o tree.png: command not found']