In [1]:
import pickle as pickle
import pandas as pd
import numpy as np
from sklearn import decomposition, ensemble
import nltk
from nltk.stem.porter import *
# Fetch only the WordNet corpus needed by WordNetLemmatizer. The argument-less
# nltk.download() call was removed: it opens NLTK's interactive downloader and
# waits for the user, which stalls an unattended "Restart & Run All".
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import scipy.sparse as sp
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import random
from sklearn import preprocessing
#from tensorflow import keras
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
[nltk_data] Downloading package wordnet to C:\Users\Andrea
[nltk_data]     Blasioli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ModuleNotFoundError: No module named 'keras'

# 1 - Data Upload

In [23]:
# Load the labelled articles used for training/validation.
data_training = pd.read_csv("fake_or_real_news_training.csv")
# Keep only rows carrying one of the two expected labels. The index is rebuilt
# afterwards so it is contiguous (0..n-1): later cells join this frame with
# positionally indexed feature matrices, and a gapped index would pair rows
# with the wrong features or drop them.
data_training = (
    data_training.loc[data_training["label"].isin(["FAKE", "REAL"])]
    .reset_index(drop=True)
)
# Load the articles to be scored by the final model.
data_testing = pd.read_csv("fake_or_real_news_test.csv")

# 2 - Data Inspection

In [24]:
# Switch between a quick overview (shape, first rows, class balance) and the
# column summary produced by DataFrame.info().
overview = True
if overview:
    print("--Shape--")
    print(data_training.shape)
    print("--Head--")
    print(data_training.head(5))
    print("--Class Balance--")
    print(data_training["label"].value_counts())
else:
    # info() prints its own report and returns None, so wrapping it in print()
    # only appended a stray "None" line.
    data_training.info()

--Shape--
(3966, 6)
--Head--
      ID                                              title  \
0   8476                       You Can Smell Hillary’s Fear   
1  10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2   3608        Kerry to go to Paris in gesture of sympathy   
3  10142  Bernie supporters on Twitter erupt in anger ag...   
4    875   The Battle of New York: Why This Primary Matters   

                                                text label   X1   X2  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  NaN  NaN  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  NaN  NaN  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  NaN  NaN  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  NaN  NaN  
4  It's primary day in New York and front-runners...  REAL  NaN  NaN  
--Class Balance--
REAL    1990
FAKE    1976
Name: label, dtype: int64


# 3 - NLP Code and ML Algorithms

# a - Functions

In [25]:
def bagofwords(df, variable, step, vectorizer = None, technique=None, verbose=False):
    """
    Apply the standard techniques to compute a bag of words from the corpus

    :param df: DataFrame holding the documents
    :param variable: name of the column of df that contains the text
    :param step: "training" fits a new vectorizer on the column; any other value
                 only transforms the column with the vectorizer passed in
    :param vectorizer: already fitted vectorizer, required when step is not "training"
    :param technique: "basic" (CountVectorizer) or "tfidf" (TfidfVectorizer),
                      required when step is "training"
    :param verbose: when training, print the matrix shape and up to 50 random features
    :return: (matrix, fitted vectorizer) when step is "training", otherwise the matrix
    :raises ValueError: unknown technique while training, or missing vectorizer otherwise
    """
    corpus = df[variable].values

    if step != "training":
        if vectorizer is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("A fitted vectorizer is required when step is not 'training'")
        return vectorizer.transform(corpus)

    if technique == "basic":
        vect = CountVectorizer(stop_words="english")
    elif technique == "tfidf":
        vect = TfidfVectorizer(stop_words='english', max_df=0.7)
    else:
        # Previously this branch only printed a message and then failed later on
        # an unbound `vect`; raise immediately with the offending value instead.
        raise ValueError("Choose valid technique ('basic' or 'tfidf'), got {0!r}".format(technique))
    train_counts = vect.fit_transform(corpus)

    if verbose:
        print("--" + variable + "--")
        print(train_counts.shape)
        print("--Sample Features--")
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer get_feature_names_out() and fall back for older versions.
        names_getter = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
        # random.sample needs a real sequence, and plain str keeps the printout readable.
        feature_names = [str(name) for name in names_getter()]
        # Never ask for more samples than there are features (small corpora).
        print(random.sample(feature_names, min(50, len(feature_names))))
    return train_counts, vect

def train_test(df, threshold, verbose=False, random_state=42):
    """
    Split the dataset between training and testing

    :param df: data to split
    :param threshold: forwarded to train_test_split as test_size
    :param verbose: print both shapes and the first rows of the training part
    :param random_state: seed forwarded to train_test_split; defaults to 42,
                         the value that used to be hard-coded
    :return: (training part, testing part)
    """
    tr, te = train_test_split(df, test_size = threshold, random_state = random_state)
    if verbose:
        print(tr.shape)
        print(te.shape)
        print(tr.head(5))
    return tr, te

def use_title_and_text(df):
    """
    Combine the information of the title and text variables. Every space-separated
    token of the title gets the prefix "title_" so that words coming from the title
    stay distinguishable from words of the body; the result is
    "<prefixed title>: <text>" stored in the column "text_and_title".

    :param df: DataFrame with the columns "title" and "text"
    :return: a new DataFrame with the extra column "text_and_title"; the frame
             passed in is left untouched, which also avoids pandas'
             SettingWithCopyWarning when it is a filtered slice
    """
    # Missing values become empty strings instead of crashing on float.split().
    titles = df["title"].fillna("").astype(str)
    texts = df["text"].fillna("").astype(str)
    # Plain Python string handling replaces np.core.defchararray.add, which
    # lives in a private NumPy namespace that is deprecated in NumPy 2.
    combined = [
        " ".join("title_" + word for word in title.split(" ")) + ": " + text
        for title, text in zip(titles, texts)
    ]
    return df.assign(text_and_title=combined)

def stem_lem(element):
    """
    Lemmatize every space-separated token of a string with WordNetLemmatizer

    :param element: text to process; tokens are obtained with split(" ")
    :return: the lemmatized tokens joined back together with single spaces
    """
    lemmatizer = WordNetLemmatizer()
    tokens = element.split(" ")
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)
        

# b - Assumptions

In [26]:
# Vectorisation scheme handed to bagofwords(): "tfidf" or "basic".
techniques_to_use = "tfidf"
# True: vectorise the combined "text_and_title" column instead of "text".
use_title = True
# True: lemmatize title and text (only read inside the engineered-features branch).
lemmatize = True
# True: add the character/word count columns to both data sets.
engineered_features = False

# c - Feature Engineering

In [27]:
if engineered_features:
    # Simple per-string counters, applied to "text" first and "title" second so
    # the new columns are created in the same order as before.
    counters = [
        ("char_count", len),
        ("word_count", lambda s: len(s.split())),
    ]
    for frame in (data_training, data_testing):
        for prefix, counter in counters:
            for source in ("text", "title"):
                frame[prefix + "_" + source] = frame[source].apply(counter)
        # Characters per word; the +1 keeps the division defined for empty strings.
        for source in ("text", "title"):
            frame["word_density_" + source] = frame["char_count_" + source] / (frame["word_count_" + source] + 1)
        # Number of whitespace-separated tokens written entirely in upper case.
        for source in ("text", "title"):
            frame["upper_case_word_count_" + source] = frame[source].apply(
                lambda s: len([wrd for wrd in s.split() if wrd.isupper()])
            )
        # NOTE(review): lemmatization only happens inside the engineered-features
        # branch, so `lemmatize = True` has no effect while
        # `engineered_features` is False - confirm whether that is intended.
        if lemmatize:
            for source in ("title", "text"):
                frame[source] = frame[source].apply(stem_lem)

# Column that will be vectorised: the combined title+text or the text alone.
text_var = "text_and_title" if use_title else "text"
if use_title:
    data_training = use_title_and_text(data_training)
    data_testing = use_title_and_text(data_testing)

# d - Define Test and Training Data

In [28]:
# Hold out 20% of the labelled rows for evaluation and renumber both parts
# from zero so they can later be joined positionally with their feature matrices.
train, test = (
    part.reset_index(drop=True)
    for part in train_test(data_training, 0.2)
)

# e - Execution BoW

In [29]:
# Evaluation pair: fit the vectorizer on the training split only, then reuse it
# to encode the held-out split.
features_train, c_vec_train = bagofwords(
    df=train, variable=text_var, step="training",
    technique=techniques_to_use, verbose=True,
)
features_test = bagofwords(
    df=test, variable=text_var, step="testing", vectorizer=c_vec_train,
)

# Final pair: fit a second vectorizer on every labelled row, then encode the
# unlabelled data set with it.
final_features_train, c_vec_train_final = bagofwords(
    df=data_training, variable=text_var, step="training",
    technique=techniques_to_use,
)
final_features_test = bagofwords(
    df=data_testing, variable=text_var, step="testing",
    vectorizer=c_vec_train_final,
)

--text_and_title--
(3172, 56747)
--Sample Features--
['tarot', 'requests', 'darn', 'leone', 'title_u', 'censored', 'shakier', 'consequent', 'title_deaths', 'portly', 'shooter', 'walkouts', 'rarely', 'title_pot', 'title_repeatedly', 'title_wundergroundmusic', 'selling', 'blacks', 'scraper', 'qtaiba', 'hatchback', 'typing', 'mace', 'rye', 'malignant', 'forehead', 'divulge', 'adivasis', 'breed', 'elicits', 'title_affiliate', 'title_suitcase', 'wolverine', 'title_krishnan', 'innovations', 'barr', 'session', 'stretches', 'congresses', 'posters', 'theblogmire', 'title_end', 'dwy2rdwyho', 'decellularization', 'painfully', 'johnetta', 'temperatures', 'dubin', 'toughen', 'existing']


# f - Execution Code Test and Training

In [30]:
def _feature_frame(frame, matrix, drop_cols):
    """
    Attach the bag-of-words columns to a data set and drop the raw columns.

    :param frame: source DataFrame, one row per document
    :param matrix: sparse document-term matrix with the same row order as frame
    :param drop_cols: columns of frame to remove from the result
    :return: DataFrame with the remaining columns of frame followed by one
             integer-named column per vocabulary entry
    """
    # Rows of the matrix are positional (0..n-1). The source frame's index is
    # reset before the join: data_training was filtered on its label and would
    # otherwise join on stale index labels, silently losing rows and pairing the
    # rest with the wrong features. (The old code reset the freshly built
    # feature frame instead, which is a no-op.)
    # NOTE: toarray() materialises the full dense matrix, which is memory hungry.
    dense = pd.DataFrame(matrix.toarray())
    return frame.reset_index(drop=True).join(dense, how='inner').drop(drop_cols, axis=1)


# Raw columns that must not reach the models; "text_and_title" only exists when
# the title was merged into the text.
_source_cols = ["ID", "title", "text"] + (["text_and_title"] if use_title else [])

# Evaluation frames: features only (the labels are read from train / test).
df_features_train = _feature_frame(train, features_train, _source_cols + ["X1", "X2", "label"])
df_features_test = _feature_frame(test, features_test, _source_cols + ["label", "X1", "X2"])
# Final training frame keeps "label" because the final-model cell reads it from here.
df_final_features_train = _feature_frame(data_training, final_features_train, _source_cols + ["X1", "X2"])
# Only the raw source columns are dropped from the unlabelled data set.
df_final_features_test = _feature_frame(data_testing, final_features_test, _source_cols)

# g - Labeling

In [36]:
# True: reuse the models stored in models_trained.pkl; False: fit them again
# and overwrite that file.
trained = True
if not trained:
    print("--Training--")
    # Naive Bayes and SGD are fitted on row-normalised features; the tree
    # ensembles receive the features unchanged.
    train_matrix_normalized = preprocessing.normalize(df_features_train.values)
    clf = MultinomialNB().fit(train_matrix_normalized, train["label"])
    # Fixed seeds make the stochastic learners reproducible between runs.
    sgd = SGDClassifier(random_state=42).fit(train_matrix_normalized, train["label"])
    rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(df_features_train, train["label"])
    ada = AdaBoostClassifier(n_estimators=200, random_state=42).fit(df_features_train, train["label"])
    # The context manager closes the file even if pickling fails.
    with open("models_trained.pkl", 'wb') as model_file:
        pickle.dump([clf, sgd, rf, ada], model_file)
else:
    # SECURITY: unpickling can execute arbitrary code - only load a file that
    # was produced by this notebook.
    with open("models_trained.pkl", 'rb') as model_file:
        models = pickle.load(model_file)
    # Stored order: Naive Bayes, SGD, random forest, AdaBoost.
    clf, sgd, rf, ada = models
    for model in models:
        print(model)

# h - Print Prediction Accuracy

In [37]:
# Naive Bayes and SGD were fitted on row-normalised features, so they must be
# scored on the same representation; the tree ensembles were fitted on the
# unnormalised frame.
test_matrix_normalized = preprocessing.normalize(df_features_test.values)
predicted_clf = clf.predict(test_matrix_normalized)
predicted_sgd = sgd.predict(test_matrix_normalized)
predicted_rf = rf.predict(df_features_test)
predicted_ada = ada.predict(df_features_test)

# (accuracy caption, confusion-matrix caption, predictions) per model.
evaluations = [
    ("Naive Classifier Accuracy", "--Confusion Matrix Naive--", predicted_clf),
    ("SVM Accuracy", "--Confusion Matrix SVM--", predicted_sgd),
    ("Random Forest Accuracy", "--Confusion Matrix RF--", predicted_rf),
    ("AdaBoost Accuracy", "--Confusion Matrix ADA--", predicted_ada),
]

for accuracy_caption, matrix_caption, predicted in evaluations:
    print("{0}: {1}".format(accuracy_caption, metrics.accuracy_score(test["label"], predicted)))

for accuracy_caption, matrix_caption, predicted in evaluations:
    print(matrix_caption)
    # scikit-learn expects (y_true, y_pred): rows are the true labels, columns
    # the predictions. The arguments used to be swapped, which transposed the matrix.
    print(metrics.confusion_matrix(test["label"], predicted))

NameError: name 'clf' is not defined

# 4 - FAKE VS REAL

In [12]:
class_labels = sgd.classes_
# `sgd` was fitted on vectors produced by c_vec_train (training split), so the
# names must come from that vectorizer; c_vec_train_final has its own vocabulary
# whose positions do not match these coefficients.
# get_feature_names() was removed from recent scikit-learn releases, hence the fallback.
names_getter = getattr(c_vec_train, "get_feature_names_out", None) or c_vec_train.get_feature_names
feature_names = [str(name) for name in names_getter()]
# The bag-of-words columns are the trailing columns of the feature frame, so the
# trailing coefficients are the word weights even when extra engineered columns
# precede them.
word_weights = sgd.coef_[0][-len(feature_names):].tolist()
feat_with_weights = sorted(zip(word_weights, feature_names))
# For a two-class linear model a negative weight pushes towards classes_[0] and
# a positive weight towards classes_[1].
print("---" + class_labels[0] + "---")
print(feat_with_weights[:10])
print("---" + class_labels[1] + "---")
print(feat_with_weights[-10:])

NameError: name 'sgd' is not defined

# 5 - Final Model

In [None]:
# True: reuse the model stored in final_model.pkl; False: fit it again on every
# labelled row and overwrite that file.
final_trained = True
if not final_trained:
    final_labels = df_final_features_train["label"]
    final_matrix = preprocessing.normalize(df_final_features_train.drop("label", axis=1).values)
    # Fixed seed so that refitting gives the same model.
    model = SGDClassifier(random_state=42).fit(final_matrix, final_labels)
    # The context manager closes the file even if pickling fails.
    with open("final_model.pkl", 'wb') as model_file:
        pickle.dump(model, model_file)
else:
    # SECURITY: unpickling can execute arbitrary code - only load a file that
    # was produced by this notebook.
    with open("final_model.pkl", 'rb') as model_file:
        model = pickle.load(model_file)
    print(model)
# Score the unlabelled data set with the same row normalisation used for fitting.
predicted_sgd = model.predict(preprocessing.normalize(df_final_features_test.values))

# 6 - Output

In [None]:
# One output row per scored article, in the row order of the feature matrix.
# NOTE(review): "IDs" is a 0-based running number, not the "ID" column of the
# test CSV - confirm which identifier the consumer of predictions.csv expects.
pred_data = dict(IDs=range(len(predicted_sgd)), Predictions=predicted_sgd)
df_predictions = pd.DataFrame(pred_data)
df_predictions.to_csv("predictions.csv", index=False)