In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing, metrics, linear_model, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# The Following function cleans stop words and/or punctuation

In [2]:
def remove_puncts_and_stop_words(text):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in text if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    #return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    #nostopwords = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    #nostopwords = ' '.join(nostopwords)
    #print(nostopwords)
    return nostopwords

In [3]:
def lemmatize_text(text):
    wnl = WordNetLemmatizer()
    #txt = """Resumption of the session I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period ."""
    lemmatized_text = ([wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i) for i,j in pos_tag(word_tokenize(text))])
    lemmatized_text = ' '.join(lemmatized_text)
    return lemmatized_text

In [4]:
def do_pos_tagging(text):
    #print(text)
    tokens = nltk.word_tokenize(text)
    #print("********************")
    #print(tokens)
    #print("*********************")
    tokens = nltk.pos_tag(tokens)
    #print(tokens)
    tokens = [t[0]+'_'+ t[1] for t in tokens]
    pos_tagged_text = ' '.join(tokens)
    return pos_tagged_text

In [5]:
train_df = pd.read_csv("fake_or_real_news_training.csv")
test_df = pd.read_csv("fake_or_real_news_test.csv")

In [6]:
def clean_text(df):
    df['cleaned_text'] = df['text'].apply(remove_puncts_and_stop_words)
    df.drop('text', axis=1, inplace=True)
    df.rename(columns={'cleaned_text': 'text'}, inplace=True)

In [7]:
#train_df['text'][0]

In [8]:
#lemmatize_text(train_df['text'][0])

In [9]:
#tagged = do_pos_tagging(train_df['text'][0])

In [10]:
#print(tagged)

In [11]:
#train_df['text'] = train_df["text"].apply(lemmatize_text)
#test_df['text'] = test_df["text"].apply(lemmatize_text)
train_df['text'] = train_df["text"].apply(do_pos_tagging)
test_df['text'] = test_df["text"].apply(do_pos_tagging)

In [12]:
train_df.head(5)

Unnamed: 0,ID,title,text,label,X1,X2
0,8476,You Can Smell Hillary’s Fear,"Daniel_NNP Greenfield_NNP ,_, a_DT Shillman_NN...",FAKE,,
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google_NNP Pinterest_NNP Digg_NNP Linkedin_NNP...,FAKE,,
2,3608,Kerry to go to Paris in gesture of sympathy,U.S._NNP Secretary_NNP of_IN State_NNP John_NN...,REAL,,
3,10142,Bernie supporters on Twitter erupt in anger ag...,—_JJ Kaydee_NNP King_NNP (_( @_NNP KaydeeKing_...,FAKE,,
4,875,The Battle of New York: Why This Primary Matters,It_PRP 's_VBZ primary_JJ day_NN in_IN New_NNP ...,REAL,,


In [13]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_df['text'], train_df['label'])

# label encode the target variable 
#encoder = preprocessing.LabelEncoder()
#train_y = encoder.fit_transform(train_y)
#valid_y = encoder.fit_transform(valid_y)

In [14]:
#valid_x, valid_y

In [15]:
def predict_labels(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return predictions

In [16]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    #predictions = predict_labels(classifier, feature_vector_valid, is_neural_net)
    
    return metrics.accuracy_score(predictions, valid_y)

In [17]:
def write_predictions_to_csv(df_test, df_predictions, filename):
    # Output the predictions into a csv file
    columns = ['ID','label']
    df_submission = pd.DataFrame(columns=columns)
    #to_predict_features=pd.read_csv('TestSetValues.csv',parse_dates=True)
    df_test = df_test.reset_index(drop=True)
    df_predictions = df_predictions.reset_index(drop=True)
    df_submission = df_submission.reset_index(drop=True)
    df_submission['ID'] = df_test['ID']
    df_submission['label'] = df_predictions[0]
    df_submission.to_csv(filename, sep=",", index = False)

# Naive Bayes

In [18]:
def predict_using_CountVectorizer(to_predict_df):
    # create a count vectorizer object 
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')
    #count_vect = CountVectorizer(analyzer='text_process')
    #count_vect.fit(train_df['text'])
    
    # transform the training and validation data using count vectorizer object
    xtrain_count =  count_vect.fit_transform(train_x)
    xvalid_count =  count_vect.transform(valid_x)
    
    # Naive Bayes on Count Vectors
    accuracy = train_model(MultinomialNB(), xtrain_count, train_y, xvalid_count)
    print ("NB, Count Vectors: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_count = count_vect.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_count, train_y, xtest_count)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CountVectorizer.prediction.csv")
    return df_predictions

def predict_using_WordLevelTfidfVectorizer(to_predict_df):
    # create a count vectorizer object 
    
    # transform the training and validation data using count vectorizer object
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    #tfidf_vect.fit(train_df['text'])
    
    xtrain_tfidf =  tfidf_vect.fit_transform(train_x)
    xvalid_tfidf =  tfidf_vect.transform(valid_x)

    # Naive Bayes on Word Level TF IDF Vectors
    accuracy = train_model(MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
    print ("NB, WordLevel TF-IDF: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_tfidf = tfidf_vect.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "WordLevelTfidfVectorizer.prediction.csv")
    return df_predictions

def predict_using_NGramLevelTfidfVectorizer(to_predict_df):
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    #tfidf_vect_ngram.fit(train_df['text'])
    
    xtrain_tfidf_ngram =  tfidf_vect_ngram.fit_transform(train_x)
    xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
    
    # Naive Bayes on Ngram Level TF IDF Vectors
    accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    print ("NB, N-Gram Vectors: ", accuracy)  
    
    test_x = to_predict_df['text']
    xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "NGramLevelTfidfVectorizer.prediction.csv")
    return df_predictions

def predict_using_CharLevelTfidfVectorizer(to_predict_df):
    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    #tfidf_vect_ngram_chars.fit(train_df['text'])
    
    xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.fit_transform(train_x) 
    xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

    # Naive Bayes on Character Level TF IDF Vectors
    accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
    print ("NB, CharLevel Vectors: ", accuracy)   

    test_x = to_predict_df['text']
    xtest_tfidf_vect_ngram_chars = tfidf_vect_ngram_chars.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_vect_ngram_chars)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CharLevelTfidfVectorizer.prediction.csv")
    return df_predictions


def predict_using_naive_bayes():
    predictions = predict_using_CountVectorizer(test_df)
    
    predictions = predict_using_WordLevelTfidfVectorizer(test_df)
    
    predictions = predict_using_NGramLevelTfidfVectorizer(test_df)
    
    predictions = predict_using_CharLevelTfidfVectorizer(test_df)
    
predict_using_naive_bayes()

NB, Count Vectors:  0.89
NB, WordLevel TF-IDF:  0.891
NB, N-Gram Vectors:  0.867
NB, CharLevel Vectors:  0.842


# Predict using Logistic Classifiers

In [19]:
def predict_using_CountVectorizer_logit(to_predict_df):
    # create a count vectorizer object 
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')
    #count_vect.fit(train_df['text'])
    
    # transform the training and validation data using count vectorizer object
    xtrain_count =  count_vect.fit_transform(train_x)
    xvalid_count =  count_vect.transform(valid_x)
    
    # Naive Bayes on Count Vectors
    accuracy = train_model(LogisticRegression(), xtrain_count, train_y, xvalid_count)
    print ("LR, Count Vectors: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_count = count_vect.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_count, train_y, xtest_count)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CountVectorizer.prediction.logit.csv")
    return df_predictions

def predict_using_WordLevelTfidfVectorizer_logit(to_predict_df):
    # create a count vectorizer object 
    
    # transform the training and validation data using count vectorizer object
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    #tfidf_vect.fit(train_df['text'])
    
    xtrain_tfidf =  tfidf_vect.fit_transform(train_x)
    xvalid_tfidf =  tfidf_vect.transform(valid_x)

    # Naive Bayes on Word Level TF IDF Vectors
    accuracy = train_model(LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
    print ("LR, WordLevel TF-IDF: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_tfidf = tfidf_vect.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "WordLevelTfidfVectorizer.prediction.logit.csv")
    return df_predictions

def predict_using_NGramLevelTfidfVectorizer_logit(to_predict_df):
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    #tfidf_vect_ngram.fit(train_df['text'])
    
    xtrain_tfidf_ngram =  tfidf_vect_ngram.fit_transform(train_x)
    xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
    
    # Naive Bayes on Ngram Level TF IDF Vectors
    accuracy = train_model(LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    print ("LR, N-Gram Vectors: ", accuracy)  
    
    test_x = to_predict_df['text']
    xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "NGramLevelTfidfVectorizer.prediction.logit.csv")
    return df_predictions

def predict_using_CharLevelTfidfVectorizer_logit(to_predict_df):
    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    #tfidf_vect_ngram_chars.fit(train_df['text'])
    
    xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.fit_transform(train_x) 
    xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

    # Naive Bayes on Character Level TF IDF Vectors
    accuracy = train_model(LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
    print ("LR, CharLevel Vectors: ", accuracy)   

    test_x = to_predict_df['text']
    xtest_tfidf_vect_ngram_chars = tfidf_vect_ngram_chars.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_vect_ngram_chars)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CharLevelTfidfVectorizer.prediction.logit.csv")
    return df_predictions

def predict_using_logistic():
    predictions = predict_using_CountVectorizer_logit(test_df)
    
    predictions = predict_using_WordLevelTfidfVectorizer_logit(test_df)
    
    predictions = predict_using_NGramLevelTfidfVectorizer_logit(test_df)
    
    predictions = predict_using_CharLevelTfidfVectorizer_logit(test_df)
    
predict_using_logistic()

LR, Count Vectors:  0.915
LR, WordLevel TF-IDF:  0.889
LR, N-Gram Vectors:  0.892
LR, CharLevel Vectors:  0.851


# SVM

In [20]:
def predict_using_NGramLevelTfidfVectorizer_svm(to_predict_df):
    #tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    stop_words = stopwords.words("english")
    tfidf_vect_ngram = TfidfVectorizer(stop_words = stop_words)
    #tfidf_vect_ngram.fit(train_df['text'])
    
    xtrain_tfidf_ngram =  tfidf_vect_ngram.fit_transform(train_x)
    xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
    
    # Naive Bayes on Ngram Level TF IDF Vectors
    accuracy = train_model(svm.LinearSVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    print ("SVM, N-Gram Vectors: ", accuracy)  
    
    test_x = to_predict_df['text']
    xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)
    predictions = predict_labels(svm.LinearSVC(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "NGramLevelTfidfVectorizer.prediction.svm.csv")
    return df_predictions

def predict_using_svm():
    predictions = predict_using_NGramLevelTfidfVectorizer_svm(test_df)
    
    #predictions = predict_using_WordLevelTfidfVectorizer_logit(test_df)
    
    #predictions = predict_using_NGramLevelTfidfVectorizer_logit(test_df)
    
    #predictions = predict_using_CharLevelTfidfVectorizer_logit(test_df)
    
predict_using_svm()

SVM, N-Gram Vectors:  0.927


In [21]:
## word level tf-idf
#tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
#tfidf_vect.fit(train_df['text'])
#xtrain_tfidf =  tfidf_vect.transform(train_x)
#xvalid_tfidf =  tfidf_vect.transform(valid_x)
#
## Naive Bayes on Word Level TF IDF Vectors
#accuracy = train_model(MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
#print ("NB, WordLevel TF-IDF: ", accuracy)

In [22]:
## ngram level tf-idf 
#tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
#tfidf_vect_ngram.fit(train_df['text'])
#xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
#xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
#
## Naive Bayes on Ngram Level TF IDF Vectors
#accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
#print ("NB, N-Gram Vectors: ", accuracy)

In [23]:
## characters level tf-idf
#tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
#tfidf_vect_ngram_chars.fit(train_df['text'])
#xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
#xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 
#
## Naive Bayes on Character Level TF IDF Vectors
#accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
#print ("NB, CharLevel Vectors: ", accuracy)