In [29]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing, metrics, linear_model, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
import string

# #The Following function cleans stop words and/or punctuation

In [30]:
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
stopwords.words('english')[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [32]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [33]:
train_df = pd.read_csv("fake_or_real_news_training.csv")
test_df = pd.read_csv("fake_or_real_news_test.csv")

In [34]:
train_df[0:5]['text']

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [35]:
train_df['text'].head(5).apply(text_process)

0    [Daniel, Greenfield, Shillman, Journalism, Fel...
1    [Google, Pinterest, Digg, Linkedin, Reddit, St...
2    [US, Secretary, State, John, F, Kerry, said, M...
3    [—, Kaydee, King, KaydeeKing, November, 9, 201...
4    [primary, day, New, York, frontrunners, Hillar...
Name: text, dtype: object

In [36]:
#train_title_text = train_df['title'] + ". " + train_df['text']

In [37]:
#train_title_text

In [38]:
#x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=.33)
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_df['text'], train_df['label'])

# label encode the target variable 
#encoder = preprocessing.LabelEncoder()
#train_y = encoder.fit_transform(train_y)
#valid_y = encoder.fit_transform(valid_y)

In [39]:
#valid_x, valid_y

In [40]:
def predict_labels(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return predictions

In [41]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    #predictions = predict_labels(classifier, feature_vector_valid, is_neural_net)
    
    return metrics.accuracy_score(predictions, valid_y)

In [42]:
def write_predictions_to_csv(df_test, df_predictions, filename):
    # Output the predictions into a csv file
    columns = ['ID','label']
    df_submission = pd.DataFrame(columns=columns)
    #to_predict_features=pd.read_csv('TestSetValues.csv',parse_dates=True)
    df_test = df_test.reset_index(drop=True)
    df_predictions = df_predictions.reset_index(drop=True)
    df_submission = df_submission.reset_index(drop=True)
    df_submission['ID'] = df_test['ID']
    df_submission['label'] = df_predictions[0]
    df_submission.to_csv(filename, sep=",", index = False)

# Naive Bayes

In [43]:
def predict_using_CountVectorizer(to_predict_df):
    # create a count vectorizer object 
    #count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')
    count_vect = CountVectorizer(analyzer='text_process')
    count_vect.fit(train_df['text'])
    
    # transform the training and validation data using count vectorizer object
    xtrain_count =  count_vect.transform(train_x)
    xvalid_count =  count_vect.transform(valid_x)
    
    # Naive Bayes on Count Vectors
    accuracy = train_model(MultinomialNB(), xtrain_count, train_y, xvalid_count)
    print ("NB, Count Vectors: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_count = count_vect.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_count, train_y, xtest_count)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CountVectorizer.prediction.csv")
    return df_predictions

def predict_using_WordLevelTfidfVectorizer(to_predict_df):
    # create a count vectorizer object 
    
    # transform the training and validation data using count vectorizer object
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    tfidf_vect.fit(train_df['text'])
    
    xtrain_tfidf =  tfidf_vect.transform(train_x)
    xvalid_tfidf =  tfidf_vect.transform(valid_x)

    # Naive Bayes on Word Level TF IDF Vectors
    accuracy = train_model(MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
    print ("NB, WordLevel TF-IDF: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_tfidf = tfidf_vect.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "WordLevelTfidfVectorizer.prediction.csv")
    return df_predictions

def predict_using_NGramLevelTfidfVectorizer(to_predict_df):
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    tfidf_vect_ngram.fit(train_df['text'])
    xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
    xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
    
    # Naive Bayes on Ngram Level TF IDF Vectors
    accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    print ("NB, N-Gram Vectors: ", accuracy)  
    
    test_x = to_predict_df['text']
    xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "NGramLevelTfidfVectorizer.prediction.csv")
    return df_predictions

def predict_using_CharLevelTfidfVectorizer(to_predict_df):
    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    tfidf_vect_ngram_chars.fit(train_df['text'])
    xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
    xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

    # Naive Bayes on Character Level TF IDF Vectors
    accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
    print ("NB, CharLevel Vectors: ", accuracy)   

    test_x = to_predict_df['text']
    xtest_tfidf_vect_ngram_chars = tfidf_vect_ngram_chars.transform(test_x)
    predictions = predict_labels(MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_vect_ngram_chars)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CharLevelTfidfVectorizer.prediction.csv")
    return df_predictions

In [44]:
def predict_using_naive_bayes():
    predictions = predict_using_CountVectorizer(test_df)
    
    predictions = predict_using_WordLevelTfidfVectorizer(test_df)
    
    predictions = predict_using_NGramLevelTfidfVectorizer(test_df)
    
    predictions = predict_using_CharLevelTfidfVectorizer(test_df)
    
predict_using_naive_bayes()

ValueError: text_process is not a valid tokenization scheme/analyzer

# Predict using Logistic Classifiers

In [None]:
def predict_using_CountVectorizer_logit(to_predict_df):
    # create a count vectorizer object 
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')
    count_vect.fit(train_df['text'])
    
    # transform the training and validation data using count vectorizer object
    xtrain_count =  count_vect.transform(train_x)
    xvalid_count =  count_vect.transform(valid_x)
    
    # Naive Bayes on Count Vectors
    accuracy = train_model(LogisticRegression(), xtrain_count, train_y, xvalid_count)
    print ("LR, Count Vectors: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_count = count_vect.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_count, train_y, xtest_count)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CountVectorizer.prediction.logit.csv")
    return df_predictions

def predict_using_WordLevelTfidfVectorizer_logit(to_predict_df):
    # create a count vectorizer object 
    
    # transform the training and validation data using count vectorizer object
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    tfidf_vect.fit(train_df['text'])
    
    xtrain_tfidf =  tfidf_vect.transform(train_x)
    xvalid_tfidf =  tfidf_vect.transform(valid_x)

    # Naive Bayes on Word Level TF IDF Vectors
    accuracy = train_model(LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
    print ("LR, WordLevel TF-IDF: ", accuracy)
    
    test_x = to_predict_df['text']
    xtest_tfidf = tfidf_vect.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "WordLevelTfidfVectorizer.prediction.logit.csv")
    return df_predictions

def predict_using_NGramLevelTfidfVectorizer_logit(to_predict_df):
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    tfidf_vect_ngram.fit(train_df['text'])
    xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
    xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
    
    # Naive Bayes on Ngram Level TF IDF Vectors
    accuracy = train_model(LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    print ("LR, N-Gram Vectors: ", accuracy)  
    
    test_x = to_predict_df['text']
    xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "NGramLevelTfidfVectorizer.prediction.logit.csv")
    return df_predictions

def predict_using_CharLevelTfidfVectorizer_logit(to_predict_df):
    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    tfidf_vect_ngram_chars.fit(train_df['text'])
    xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
    xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

    # Naive Bayes on Character Level TF IDF Vectors
    accuracy = train_model(LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
    print ("LR, CharLevel Vectors: ", accuracy)   

    test_x = to_predict_df['text']
    xtest_tfidf_vect_ngram_chars = tfidf_vect_ngram_chars.transform(test_x)
    predictions = predict_labels(LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_vect_ngram_chars)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "CharLevelTfidfVectorizer.prediction.logit.csv")
    return df_predictions

In [None]:
def predict_using_logistic():
    predictions = predict_using_CountVectorizer_logit(test_df)
    
    predictions = predict_using_WordLevelTfidfVectorizer_logit(test_df)
    
    predictions = predict_using_NGramLevelTfidfVectorizer_logit(test_df)
    
    predictions = predict_using_CharLevelTfidfVectorizer_logit(test_df)
    
predict_using_logistic()

# SVM

In [None]:
def predict_using_NGramLevelTfidfVectorizer_svm(to_predict_df):
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    tfidf_vect_ngram.fit(train_df['text'])
    xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
    xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
    
    # Naive Bayes on Ngram Level TF IDF Vectors
    accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    print ("LR, N-Gram Vectors: ", accuracy)  
    
    test_x = to_predict_df['text']
    xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)
    predictions = predict_labels(svm.SVC(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
    #predictions.shape
    df_predictions = pd.DataFrame(predictions)
    write_predictions_to_csv(to_predict_df, df_predictions, "NGramLevelTfidfVectorizer.prediction.logit.csv")
    return df_predictions

In [None]:
def predict_using_svm():
    predictions = predict_using_NGramLevelTfidfVectorizer_svm(test_df)
    
    #predictions = predict_using_WordLevelTfidfVectorizer_logit(test_df)
    
    #predictions = predict_using_NGramLevelTfidfVectorizer_logit(test_df)
    
    #predictions = predict_using_CharLevelTfidfVectorizer_logit(test_df)
    
predict_using_svm()

In [None]:
## word level tf-idf
#tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
#tfidf_vect.fit(train_df['text'])
#xtrain_tfidf =  tfidf_vect.transform(train_x)
#xvalid_tfidf =  tfidf_vect.transform(valid_x)
#
## Naive Bayes on Word Level TF IDF Vectors
#accuracy = train_model(MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
#print ("NB, WordLevel TF-IDF: ", accuracy)

In [None]:
## ngram level tf-idf 
#tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
#tfidf_vect_ngram.fit(train_df['text'])
#xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
#xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
#
## Naive Bayes on Ngram Level TF IDF Vectors
#accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
#print ("NB, N-Gram Vectors: ", accuracy)

In [None]:
## characters level tf-idf
#tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
#tfidf_vect_ngram_chars.fit(train_df['text'])
#xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
#xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 
#
## Naive Bayes on Character Level TF IDF Vectors
#accuracy = train_model(MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
#print ("NB, CharLevel Vectors: ", accuracy)