In [1]:
import csv   # csv reader
import re    # regular expressions
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
from random import shuffle
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# import nltk
# nltk.download('stopwords')
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
# Vader consists of list of words which are annotated as positive,neagative,neutraland compound
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [2]:
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            """parse data line function returns labels and all the meta features"""
            label, text,subject,speaker,speaker_job_title,state_info,party_affiliation,total_barely_true_counts, total_false_counts,total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context,negative,neutral,positive,compound = parse_data_line(line)
            raw_data.append(((text,subject,speaker,speaker_job_title,state_info,party_affiliation,total_barely_true_counts, total_false_counts,total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context,negative,neutral,positive,compound),label))

def split_and_preprocess_data(percentage):
    """
    splits raw data into training and testing based on the percentage parameter. 
    Train data consists of 40% data from the first half of raw data and 40% data from the second half of the raw data
    test data consists of 10% data from the first half and 10% data from the second half of the raw data.
    For suppose let us consider the total samples are 100 and the percentage parameter is 0.8. then half of the data is 50,
     samples are 0.8*100/2 which is 40. Using these variables, we divide the raw data into  
    train data=raw data [0:40] +raw data [50:90].  
    Test data=raw data [ 40:50]+raw data [90:100] 
    """
    num_samples = len(raw_data)
    half_samples=int(len(raw_data)/2)
    training_samples=int((percentage*num_samples)/2) 
    train_data.extend(raw_data[:training_samples] + raw_data[half_samples:half_samples+training_samples])
    test_data.extend(raw_data[training_samples:half_samples] + raw_data[half_samples+training_samples:])

In [3]:
features = ['Id', 'label', 'statement', 'subject', 'speaker', 'speaker_job_title', 'state_info', 'party_affiliation', 'total_barely_true_counts', 'total_false_counts total_half_true_counts', 'total_mostly_true_counts', 'total_pants_on_fire_counts', 'context']

In [4]:
from pandas._config.config import ContextDecorator
def convert_label(label):
      """
      Converts the multiple classes into two,
      making it a binary distinction between fake news and real.
      """
      #return label
      # Converting the multiclass labels to binary label
      labels_map = {
          'true': 'REAL',
          'mostly-true': 'REAL',
          'half-true': 'REAL',
          'false': 'FAKE',
          'barely-true': 'FAKE',
          'pants-fire': 'FAKE'
      }
      return labels_map[label]

def parse_data_line(data_line):
    """
    From the tab separated dataline extracting all the additional features from the fake news data
    It uses "convert_labels" function to convert labels into fake or real
    sid assigns a polarity of scores for each statement score includes positive,negative,neutral,compound
    returns labels,statement,subject,speaker,speaker_job and other features along with the polarities 
    in the form of string for our convenience
    """
    label=convert_label(data_line[1])
    statement=data_line[2]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_job_title = data_line[5]
    state_info = data_line[6]
    party_affiliation = data_line[7]
    context = data_line[13]
    total_barely_true_counts = data_line[8]
    total_false_counts = data_line[9]
    total_half_true_counts = data_line[10]
    total_mostly_true_counts = data_line[11]
    total_pants_on_fire_counts = data_line[12]
    ss = sid.polarity_scores(statement)
    return label.strip().lower(),statement,subject,speaker,speaker_job_title,state_info,party_affiliation,total_barely_true_counts, total_false_counts,total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts,context,str(ss['neg']),str(ss['neu']),str(ss['pos']),str(ss['compound'])
            

In [5]:
import nltk
nltk.download('wordnet')  

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
from nltk.stem import WordNetLemmatizer     # lemmatization
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
snowball_stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
ps = PorterStemmer()
# input: a string of one review
def pre_process(text):
    # word tokenisation, including punctuation removal
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    """
    preprocess the additional features called from the "pre process_all_features" function .
    word tokenizer returns  
    lowercasing and returns list of tokens and lemmatized words 
    """
    tokens = word_tokenize(text)
    # # lowercasing
    words = [token.lower() for token in tokens]
    words1=[]
    for word in words:
        words1.append(re.sub('[^a-zA-z0-9]','',word))
    token_words = [snowball_stemmer.stem(word) for word in words1]
    lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words1]
    return list(set(token_words + lemmatized_words))

print(pre_process("hello ! how can I help you 1 ?"))

['hello', 'you', 'can', 'how', 'help', '1', 'i']


In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
# Vader consists of list of words which are annotated as positive,neagative,neutraland compound
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
def preprocess_all_features(x):
  """
  preprocessing some of the additional features using preprocess function
  converting all the numerical values to string for consistency
  token_output consists of string of polarity scores of text and list of
  numerical features with extracted tokens from the preprocessed data
  """
  text,subject,speaker,speaker_job_title,state_info,party_affiliation,total_barely_true_counts, total_false_counts,total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context,negative,neutral,positive,compound = x
  text_tokens  = pre_process(text)
  speaker_tokens  = pre_process(speaker)
  job_title_tokens  = pre_process(speaker_job_title)
  salary_tokens  = pre_process(state_info)
  party_tokens  = pre_process(party_affiliation)
  context_tokens = pre_process(context)
  tokens_output = [str(float(negative)+float(neutral)+float(positive)+float(compound))]+ [total_barely_true_counts + total_false_counts+total_half_true_counts+total_mostly_true_counts + total_pants_on_fire_counts]+text_tokens + speaker_tokens +job_title_tokens+salary_tokens+ party_tokens +context_tokens                                                                                                               
  return tokens_output

In [9]:
def splitFeaturesAndLabels(data_set):
   #extracting features and labels from the data 
    features, labels = zip(*data_set) 
    return features, labels

In [10]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def train_classifier(trainData):
    features, labels = splitFeaturesAndLabels(trainData)
    """ 
    Pipeline does feature extraction and uses machine learning model for classifying labels
    Tfidf-tokens are returned form the "preprocess_all_features" function where stemming,lemmatization, lowercasing and other pre processing 
    converts the tokens into unigram, bigrams and tri grams and does for assigning weights for the feature i'm opting tfidf method
    opting smooth_idf method for smoothing unknown tokens which appear only in test data
    """
    pipeline = Pipeline([('tfidf', TfidfVectorizer(tokenizer = preprocess_all_features, ngram_range = (1, 3), lowercase=False, smooth_idf=True, use_idf=True)), 
                         ('svc', LinearSVC(class_weight = 'balanced'))])
    pipeline.fit(features, labels)
    return pipeline


In [11]:
def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.predict(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(sample)))

In [12]:
from random import shuffle

def cross_validate(dataset, folds):
    shuffle(dataset)
    cv_scores = []
    fold_size = int(len(dataset)/folds) + 1

    for i in range(0,len(dataset),int(fold_size)):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("Fold start on items %d - %d" % (i, i+fold_size))

        data_train = dataset[ i + fold_size:] +  dataset[0: i] 
        data_val = dataset[i:i+fold_size]      
        #splitting data into features and labels
        val_feat, val_labels = splitFeaturesAndLabels(data_val)
  
        classifier = train_classifier(data_train)
        val_label = [x[1] for x in data_val]
        val_data1 = [x[0] for x in data_val]
        # print(val_data[0:2])

        val_pred = predict_labels(val_data1, classifier)
    
        #precision, recall, fscore, _ = metrics.precision_recall_fscore_support(validation_labels, predicted_labels, average='weighted')
        cv_scores.append(precision_recall_fscore_support(val_label, val_pred, average='weighted'))
        accuracy = metrics.accuracy_score(val_label, val_pred)
    avgResults = [np.mean([x[0] for x in cv_scores]),
                   np.mean([x[1] for x in cv_scores]),
                   np.mean([x[2] for x in cv_scores]),
                ]           
    """
    STEPS:
        1.for the cross validation, data set is divided into 10 folds, for each fold a training and validation data is created. 
        2. Here i is increment by fold size in each iteration untill length of the dataset. 
           suppose i =1 and fold size is 100 validation data is collected from data having index [1 + (1+foldsize):] and training data is collected 
            from index [0:1]+[1+foldsize:]. 
        3.Linear SVM classifier trains on the training data.
        4.labels and data is collected from the validation data for further prediction.
        5.precision, recall, fscore and support is calculated using validation labels and labels predicted from the classifier.
        6.accuracy is found using the metrics. 
        7. for each iteration from the step 2 all the steps are repeated.
        8. average results are calculated for each metrics and returned in the form of lists 
    """
        
    return avgResults

In [13]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), sep='\n')


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 10241 rawData, 8192 trainData, 2049 testData
Training Samples: 
8192


In [14]:
cv_results = cross_validate(train_data, 10)
print('Precision_score: {0}\n Recall_score: {1}\n FScore: {2}\n'.format(cv_results[0], cv_results[1], cv_results[2]))

Fold start on items 0 - 820
Fold start on items 820 - 1640
Fold start on items 1640 - 2460
Fold start on items 2460 - 3280
Fold start on items 3280 - 4100
Fold start on items 4100 - 4920
Fold start on items 4920 - 5740
Fold start on items 5740 - 6560
Fold start on items 6560 - 7380
Fold start on items 7380 - 8200
Precision_score: 0.6966156664194634
 Recall_score: 0.6972545956986662
 FScore: 0.6952406722914374



In [15]:
# Finally, check the accuracy of your classifier by training on all the traning data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

(('I fought to bring about the largest private-sector infrastructure project in North American history.', 'energy', 'sarah-palin', '', 'Alaska', 'republican', '9', '19', '9', '6', '6', 'a radio address', '0.161', '0.839', '0.0', '-0.3182'), 'real')
Done training!
Precision: 0.693671
Recall: 0.694973
F Score:0.691586
