In [1]:
import pandas as pd
import matplotlib as plt

from sklearn_pandas import DataFrameMapper

# Packages for NLP
import nltk
from nltk.corpus import stopwords
import regex as re

# Packages for sentiment analysis
from textblob import TextBlob

# Packages for visualisation 
import matplotlib.pyplot as plt

# Reading data

In [2]:
train_data = pd.read_csv("../Data/train_data.csv", index_col=1)
val_data = pd.read_csv("../Data/validation_data.csv", index_col=1)
test_data = pd.read_csv("../Data/test_data.csv", index_col=1)

# Feature creation

In [3]:
discourse_keywords = 'although, as though, but, by comparison, even if, even though, however, nevertheless, on the other hand, still, then, though, while, yet, and, meanwhile, in turn, next, ultimately, meantime, also, as if, even as, even still, even then, regardless, when, by contrast, conversely, if, in contrast, instead, nor, or, rather, whereas, while, yet, even after, by contrast, nevertheless, besides, much as, as much as, whereas, neither, nonetheless, even when, on the one hand indeed, finally, in fact, separately, in the end, on the contrary, while'
discourse_keywords = discourse_keywords.split(', ')
discourse_keywords = list(set(discourse_keywords))

In [4]:
# POS tags reference: https://www.learntek.org/blog/categorizing-pos-tagging-nltk-python/ 
nouns_list = ['NN', 'NNS', 'NNP', 'NNPS']
verbs_list = ['VB', 'VBD', 'VBG', 'VBN', 'VDP', 'VBZ']
adj_list = ['JJ', 'JJR', 'JJS']

def get_pos_tags(text):
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    return pos_tags

def get_num_nouns(text):
    pos_tags = get_pos_tags(text)
    nouns_count = len([word for (word, pos) in pos_tags if pos in nouns_list])
    return nouns_count

def get_num_verbs(text):
    pos_tags = get_pos_tags(text)
    verbs_count = len([word for (word, pos) in pos_tags if pos in verbs_list])
    return verbs_count

def get_num_adj(text):
    pos_tags = get_pos_tags(text)
    adj_count = len([word for (word, pos) in pos_tags if pos in adj_list])
    return adj_count

def get_num_discourse(text):
    tokens = nltk.word_tokenize(text)
    discourse_count = len([word for word in tokens if word in discourse_keywords])
    return discourse_count

def get_num_stopwords(text):
    tokens = nltk.word_tokenize(text)
    stopword_count = len([word for word in tokens if word in stopwords.words('english')])
    return stopword_count

def get_num_punctuations(text):
    punctuations = '!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~'
    punctuation_count = len([char for char in text if char in punctuations])
    return punctuation_count

def get_num_words_in_quotes(text):
    quotes = re.findall("'.'|\".\"", text)
    quote_count = 0
    if quotes is None:
        return 0
    else:
        for quote in quotes:
            words_in_quote = quote[1:-1]
            quote_count += len(words_in_quote.split())
        return quote_count

## Adding features to train data

In [None]:
train_data.reset_index(inplace=True)

In [45]:
train_num_nouns = []
train_num_verbs = []
train_num_adj = []
train_num_discourse = []
train_num_stopwords = []
train_num_punctuations = []
train_num_quote_words = []

for row in train_data['text']:
    nouns_count = get_num_nouns(row)
    train_num_nouns.append(nouns_count)

    verbs_count = get_num_verbs(row)
    train_num_verbs.append(verbs_count)
    
    adj_count = get_num_adj(row)
    train_num_adj.append(adj_count)
    
    discourse_count = get_num_discourse(row)
    train_num_discourse.append(discourse_count)

    stopword_count = get_num_stopwords(row)
    train_num_stopwords.append(stopword_count)

    punctuation_count = get_num_punctuations(row)
    train_num_punctuations.append(punctuation_count)

    quote_count = get_num_words_in_quotes(row)
    train_num_quote_words.append(quote_count)

In [21]:
train_data['char_count'] = train_data['text'].apply(lambda x: len(str(x)))
train_data['word_count'] = train_data['text'].apply(lambda x: len(str(x).split(" ")))
train_data['sentence_count'] = train_data['text'].apply(lambda x: len(str(x).split(".")))
train_data["num_unique_words"] = train_data['text'].apply(lambda x: len(set(str(x).split(" "))))
train_data["avg_sentence_length"] = train_data['word_count']/train_data['sentence_count']
train_data['num_punctuations'] = train_num_punctuations
train_data['num_stopwords'] = train_num_stopwords
train_data['num_words_in_quotes'] = train_num_quote_words

train_data['num_nouns'] = train_num_nouns
train_data['num_verbs'] = train_num_verbs
train_data['num_adjectives'] = train_num_adj
train_data['num_discourse_relations'] = train_num_discourse

train_data['textblob_sentiment'] = train_data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
train_data.to_csv("train_data_with_added_features.csv")

## Adding features to validation data

In [5]:
val_data.reset_index(inplace=True)
val_data.head(5)

Unnamed: 0.1,text,Unnamed: 0,class_label,text_preprocessed
0,Britain's May presses Northern Ireland leaders...,20227,0,britain may press northern ireland leader rest...
1,GOP EVIDENCE: Comey FBI Busted Giving Clinton ...,9356,1,gop evid comey fbi bust give clinton special s...
2,Want to See ‘Hamilton’ in a City Near You? Buy...,1437,0,want see hamilton citi near buy subscript wait...
3,Why 'Never Trumpers' must reconsiderWhile Demo...,8457,1,never trumper must reconsiderwhil democrat per...
4,Trump Says Health Law Replacement May Not Be R...,11482,0,trump say health law replac may readi next yea...


In [6]:
val_num_nouns = []
val_num_verbs = []
val_num_adj = []
val_num_discourse = []
val_num_stopwords = []
val_num_punctuations = []
val_num_quote_words = []

for row in val_data['text']:
    nouns_count = get_num_nouns(row)
    val_num_nouns.append(nouns_count)

    verbs_count = get_num_verbs(row)
    val_num_verbs.append(verbs_count)
    
    adj_count = get_num_adj(row)
    val_num_adj.append(adj_count)
    
    discourse_count = get_num_discourse(row)
    val_num_discourse.append(discourse_count)

    stopword_count = get_num_stopwords(row)
    val_num_stopwords.append(stopword_count)

    punctuation_count = get_num_punctuations(row)
    val_num_punctuations.append(punctuation_count)

    quote_count = get_num_words_in_quotes(row)
    val_num_quote_words.append(quote_count)

In [7]:
val_data['char_count'] = val_data['text'].apply(lambda x: len(str(x)))
val_data['word_count'] = val_data['text'].apply(lambda x: len(str(x).split(" ")))
val_data['sentence_count'] = val_data['text'].apply(lambda x: len(str(x).split(".")))
val_data["num_unique_words"] = val_data['text'].apply(lambda x: len(set(str(x).split(" "))))
val_data["avg_sentence_length"] = val_data['word_count']/val_data['sentence_count']
val_data['num_punctuations'] = val_num_punctuations
val_data['num_stopwords'] = val_num_stopwords
val_data['num_words_in_quotes'] = val_num_quote_words

val_data['num_nouns'] = val_num_nouns
val_data['num_verbs'] = val_num_verbs
val_data['num_adjectives'] = val_num_adj
val_data['num_discourse_relations'] = val_num_discourse

val_data['textblob_sentiment'] = val_data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [16]:
val_data.to_csv("val_data_with_added_features.csv")

In [18]:
val_data.head(5)

Unnamed: 0.1,text,Unnamed: 0,class_label,text_preprocessed,char_count,word_count,sentence_count,num_unique_words,avg_sentence_length,num_punctuations,num_stopwords,num_words_in_quotes,num_nouns,num_verbs,num_adjectives,num_discourse_relations,textblob_sentiment
0,Britain's May presses Northern Ireland leaders...,20227,0,britain may press northern ireland leader rest...,986,159,5,102,31.8,16,53,0,61,22,15,3,0.045833
1,GOP EVIDENCE: Comey FBI Busted Giving Clinton ...,9356,1,gop evid comey fbi bust give clinton special s...,3411,583,24,285,24.291667,60,224,0,223,88,40,12,0.127709
2,Want to See ‘Hamilton’ in a City Near You? Buy...,1437,0,want see hamilton citi near buy subscript wait...,5837,1019,49,499,20.795918,126,427,0,377,173,77,40,0.183048
3,Why 'Never Trumpers' must reconsiderWhile Demo...,8457,1,never trumper must reconsiderwhil democrat per...,5662,931,61,539,15.262295,135,386,0,319,121,85,54,0.007988
4,Trump Says Health Law Replacement May Not Be R...,11482,0,trump say health law replac may readi next yea...,2609,453,25,264,18.12,49,171,0,162,76,29,14,0.114201


## Adding features to test data

In [20]:
test_data.reset_index(inplace=True)
test_data.head(5)

Unnamed: 0.1,level_0,index,text,Unnamed: 0,class_label,text_preprocessed
0,0,0,The Billionaire Art Dealer Guy Wildenstein Is ...,12818,0,billionair art dealer guy wildenstein clear ta...
1,1,1,POLICE UNION Threatens 49er’s With BOYCOTT: TA...,13097,1,polic union threaten 49er boycott take action ...
2,2,2,U.S.G.A. Regrets ‘Distraction’ in Ruling Again...,1727,0,usga regret distract rule dustin johnson new y...
3,3,3,"FAKE NEWS WEEK: Truth, War Propaganda, CIA and...",22995,1,fake news week truth war propaganda cia media ...
4,4,4,WATCH: Reince Priebus Subtly But Hilariously ...,2404,1,watch reinc priebu subtli hilari embarrass ste...


In [None]:
test_num_nouns = []
test_num_verbs = []
test_num_adj = []
test_num_discourse = []
test_num_stopwords = []
test_num_punctuations = []
test_num_quote_words = []

for row in test_data['text']:
    nouns_count = get_num_nouns(row)
    test_num_nouns.append(nouns_count)

    verbs_count = get_num_verbs(row)
    test_num_verbs.append(verbs_count)
    
    adj_count = get_num_adj(row)
    test_num_adj.append(adj_count)
    
    discourse_count = get_num_discourse(row)
    test_num_discourse.append(discourse_count)

    stopword_count = get_num_stopwords(row)
    test_num_stopwords.append(stopword_count)

    punctuation_count = get_num_punctuations(row)
    test_num_punctuations.append(punctuation_count)

    quote_count = get_num_words_in_quotes(row)
    test_num_quote_words.append(quote_count)

In [None]:
test_data['char_count'] = test_data['text'].apply(lambda x: len(str(x)))
test_data['word_count'] = test_data['text'].apply(lambda x: len(str(x).split(" ")))
test_data['sentence_count'] = test_data['text'].apply(lambda x: len(str(x).split(".")))
test_data["num_unique_words"] = test_data['text'].apply(lambda x: len(set(str(x).split(" "))))
test_data["avg_sentence_length"] = test_data['word_count']/test_data['sentence_count']
test_data['num_punctuations'] = test_num_punctuations
test_data['num_stopwords'] = test_num_stopwords
test_data['num_words_in_quotes'] = test_num_quote_words

test_data['num_nouns'] = test_num_nouns
test_data['num_verbs'] = test_num_verbs
test_data['num_adjectives'] = test_num_adj
test_data['num_discourse_relations'] = test_num_discourse

test_data['textblob_sentiment'] = test_data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
test_data.to_csv("test_data_with_added_features.csv")

# Applying TFIDF 

In [None]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)

mapper = DataFrameMapper([
    (['char_count', 'word_count', 'sentence_count', 'num_unique_words', 'avg_sentence_length', 'num_punctuations', 'num_stopwords', 'num_words_in_quotes', 'num_nouns', 'num_verbs', 'num_adjectives', 'num_discourse_relations', 'textblob_sentiment'], None), 
    ('text_preprocessed',tfidf_vectorizer)
])

X_train = mapper.fit_transform(train_data)