In [2]:
import pandas as pd
import matplotlib as plt

from sklearn_pandas import DataFrameMapper

# Packages for NLP
import nltk
from nltk.corpus import stopwords
import regex as re

# Packages for sentiment analysis
from textblob import TextBlob

# Packages for visualisation 
import matplotlib.pyplot as plt

# Reading data

In [46]:
# Load the train / validation / test splits; the first CSV column is used as the index.
train_data, val_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../Data/Combined data/train_data.csv",
        "../Data/Combined data/val_data.csv",
        "../Data/Combined data/test_data.csv",
    )
)

# Feature creation

In [55]:
# Creating functions to get various features

def get_pos_tags(text): # POS tags reference: https://www.learntek.org/blog/categorizing-pos-tagging-nltk-python/ 
    """Tokenise ``text`` with ``nltk.word_tokenize`` and return the result of ``nltk.pos_tag`` on those tokens.

    Callers in this notebook unpack each returned item as a ``(word, pos)`` pair.
    """
    return nltk.pos_tag(nltk.word_tokenize(text))

def get_num_nouns(text):
    """Return how many tokens of ``text`` carry a noun tag (NN, NNS, NNP or NNPS)."""
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    total = 0
    for _token, tag in get_pos_tags(text):
        if tag in noun_tags:
            total += 1
    return total

def get_num_verbs(text):
    """Return how many tokens of ``text`` are tagged as verbs.

    Counts the verb tags VB, VBD, VBG, VBN, VBP and VBZ as produced by
    ``get_pos_tags``.

    Args:
        text: The string to tokenise and tag.

    Returns:
        int: Number of tokens whose POS tag is in the verb tag list.
    """
    # 'VBP' (non-3rd-person singular present) was previously misspelt as 'VDP',
    # a tag that does not exist, so those verbs were silently left out.
    verbs_list = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    pos_tags = get_pos_tags(text)
    return sum(1 for (_word, pos) in pos_tags if pos in verbs_list)

def get_num_adj(text):
    """Return how many tokens of ``text`` carry an adjective tag (JJ, JJR or JJS)."""
    adjective_tags = ['JJ', 'JJR', 'JJS']
    return sum(1 for (_token, tag) in get_pos_tags(text) if tag in adjective_tags)

def get_num_discourse(text):
    """Count the discourse markers that occur in ``text``.

    The text is tokenised with ``nltk.word_tokenize`` and lower-cased, then
    scanned left to right. At each position the longest marker that matches
    the upcoming tokens is counted once and skipped over, so "even though"
    counts as one marker rather than as "even though" plus "though".

    Args:
        text: The string to analyse.

    Returns:
        int: Number of discourse markers found (0 for text with no tokens).
    """
    # NOTE(review): 'on the one hand indeed' looks like two markers merged by a
    # missing comma ('on the one hand', 'indeed') -- confirm against the source list.
    discourse_keywords = ['even then', 'as though', 'still', 'whereas', 'on the other hand', 'but', 'while', 'ultimately', 'if', 'even when', 'instead', 'next', 'when', 'on the one hand indeed', 'even still', 'in the end', 'meanwhile', 'separately', 'or', 'nonetheless', 'neither', 'in contrast', 'nevertheless', 'although', 'then', 'in turn', 'regardless', 'as much as', 'rather', 'meantime', 'much as', 'yet', 'however', 'even as', 'conversely', 'even after', 'nor', 'finally', 'as if', 'in fact', 'also', 'even if', 'by comparison', 'and', 'besides', 'by contrast', 'on the contrary', 'even though', 'though']

    # Markers are stored as token tuples: comparing whole phrases against single
    # tokens (the previous behaviour) could never match a multi-word marker.
    phrases = {tuple(keyword.split()) for keyword in discourse_keywords}
    longest = max(len(phrase) for phrase in phrases)

    # Lower-case so sentence-initial markers ("But", "However") are counted too.
    tokens = [token.lower() for token in nltk.word_tokenize(text)]

    discourse_count = 0
    position = 0
    while position < len(tokens):
        # Try the longest candidate first so overlapping markers are not double counted.
        for size in range(min(longest, len(tokens) - position), 0, -1):
            if tuple(tokens[position:position + size]) in phrases:
                discourse_count += 1
                position += size
                break
        else:
            position += 1
    return discourse_count

def get_num_stopwords(text):
    """Count the tokens of ``text`` that are English stop words.

    Args:
        text: The string to tokenise with ``nltk.word_tokenize``.

    Returns:
        int: Number of tokens found in NLTK's English stop-word list,
        compared case-insensitively.
    """
    # Fetch the list once per call and keep it in a set; it used to be
    # re-fetched and scanned linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    # The stop-word entries are lower-case, so tokens are lower-cased before the
    # lookup; otherwise "I" or a sentence-initial "The" would never be counted.
    return sum(1 for word in tokens if word.lower() in english_stopwords)

def get_num_punctuations(text):
    """Count the ASCII punctuation characters in ``text``.

    Args:
        text: The string to scan character by character.

    Returns:
        int: Number of characters of ``text`` that are one of
        ``!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~`` (0 for an empty string).
    """
    # The backslash is written as an explicit '\\': the former '\]' was an invalid
    # escape sequence that only worked by accident and triggers a warning on
    # recent Python versions. The character set is the same as string.punctuation.
    punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    return sum(1 for char in text if char in punctuations)

def get_num_words_in_quotes(text):
    """Count the whitespace-separated words that appear inside quotation marks.

    Both "double-quoted" and 'single-quoted' spans on a single line are
    recognised. A single quote only opens or closes a span when it is not glued
    to a word character on its outer side, so apostrophes in contractions and
    possessives ("don't", "it's") are not mistaken for quotation marks.

    Args:
        text: The string to search.

    Returns:
        int: Total number of words across all quoted spans (0 if there are none).
    """
    # The previous pattern ('.' / ".") matched exactly one character between the
    # quotes, so real quotations were never found. '.+?' is lazy so that each
    # quotation is matched separately instead of spanning from the first quote
    # mark to the last.
    pattern = r"(?<!\w)'(.+?)'(?!\w)" + "|" + r'"(.+?)"'
    quote_count = 0
    for match in re.finditer(pattern, text):
        # Exactly one of the two groups is populated, depending on the quote style.
        words_in_quote = match.group(1) or match.group(2)
        quote_count += len(words_in_quote.split())
    return quote_count

In [5]:
def create_features(dataframe):
    """
        Adds 13 additional features to an input dataframe and returns the updated dataframe

        The features are all derived from the 'text' column. The columns are
        written onto ``dataframe`` itself (the input is modified in place) and
        that same object is returned.
    """
    texts = dataframe['text']

    def per_row(feature_fn):
        # Evaluate one helper on every entry of the text column, in row order.
        return [feature_fn(entry) for entry in texts]

    # Helper-based counts are computed before any column is written.
    nouns = per_row(get_num_nouns)
    verbs = per_row(get_num_verbs)
    adjectives = per_row(get_num_adj)
    discourse = per_row(get_num_discourse)
    stopword_totals = per_row(get_num_stopwords)
    punctuation_totals = per_row(get_num_punctuations)
    quoted_word_totals = per_row(get_num_words_in_quotes)

    # Surface statistics: words are split on single spaces, sentences on '.'.
    dataframe['char_count'] = texts.apply(lambda value: len(str(value)))
    dataframe['word_count'] = texts.apply(lambda value: len(str(value).split(" ")))
    dataframe['sentence_count'] = texts.apply(lambda value: len(str(value).split(".")))
    dataframe["num_unique_words"] = texts.apply(lambda value: len(set(str(value).split(" "))))
    dataframe["avg_sentence_length"] = dataframe['word_count'] / dataframe['sentence_count']

    # Same column order as before: lexical counts first, then the POS-based ones.
    helper_columns = (
        ('num_punctuations', punctuation_totals),
        ('num_stopwords', stopword_totals),
        ('num_words_in_quotes', quoted_word_totals),
        ('num_nouns', nouns),
        ('num_verbs', verbs),
        ('num_adjectives', adjectives),
        ('num_discourse_relations', discourse),
    )
    for column_name, values in helper_columns:
        dataframe[column_name] = values

    dataframe['textblob_sentiment'] = texts.apply(lambda value: TextBlob(value).sentiment.polarity)

    return dataframe

In [49]:
# Running create_features and output a new csv
# Each split (train / test / validation) is enriched once and written to its own file.

train_data = create_features(train_data)
train_data.to_csv("../Data/Data with added features/train_data_with_added_features.csv")

test_data = create_features(test_data)
test_data.to_csv("../Data/Data with added features/test_data_with_added_features.csv")

# This step previously repeated the train split, so the validation split never
# received the added features and no validation file was written.
val_data = create_features(val_data)
val_data.to_csv("../Data/Data with added features/val_data_with_added_features.csv")