In [1]:
from collections import Counter
from nltk import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from string import ascii_lowercase, ascii_uppercase
import arff
import json
import nltk
import os

# Define features

In [11]:
# ARFF attribute declarations as (name, type) pairs; this list is passed as
# data['attributes'] to arff.dumps when the .arff files are written.
# A type is either the string 'NUMERIC' or a list of the allowed nominal values.
# Each data row is built from the tuple returned by extract_features followed
# by the label, so the positions here are expected to line up with that tuple.
features = [('id', 'NUMERIC'),
            ('number_of_words', 'NUMERIC'),
            ('average word length', 'NUMERIC'),
            ('length of the longest word', 'NUMERIC'),
            ('whether start with number', ['True', 'False']),
            ('whether start with who/what/why/where/when/how', ['True', 'False']),
            ('number_of_character_1_grams', 'NUMERIC'),
            ('number_of_character_2_grams', 'NUMERIC'),
            ('number_of_character_3_grams', 'NUMERIC'),
            ('clindex', 'NUMERIC'),
            ('formality_measure', 'NUMERIC'),
            ('is_exclamation_question_mark_present', ['0', '1']),
            ('lix', 'NUMERIC'),
            ('number_of_uppercase_words', 'NUMERIC'),
            ('rix', 'NUMERIC'),
            ('number_of_word_1_grams', 'NUMERIC'),
#             ('number_of_contractions','NUMERIC'),
            # 'label' must stay last: the label is appended to the end of each row.
            ('label', ['0', '1']),
]

# New features

In [3]:
# https://stackoverflow.com/questions/10677020/real-word-count-in-nltk
def number_of_words(text):
    """Return the number of tokens that ``word_tokenize`` produces for ``text``.

    Every token returned by the tokenizer is counted; nothing is filtered out.
    """
    # TODO: consider RegexpTokenizer(r'\w+') so that only word-like tokens count.
    return len(word_tokenize(text))


def number_of_character_1_grams(text):
    """Return how many character 1-grams ``ngrams`` yields for ``text``."""
    return sum(1 for _ in ngrams(list(text), 1))


def number_of_character_2_grams(text):
    """Return how many character 2-grams ``text`` contains.

    Args:
        text: The string to analyse.

    Returns:
        int: The number of 2-grams; ``0`` when ``text`` has fewer than two
        characters. The result is always an integer so that it can be written
        to a NUMERIC attribute (an empty text previously produced ``[]``).
    """
    # Fewer than two characters cannot form a 2-gram; answer without calling ngrams.
    if len(text) < 2:
        return 0
    characters = list(text)
    return sum(1 for _ in ngrams(characters, 2))


def number_of_character_3_grams(text):
    """Return how many character 3-grams ``ngrams`` yields for ``text``.

    Texts of length 0 or 1 short-circuit to ``0`` without calling ``ngrams``.
    """
    if len(text) < 2:
        return 0
    return sum(1 for _ in ngrams(list(text), 3))


# https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
def clindex(text):
    """Compute the Coleman-Liau index of ``text``.

    Letters are the characters of the lower-cased text found in
    ``ascii_lowercase``; sentences come from ``sent_tokenize`` and words from
    ``number_of_words``. With zero words both ratios are taken as 0, so the
    result is the formula's constant term.
    """
    letter_count = sum(1 for ch in text.lower() if ch in ascii_lowercase)
    sentence_count = len(sent_tokenize(text))
    word_count = number_of_words(text)
    if word_count:
        # Letters per 100 words and sentences per 100 words.
        letters_per_100_words = letter_count / word_count * 100
        sentences_per_100_words = sentence_count / word_count * 100
    else:
        # TODO should both ratios be 0 when there are no words?
        letters_per_100_words = 0
        sentences_per_100_words = 0
    return 0.0588 * letters_per_100_words - 0.296 * sentences_per_100_words - 15.8


# https://stackoverflow.com/questions/10674832/count-verbs-nouns-and-other-parts-of-speech-with-pythons-nltk
def formality_measure(text):
    """Score ``text`` from part-of-speech tag counts.

    The lower-cased text is tokenized and tagged with NLTK. The counts of the
    tags in ``added_tags`` are summed, the counts of the tags in
    ``subtracted_tags`` are subtracted, 100 is added and the total is halved.
    Raw tag counts are used, not relative frequencies.
    """
    added_tags = ('NN', 'NNP', 'NNS', 'JJ', 'JJR', 'JJS', 'IN', 'DT', 'PDT', 'WDT')
    subtracted_tags = ('PRP', 'PRP$', 'WP', 'WP$', 'VB', 'VBD', 'VBG', 'VBN',
                       'VBP', 'VBZ', 'RB', 'RBR', 'RBS', 'WRB', 'UH')

    tokens = nltk.word_tokenize(text.lower())
    tagged_tokens = nltk.pos_tag(nltk.Text(tokens))
    tag_counts = Counter(tag for _, tag in tagged_tokens)

    added = sum(tag_counts[tag] for tag in added_tags)
    subtracted = sum(tag_counts[tag] for tag in subtracted_tags)
    return (added - subtracted + 100) / 2


def is_exclamation_question_mark_present(text):
    """Return 1 if ``text`` contains '!' or '?', otherwise 0."""
    if '!' in text or '?' in text:
        return 1
    return 0


def lix(text):
    """Return the number of words per sentence in ``text``.

    Returns 0 when ``sent_tokenize`` finds no sentence.

    NOTE(review): only the words-per-sentence ratio is computed here; confirm
    whether a long-word term was also intended for this feature.
    """
    sentence_count = len(sent_tokenize(text))
    # TODO should we return 0?
    if sentence_count == 0:
        return 0
    return number_of_words(text) / sentence_count


def number_of_uppercase_words(text):
    """Count the tokens of ``text`` whose first character is an ASCII capital.

    Only the first character of each ``word_tokenize`` token is inspected, so
    capitalized words count as well as fully upper-case ones.
    """
    return sum(1 for token in word_tokenize(text) if token[0] in ascii_uppercase)

def rix(text):
    """Return the number of long words (7+ characters) per sentence.

    Returns 0 when ``sent_tokenize`` finds no sentence.
    """
    long_word_count = sum(1 for token in word_tokenize(text) if len(token) > 6)
    sentence_count = len(sent_tokenize(text))
    # TODO should we return 0?
    if sentence_count == 0:
        return 0
    return long_word_count / sentence_count


def number_of_word_1_grams(text):
    """Return how many word 1-grams ``ngrams`` yields for the tokens of ``text``."""
    tokens = word_tokenize(text)
    return sum(1 for _ in ngrams(tokens, 1))


# def number_of_contractions(text):
#     stripped_contractions = ['aint', 'amnt', 'arent', 'cant', 'couldve', 'couldnt', 'couldntve',
#                 'didnt', 'doesnt', 'dont', 'gonna', 'gotta', 'hadnt', 'hadntve', 'hasnt',
#                 'havent','hell', 'hes', 'hesnt', 'howd', 'howll',
#                 'hows', 'id', 'idnt', 'idve', 'ill', 'im', 'ive', 'ivent', 'isnt',
#                 'itd', 'itll', 'its', 'itsnt', 'mightnt','mightve', 'mustnt', 'mustntve', 'mustve', 'neednt', 'ol', 'oughtnt',
#                 'shant', 'shed', 'shes', 'shouldve','shouldnt', 'shouldntve', 'somebodydve', 'somebodydntve', 'somebodys',
#                 'someonell', 'someones','somethingd', 'somethingdnt', 'somethingdntve', 'somethingdve', 'somethingll',
#                 'somethings', 'thatll', 'thats', 'thatd', 'thered', 'therednt', 'theredntve',
#                 'theredve', 'therere', 'theres', 'theyd', 'theydnt', 'theydntve', 'theydve',
#                 'theyll', 'theyontve', 'theyre', 'theyve', 'theyvent', 'wasnt',
#                 'wed', 'wedve', 'wednt', 'wedntve', 'well', 'wontve', 'were', 'weve', 'werent',
#                 'whatd', 'whatll', 'whats', 'whatve', 'whens', 'whered', 'wheres',
#                 'whereve', 'whod', 'whodve', 'wholl', 'whore', 'whos', 'whove', 'whyd', 'whyre',
#                 'whys', 'wont', 'wontve', 'wouldve', 'wouldnt', 'wouldntve', 'yall', 'yallllve', 'yallre', 'yallllvent', 'yaint',
#                 'youd', 'youdve', 'youll', 'youre', 'yourent', 'youve', 'youvent']
#     words = word_tokenize(text)
#     num = len([word.replace("'","") for word in words if word in stripped_contractions])
#     return(num)

In [4]:
def extract_features(tweet_id, text):
    """Build the feature row for one post.

    Args:
        tweet_id: Identifier of the post; copied into the first position.
        text: The post text.

    Returns:
        tuple | bool: A 16-element tuple (id, word count, average word length,
        longest word length, starts-with-digit flag, starts-with-question-word
        flag, then the values of the individual feature functions), or
        ``False`` when ``text`` contains no non-space token.

    Changes from the earlier version:
        * ``number_of_contractions`` is no longer called. That function exists
          only as commented-out code, so the call raised ``NameError`` and
          produced one more value than there are declared attributes.
        * Empty tokens (from whitespace-only text or repeated spaces) are
          dropped. Previously they raised ``IndexError`` on ``word[0]`` or were
          counted as zero-length words.
    """
    # Split on single spaces as before, discarding the empty strings that
    # consecutive spaces (or an all-whitespace text) would otherwise produce.
    words = [token.lower() for token in text.strip().split(' ') if token]
    if not words:
        return False

    first_word = words[0]
    starts_with_number = first_word[0].isdigit()
    starts_with_question_word = first_word in ['who', 'what', 'why', 'where', 'when', 'how']

    word_lengths = [len(word) for word in words]
    word_count = len(words)
    average_word_length = sum(word_lengths) * 1.0 / word_count
    longest_word_length = max(word_lengths)

    return (
        tweet_id,
        word_count,
        average_word_length,
        longest_word_length,
        starts_with_number,
        starts_with_question_word,
        number_of_character_1_grams(text),
        number_of_character_2_grams(text),
        number_of_character_3_grams(text),
        clindex(text),
        formality_measure(text),
        is_exclamation_question_mark_present(text),
        lix(text),
        number_of_uppercase_words(text),
        rix(text),
        number_of_word_1_grams(text),
    )

In [5]:
# __file__ is not available inside a notebook, so paths are resolved from the
# current working directory: https://github.com/ipython/ipython/issues/10123
directory_path = os.getcwd()
# Dataset directory, relative to the working directory; keeps the trailing slash
# because file names are appended to it by plain string concatenation.
dataset_no_figures_path = ''.join((directory_path, '/../data/dataset_no_figures/'))

In [6]:
# Map each training instance id to its feature tuple.
id_features = {}
with open(dataset_no_figures_path + 'instances_train.jsonl', 'r') as instances_file:
    for raw_line in instances_file:
        instance = json.loads(raw_line)
        post_text = instance['postText'][0]
        # Instances whose first postText entry is empty are skipped.
        if len(post_text) == 0:
            continue
        row = extract_features(instance['id'], post_text)
        # extract_features may return a falsy value; such instances are skipped too.
        if row:
            # setdefault keeps the first row seen for an id.
            id_features.setdefault(instance['id'], row)
print(len(id_features))

17535


In [7]:
# Map each training instance id that has features to its 0/1 label.
id_labels = {}
with open(dataset_no_figures_path + 'truth_train.jsonl', 'r') as truth_file:
    for raw_line in truth_file:
        truth = json.loads(raw_line)
        # Only the first character of truthClass is inspected: 'n' maps to 0,
        # anything else maps to 1.
        label = 0 if truth['truthClass'][0] == 'n' else 1
        if truth['id'] in id_features:
            id_labels.setdefault(truth['id'], label)
print(len(id_labels))

17535


In [8]:
# Assemble the ARFF document for the training set: one row per labelled id,
# made of its feature tuple followed by the label rendered as a string.
data = {
    'attributes': features,
    'description': '',
    'relation': 'team-one',
    'data': [],
}
for instance_id, label in id_labels.items():
    row = list(id_features[instance_id])
    row.append(str(label))
    data['data'].append(row)

with open(dataset_no_figures_path + 'sample_train.arff', 'w') as arff_file:
    arff_file.write(arff.dumps(data))

# Generate test .arff file

In [9]:
# Rebuild both mappings for the test set; the label of every test instance is
# the placeholder '?'.
id_features = {}
id_labels = {}
with open(dataset_no_figures_path + 'instances_test.jsonl', 'r') as instances_file:
    for raw_line in instances_file:
        instance = json.loads(raw_line)
        post_text = instance['postText'][0]
        # Instances whose first postText entry is empty are skipped.
        if len(post_text) == 0:
            continue
        row = extract_features(instance['id'], post_text)
        if row:
            id_features.setdefault(instance['id'], row)
        # Only ids that actually have features receive a label entry.
        if instance['id'] in id_features:
            id_labels.setdefault(instance['id'], '?')
print(len(id_features))

4408


In [10]:
# Assemble the ARFF document for the test set: one row per id, made of its
# feature tuple followed by the label rendered as a string.
data = {
    'attributes': features,
    'description': '',
    'relation': 'team-one',
    'data': [],
}
for instance_id, label in id_labels.items():
    row = list(id_features[instance_id])
    row.append(str(label))
    data['data'].append(row)

with open(dataset_no_figures_path + 'sample_test.arff', 'w') as arff_file:
    arff_file.write(arff.dumps(data))