In [1]:
import json
import pandas as pd
import numpy as np
import re
from collections import Counter
from functools import reduce
import operator
from spellchecker import SpellChecker

#Load data in json format and store in a dataframe
with open("proj1_data.json") as fp:
    data = json.load(fp)
    df = pd.DataFrame(data)

In [2]:
#Convert true-false to 1-0
df["is_root"] = df["is_root"].astype(int)
#Convert all text to lower cases
df["text"]= [x.lower() for x in df["text"]]
#Parse text where there's a space
df["text"]= [x.split() for x in df["text"]]

In [3]:
#First 10000 data points as training set
train = df.iloc[0:10000,:]
#10000 to 11000 as validation set
validation = df.iloc[10000:11000,:]
#Last 1000 as test set
test = df.iloc[11000:12000,:]
#Display the first 10 points of the training data
train.head(10)

Unnamed: 0,children,controversiality,is_root,popularity_score,text
0,0,0,0,1.254698,"[its, raining, sideways]"
1,0,0,0,0.509813,"[wheel, of, time, reader, confirmed!]"
2,0,0,1,0.370827,"[the, jungle, book, of, pussy]"
3,0,0,0,-0.272843,"[i'm, just, making, this, thread, since, there..."
4,0,0,1,0.56015,"[hi, there,, looks, like, you're, wanting, to,..."
5,0,0,1,0.696554,"[when, there, is, a, line, to, exit, a, store,..."
6,0,0,1,1.050417,"[it, is, none, of, their, business.]"
7,1,0,1,0.310543,"[infinite, ammo, and, the, absurd, number, of,..."
8,1,1,0,-1.208735,"[if, you're, trying, to, argue, for, a, claim,..."
9,1,0,0,1.1237,"[oh, god, it's, all, my, cousins, from, the, s..."


In [5]:
def preprocess(dataset,words):
    
    # Feature: Does the comment contain a question mark
    qmarks = np.zeros((dataset.shape[0]))
    # Feature: Normalized comment length (number of words)
    n_words = np.zeros((dataset.shape[0]))
    # Feature: Avg number of letters per words
    letters_per_word = np.zeros((dataset.shape[0]))
    # Feature: Number of punctuation signs per word (, . ! ? : ;)
    punctuation_count = np.zeros((dataset.shape[0]))
    punct = [',', '.', '!', '?', ':', ';']
    # Feature: Most common word count
    l = reduce(operator.concat, dataset["text"])
    most_common_words = [word for word, word_count in Counter(l).most_common(words)]
    zeros = np.zeros(shape = (dataset.shape[0], words))
    word_count_features = pd.DataFrame(zeros, columns = most_common_words)    
    # Feature: Misspelled words
    misspelled_feature = np.zeros(dataset.shape[0])
    spell = SpellChecker()
    #Feature: Swear words
    swear_words = pd.read_csv("swearWords.csv")
    s_words = np.zeros(dataset.shape[0])

    
    # Iterate over comments
    for i in range(dataset.shape[0]):
        txt = dataset.iloc[i]["text"]
        # Iterate over words
        for w in txt:
            # most common words
            for target in most_common_words:
                if w == target:
                    word_count_features.iloc[i][target] += 1
                    
            # misspelled count
            [re.sub(r'[^a-zA-Z]', '', x) for x in txt]
            misspelled_words = spell.unknown(txt)
            misspelled_feature[i] = len(misspelled_words)
            
            #swear words count
            for target in swear_words:
                if w == target:
                    s_words[i]+=1
                    
            # punctuation count
            for x in punct:
                punctuation_count[i] += w.count(x)
                
            # question counter
            if "?" in w:
                qmarks[i] = 1
                
            # comment length
            n_words[i] += 1
            # number of letters
            letters_per_word[i] += len(w)
            
            
    # Get average number of letters per word
    for i in range(dataset.shape[0]):
        letters_per_word[i] = letters_per_word[i]/n_words[i]
        
    # Get average punctuation marks per word
    for i in range(dataset.shape[0]):
        punctuation_count[i] = punctuation_count[i]/n_words[i]
    
    # Normalize word length (divide by average)
    n_words = n_words/np.mean(n_words)
                
    # Add feature columns 
    # Most common words
    dataset = pd.concat([dataset, word_count_features], axis=1)
    # Misspelled words count
    dataset = dataset.assign(misspelled=pd.Series(misspelled_feature).values)
    # Swear words
    dataset = dataset.assign(s_words=pd.Series(s_words).values)
    # Question marks
    dataset = dataset.assign(has_question=pd.Series(qmarks).values.astype(int))
    # Number of words
    dataset = dataset.assign(n_words=pd.Series(n_words).values)
    # Avg letters per word
    dataset = dataset.assign(letters_per_word=pd.Series(letters_per_word).values)
    # Punctuation per word
    dataset = dataset.assign(punctuation_count=pd.Series(punctuation_count).values)
                
    return dataset

preprocess(train,10)

Unnamed: 0,children,controversiality,is_root,popularity_score,text,the,i,a,to,and,...,you,that,in,it,misspelled,s_words,has_question,n_words,letters_per_word,punctuation_count
0,0,0,0,1.254698,"[its, raining, sideways]",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.124979,6.000000,0.000000
1,0,0,0,0.509813,"[wheel, of, time, reader, confirmed!]",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0,0.208299,5.400000,0.200000
2,0,0,1,0.370827,"[the, jungle, book, of, pussy]",1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0,0.208299,4.000000,0.000000
3,0,0,0,-0.272843,"[i'm, just, making, this, thread, since, there...",1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,2.0,0.0,0,0.583236,4.000000,0.071429
4,0,0,1,0.560150,"[hi, there,, looks, like, you're, wanting, to,...",1.0,0.0,2.0,3.0,1.0,...,1.0,0.0,0.0,0.0,14.0,0.0,1,2.166306,6.250000,0.230769
5,0,0,1,0.696554,"[when, there, is, a, line, to, exit, a, store,...",5.0,4.0,10.0,4.0,3.0,...,1.0,0.0,3.0,5.0,8.0,1.0,0,4.749208,3.710526,0.078947
6,0,0,1,1.050417,"[it, is, none, of, their, business.]",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0,0.249958,4.000000,0.166667
7,1,0,1,0.310543,"[infinite, ammo, and, the, absurd, number, of,...",1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0,0.541576,4.615385,0.076923
8,1,1,0,-1.208735,"[if, you're, trying, to, argue, for, a, claim,...",0.0,0.0,2.0,3.0,0.0,...,0.0,1.0,0.0,0.0,3.0,0.0,0,0.958174,4.478261,0.086957
9,1,0,0,1.123700,"[oh, god, it's, all, my, cousins, from, the, s...",2.0,3.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,7.0,0.0,0,1.666389,4.075000,0.100000
