In [1]:
import json
import math
import numpy as np
from collections import Counter

In [2]:
#DATA LOADING
with open("Project 1 Materials\proj1_data.json") as fp:
    data = json.load(fp)

In [3]:
#PRIMARY VARIABLES
X = []
y = []
training_X = []
training_Y = []
cross_validation_X = []
cross_validation_Y = []
testing_X = []
testing_Y = []

In [4]:
#TEXT PREPROCESSING

#Splits a given text into individual tokens
def splitText(text):
    lowerCaseText =  text.lower()
    splitText = lowerCaseText.split()
    return splitText

#Find top 160 frequently occurring word in 'data'
def topFrequencies(dictionaryList, num):
    countVariable = Counter()
    for item in dictionaryList:
        splitted_text = splitText(item['text'])
        for word in splitted_text:
            countVariable[word] += 1
    return dict(countVariable.most_common(num))

#Convert list of key-value pairs (word, frequency) to (word, ranking)
def convertFrequenciesToRankings(list):
    i = 0
    newList = {}
    for word in list:
        newList[word] = i
        i+=1
    return newList

#Construct a 160 word count vector from a comment
def construct160WordCountVector(text):
    vector = [0]*160
    splitted_text = splitText(text)
    for word in splitted_text:
        i = rankingOfMostFrequent160Words.get(word, -1)
        if (i != -1): vector[i] += 1
    return vector

#Construct a 60 word count vector from a comment
def construct60WordCountVector(text):
    vector = [0]*60
    splitted_text = splitText(text)
    for word in splitted_text:
        i = rankingOfMostFrequent60Words.get(word, -1)
        if (i != -1): vector[i] += 1
    return vector

mostFrequent160Words = topFrequencies(data, 160)
rankingOfMostFrequent160Words = convertFrequenciesToRankings(mostFrequent160Words)

mostFrequent60Words = topFrequencies(data, 60)
rankingOfMostFrequent60Words = convertFrequenciesToRankings(mostFrequent60Words)


#Extra feature 1: Number of words per comment
#Count the number of words in a comment
def countWord(text):
    words = text.split()
    num_words = len(words)
    return num_words
    

#Extra feature 2: Average length of all words in a comment
#Calculate the average length of words in a comment
def averageWordLength(text):
    words = text.split()
    numWords = len(words)
    totalNumLetters = len(text) - text.count(' ')
    avgNumLetters = totalNumLetters/numWords
    return avgNumLetters

In [5]:
#STANDARD DATASET CONSTRUCTION FUNCTIONS

#Constructs a dataset with the prescribed features + a 160 word count feature
def construct_dataset_1(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        for key, value in p.items():
            if (key == "popularity_score"):
                y.append(value)
            elif (key == "is_root"):
                if (value == True): training_example.append(1)
                elif (value == False): training_example.append(0)
            elif (key == "text"):
                vector = construct160WordCountVector(value)
                training_example.extend(vector)
            else:
                training_example.append(value)
        X.append(training_example)
    return X,y

#Constructs a dataset with the prescribed features + a 60 word count feature
def construct_dataset_2(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        for key, value in p.items():
            if (key == "popularity_score"):
                y.append(value)
            elif (key == "is_root"):
                if (value == True): training_example.append(1)
                elif (value == False): training_example.append(0)
            elif (key == "text"):
                vector = construct60WordCountVector(value)
                training_example.extend(vector)
            else:
                training_example.append(value)
        X.append(training_example)
    return X,y

In [6]:
#DATASET CONSTRUCTION FUNCTIONS WITH EXTRA FEATURES

#Constructs a dataset with the prescribed features + a 160 word count feature + 2 additional features
def construct_dataset_3(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        
        for key, value in p.items():
            if (key == "popularity_score"):
                y.append(value)
            elif (key == "is_root"):
                if (value == True): training_example.append(1)
                elif (value == False): training_example.append(0)
            elif (key == "text"):
                vector = construct160WordCountVector(value)
                training_example.extend(vector)
                
                wordCount = countWord(value)
                training_example.append(wordCount)
                
                avgWordLength = averageWordLength(value)
                training_example.append(avgWordLength)
            else:
                training_example.append(value)
        X.append(training_example)
    return X,y

#Constructs a dataset with the prescribed features + a 60 word count feature + 2 additional features
def construct_dataset_4(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        for key, value in p.items():
            if (key == "popularity_score"):
                y.append(value)
            elif (key == "is_root"):
                if (value == True): training_example.append(1)
                elif (value == False): training_example.append(0)
            elif (key == "text"):
                vector = construct60WordCountVector(value)
                training_example.extend(vector)
                                
                wordCount = countWord(value)
                training_example.append(wordCount)
                
                avgWordLength = averageWordLength(value)
                training_example.append(avgWordLength)
            else:
                training_example.append(value)
        X.append(training_example)
    return X,y

In [7]:
#DATASET CONSTRUCTION FUNCTIONS WITH BASIC PRESCRIBED FEATURES (EXCLUDES TEXT PROCESSING)

#Constructs a dataset with the prescribed features excluding text features
def construct_dataset_5(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        for key, value in p.items():
            if (key != "text"):
                if (key == "popularity_score"):
                    y.append(value)
                elif (key == "is_root"):
                    if (value == True): training_example.append(1)
                    elif (value == False): training_example.append(0)
                else: training_example.append(value)
        X.append(training_example)
    return X,y

#Constructs a dataset with the prescribed features + 2 extra features but excluding text features
def construct_dataset_6(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        for key, value in p.items():
            if (key == "popularity_score"):
                y.append(value)
            elif (key == "is_root"):
                if (value == True): training_example.append(1)
                elif (value == False): training_example.append(0)
            elif (key == "text"):
                #POSSIBLE TEXT FEATURE
                wordCount = countWord(value)
                training_example.append(wordCount)
                
                #POSSIBLE TEXT FEATURE
                avgWordLength = averageWordLength(value)
                training_example.append(avgWordLength)
                
                #POSSIBLE TEXT FEATURE
                #interactionFeature = math.log(wordCount * p['children']  + 1, 2)
                #training_example.append(interactionFeature)
            else:
                training_example.append(value)
        X.append(training_example)
    return X,y

In [8]:
#DATASET SEPARATION
def split_dataset(X,y):
    train_X = []
    train_Y = []
    cv_X = []
    cv_Y = []
    test_X = []
    test_Y = []
    for i in range(10000):
        train_X.append(X[i])
        train_Y.append(y[i])
    for i in range(1000):
        cv_X.append(X[i + 10000])
        cv_Y.append(y[i + 10000])
    for i in range(1000):
        test_X.append(X[i + 11000])
        test_Y.append(y[i + 11000])
    return train_X, train_Y, cv_X, cv_Y, test_X, test_Y

In [9]:
#GENERATE AND SAVE DATASET 1
X, y = construct_dataset_1(data)
print("Dataset 1 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 1 Split\n")
np.save("training_data_X_1.npy", training_X)
np.save("training_data_Y_1.npy", training_Y)
np.save("cross_validation_data_X_1.npy", cross_validation_X)
np.save("cross_validation_data_Y_1.npy", cross_validation_Y)
np.save("testing_data_X_1.npy", testing_X)
np.save("testing_data_Y_1.npy", testing_Y)
print("Dataset 1 Saved\n")

Dataset 1 Constructed

Dataset 1 Split

Dataset 1 Saved



In [10]:
#GENERATE AND SAVE DATASET 2
X, y = construct_dataset_2(data)
print("Dataset 2 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 2 Split\n")
np.save("training_data_X_2.npy", training_X)
np.save("training_data_Y_2.npy", training_Y)
np.save("cross_validation_data_X_2.npy", cross_validation_X)
np.save("cross_validation_data_Y_2.npy", cross_validation_Y)
np.save("testing_data_X_2.npy", testing_X)
np.save("testing_data_Y_2.npy", testing_Y)
print("Dataset 2 Saved\n")

Dataset 2 Constructed

Dataset 2 Split

Dataset 2 Saved



In [11]:
#GENERATE AND SAVE DATASET 3
X, y = construct_dataset_3(data)
print("Dataset 3 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 3 Split\n")
np.save("training_data_X_3.npy", training_X)
np.save("training_data_Y_3.npy", training_Y)
np.save("cross_validation_data_X_3.npy", cross_validation_X)
np.save("cross_validation_data_Y_3.npy", cross_validation_Y)
np.save("testing_data_X_3.npy", testing_X)
np.save("testing_data_Y_3.npy", testing_Y)
print("Dataset 3 Saved\n")

Dataset 3 Constructed

Dataset 3 Split

Dataset 3 Saved



In [12]:
#GENERATE AND SAVE DATASET 4
X, y = construct_dataset_4(data)
print("Dataset 4 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 4 Split\n")
np.save("training_data_X_4.npy", training_X)
np.save("training_data_Y_4.npy", training_Y)
np.save("cross_validation_data_X_4.npy", cross_validation_X)
np.save("cross_validation_data_Y_4.npy", cross_validation_Y)
np.save("testing_data_X_4.npy", testing_X)
np.save("testing_data_Y_4.npy", testing_Y)
print("Dataset 4 Saved\n")

Dataset 4 Constructed

Dataset 4 Split

Dataset 4 Saved



In [13]:
#GENERATE AND SAVE DATASET 5
X, y = construct_dataset_5(data)
print("Dataset 5 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 5 Split\n")
np.save("training_data_X_5.npy", training_X)
np.save("training_data_Y_5.npy", training_Y)
np.save("cross_validation_data_X_5.npy", cross_validation_X)
np.save("cross_validation_data_Y_5.npy", cross_validation_Y)
np.save("testing_data_X_5.npy", testing_X)
np.save("testing_data_Y_5.npy", testing_Y)
print("Dataset 5 Saved\n")

Dataset 5 Constructed

Dataset 5 Split

Dataset 5 Saved



In [14]:
#GENERATE AND SAVE DATASET 6
X, y = construct_dataset_6(data)
print("Dataset 6 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 6 Split\n")
np.save("training_data_X_6.npy", training_X)
np.save("training_data_Y_6.npy", training_Y)
np.save("cross_validation_data_X_6.npy", cross_validation_X)
np.save("cross_validation_data_Y_6.npy", cross_validation_Y)
np.save("testing_data_X_6.npy", testing_X)
np.save("testing_data_Y_6.npy", testing_Y)
print("Dataset 6 Saved\n")

Dataset 6 Constructed

Dataset 6 Split

Dataset 6 Saved



In [18]:
#COUNTING DISPARITY OF BOOLEAN FEATURES AND AVERAGES OF SCALAR FEATURES
A = np.load("training_data_X_2.npy")
numRTrue = 0   #Count for isRoot = True
numRFalse = 0  #Count for isRoot = False
numCTrue = 0   #Count for Controversiality = True
numCFalse = 0  #Count for Controversiality = False
i = 0
for line in A:
    if (line[60] == 1): numRTrue+=1
    else: numRFalse += 1
    if (line[61] == 1): numCTrue+=1
    else: numCFalse += 1
    i+=1
print("Number of 'is_root' features that are true = ", numRTrue)
print("Number of 'is_root' features that are false = ", numRFalse)
print("Number of 'controversy' features that are true = ", numCTrue)
print("Number of 'controversy' features that are false = ", numCFalse)

A = np.load("training_data_X_3.npy")
avgWordCount = 0
avgWordLength = 0
avgControversy = 0
avgIsRoot = 0
avgChildren = 0
counter = 0
for x in A:
    counter += 1
    avgWordCount += x[161]
    avgWordLength += x[162]
    avgIsRoot += x[163]
    avgControversy += x[164]
    avgChildren += x[165]
avgWordCount = avgWordCount/counter
avgWordLength = avgWordLength/counter
avgIsRoot = avgIsRoot/counter
avgControversy = avgControversy/counter
avgChildren = avgChildren/counter

print("Average word count = ", avgWordCount)
print("Average word length = ", avgWordLength)
print("Average is root = ", avgIsRoot)
print("Average controversy = ", avgControversy)
print("Average children = ", avgChildren)

Number of 'is_root' features that are true =  248
Number of 'is_root' features that are false =  9752
Number of 'controversy' features that are true =  4207
Number of 'controversy' features that are false =  5793
Average word count =  24.004
Average word length =  5.288804888570435
Average is root =  0.4207
Average controversy =  0.0117
Average children =  0.4024
