In [1]:
import json
import numpy as np
from collections import Counter

In [2]:
#DATA LOADING
with open("Project 1 Materials\proj1_data.json") as fp:
    data = json.load(fp)

In [3]:
#PRIMARY VARIABLES
X = []
y = []
training_X = []
training_Y = []
cross_validation_X = []
cross_validation_Y = []
testing_X = []
testing_Y = []

In [4]:
#TEXT PREPROCESSING

#Splits a given text into individual tokens
def splitText(text):
    lowerCaseText =  text.lower()
    splitText = lowerCaseText.split()
    return splitText

#Find top 160 frequently occurring word in 'data'
def topFrequencies(dictionaryList, num):
    countVariable = Counter()
    for item in dictionaryList:
        splitted_text = splitText(item['text'])
        for word in splitted_text:
            countVariable[word] += 1
    return dict(countVariable.most_common(num))

#Convert list of key-value pairs (word, frequency) to (word, ranking)
def convertFrequenciesToRankings(list):
    i = 0
    newList = {}
    for word in list:
        newList[word] = i
        i+=1
    return newList

#Construct a 160 word count vector from a comment
def construct160WordCountVector(text):
    vector = [0]*160
    splitted_text = splitText(text)
    for word in splitted_text:
        i = rankingOfMostFrequent160Words.get(word, -1)
        if (i != -1): vector[i] += 1
    return vector

#Construct a 60 word count vector from a comment
def construct60WordCountVector(text):
    vector = [0]*60
    splitted_text = splitText(text)
    for word in splitted_text:
        i = rankingOfMostFrequent60Words.get(word, -1)
        if (i != -1): vector[i] += 1
    return vector

mostFrequent160Words = topFrequencies(data, 160)
rankingOfMostFrequent160Words = convertFrequenciesToRankings(mostFrequent160Words)

mostFrequent60Words = topFrequencies(data, 60)
rankingOfMostFrequent60Words = convertFrequenciesToRankings(mostFrequent60Words)

In [5]:
#DATASET CONSTRUCTION FUNCTIONS

#Constructs a dataset with the prescribed features + a 160 word count feature
def construct_dataset_1(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        
        for key, value in p.items():
            if (key == "popularity_score"):
                y.append(value)
            elif (key == "is_root"):
                if (value == True): training_example.append(1)
                elif (value == False): training_example.append(0)
            elif (key == "text"):
                vector = construct160WordCountVector(value)
                training_example.extend(vector)
            else:
                training_example.append(value)
        X.append(training_example)
    return X,y

#Constructs a dataset with the prescribed features + a 60 word count feature
def construct_dataset_2(d):
    X = []
    y = []
    for p in d:
        training_example = [1]
        for key, value in p.items():
            if (key == "popularity_score"):
                y.append(value)
            elif (key == "is_root"):
                if (value == True): training_example.append(1)
                elif (value == False): training_example.append(0)
            elif (key == "text"):
                vector = construct60WordCountVector(value)
                training_example.extend(vector)
            else:
                training_example.append(value)
        X.append(training_example)
    return X,y

In [6]:
#DATASET SEPARATION
def split_dataset(X,y):
    train_X = []
    train_Y = []
    cv_X = []
    cv_Y = []
    test_X = []
    test_Y = []
    for i in range(10000):
        train_X.append(X[i])
        train_Y.append(y[i])
    for i in range(1000):
        cv_X.append(X[i + 10000])
        cv_Y.append(y[i + 10000])
    for i in range(1000):
        test_X.append(X[i + 11000])
        test_Y.append(y[i + 11000])
    return train_X, train_Y, cv_X, cv_Y, test_X, test_Y

In [7]:
#GENERATE AND SAVE DATASET 1
X, y = construct_dataset_1(data)
print("Dataset 1 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 1 Split\n")
np.save("training_data_X_1.npy", training_X)
np.save("training_data_Y_1.npy", training_Y)
np.save("cross_validation_data_X_1.npy", cross_validation_X)
np.save("cross_validation_data_Y_1.npy", cross_validation_Y)
np.save("testing_data_X_1.npy", testing_X)
np.save("testing_data_Y_1.npy", testing_Y)
print("Dataset 1 Saved\n")

Dataset 1 Constructed

Dataset 1 Split

Dataset 1 Saved



In [8]:
#GENERATE AND SAVE DATASET 2
X, y = construct_dataset_2(data)
print("Dataset 2 Constructed\n")
training_X, training_Y, cross_validation_X, cross_validation_Y, testing_X, testing_Y = split_dataset(X,y)
print("Dataset 2 Split\n")
np.save("training_data_X_2.npy", training_X)
np.save("training_data_Y_2.npy", training_Y)
np.save("cross_validation_data_X_2.npy", cross_validation_X)
np.save("cross_validation_data_Y_2.npy", cross_validation_Y)
np.save("testing_data_X_2.npy", testing_X)
np.save("testing_data_Y_2.npy", testing_Y)
print("Dataset 2 Saved\n")

Dataset 2 Constructed

Dataset 2 Split

Dataset 2 Saved



In [11]:
#COUNTING PROPORTIONS OF BOOLEAN FEATURES
numRTrue = 0   #Count for isRoot = True
numRFalse = 0  #Count for isRoot = False
numCTrue = 0   #Count for Controversiality = True
numCFalse = 0  #Count for Controversiality = False
i = 0
#Here X is based of dataset 2 since that was the last to be generated
for line in X:
    if (line[60] == 1): numRTrue+=1
    else: numRFalse += 1
    if (line[61] == 1): numCTrue+=1
    else: numCFalse += 1
    i+=1
print(numRTrue)
print(numRFalse)
print(numRTrue + numRFalse)
print(numCTrue)
print(numCFalse)
print(numCTrue + numCFalse)

291
11709
12000
4993
7007
12000
