In [1]:
import json
import pandas as pd
import numpy as np
import re
from collections import Counter
from functools import reduce
import operator
from spellchecker import SpellChecker
import nltk

#Load data in json format and store in a dataframe
with open("proj1_data.json") as fp:
    data = json.load(fp)
    df = pd.DataFrame(data)

In [2]:
#Convert true-false to 1-0
df["is_root"] = df["is_root"].astype(int)
#Convert all text to lower cases
df["text"]= [x.lower() for x in df["text"]]
#Parse text where there's a space
df["text"]= [x.split() for x in df["text"]]

In [3]:
#First 10000 data points as training set
train = df.iloc[0:10000,:]
#10000 to 11000 as validation set
validation = df.iloc[10000:11000,:]
validation.index -= 10000
#Last 1000 as test set
test = df.iloc[11000:12000,:]
test.index -= 11000
#Display the first 10 points of the training data
train.head(4)

Unnamed: 0,children,controversiality,is_root,popularity_score,text
0,0,0,0,1.254698,"[its, raining, sideways]"
1,0,0,0,0.509813,"[wheel, of, time, reader, confirmed!]"
2,0,0,1,0.370827,"[the, jungle, book, of, pussy]"
3,0,0,0,-0.272843,"[i'm, just, making, this, thread, since, there..."


In [4]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from spellchecker import SpellChecker

def preprocess(dataset, nb_words):
    
    # Feature: Does the comment contain a question mark
    qmarks = np.zeros((dataset.shape[0]))
    # Feature: Normalized comment length (number of words)
    n_words = np.zeros((dataset.shape[0]))
    # Feature: Avg number of letters per words
    letters_per_word = np.zeros((dataset.shape[0]))
    # Feature: Number of punctuation signs per word (, . ! ? : ;)
    punctuation_count = np.zeros((dataset.shape[0]))
    punct = [',', '.', '!', '?', ':', ';']
    # Feature: Most common word count
    l = np.concatenate(dataset["text"])
    most_common_words = [word for word, word_count in Counter(l).most_common(nb_words)]
    zeros = np.zeros(shape = (dataset.shape[0], nb_words))
    word_count_features = pd.DataFrame(zeros, columns = most_common_words)    
    # Feature: Misspelled words
    misspelled_feature = np.zeros(dataset.shape[0])
    spell = SpellChecker()
    #Feature: Swear words
    swear_words = pd.read_csv("swearWords.csv")
    s_words = np.zeros(dataset.shape[0])
    
    
    # Iterate over comments
    for i in range(dataset.shape[0]):
        txt = dataset.iloc[i]["text"]
        # Iterate over words
        for w in txt:
            # most common words
            for target in most_common_words:
                if w == target:
                    word_count_features.iloc[i][target] += 1
            
            #swear words count
            for target in swear_words:
                if w == target:
                    s_words[i]+=1
            
            # punctuation count
            for x in punct:
                punctuation_count[i] += w.count(x)
                
            # question counter
            if "?" in w:
                qmarks[i] = 1
                
            # comment length
            n_words[i] += 1
            # number of letters
            letters_per_word[i] += len(w)
            
    # misspelled count   
    for i in range(dataset.shape[0]):
        new = [re.sub(r"^\W+|\W+$","", word) for word in dataset.iloc[i]["text"]]
        misspelled_words = spell.unknown(new)
        misspelled_feature[i] = len(misspelled_words)
    
    # Get average number of letters per word
    for i in range(dataset.shape[0]):
        letters_per_word[i] = letters_per_word[i]/n_words[i]
        
    # Get average punctuation marks per word
    for i in range(dataset.shape[0]):
        punctuation_count[i] = punctuation_count[i]/n_words[i]
        
    # Add feature columns 
    # Most common words
    dataset = pd.concat([dataset, word_count_features], axis=1)
    # Misspelled words count
    dataset = dataset.assign(misspelled=pd.Series(misspelled_feature).values)
    # Swear words
    dataset = dataset.assign(s_words=pd.Series(s_words).values)
    # Question marks
    dataset = dataset.assign(has_question=pd.Series(qmarks).values.astype(int))
    # Avg letters per word
    dataset = dataset.assign(letters_per_word=pd.Series(letters_per_word).values)
    # Punctuation per word
    dataset = dataset.assign(punctuation_count=pd.Series(punctuation_count).values)
    #Add bias term
    ones = np.ones((dataset.shape[0]))
    dataset = dataset.assign(bias = pd.Series(ones).values)
     #Drop text column
    dataset = dataset.drop(["text"], axis=1)
    #Move y value to the end
    dataset = dataset[["bias"] + [c for c in dataset if c not in ["popularity_score","bias"]] + ["popularity_score"]]
    return (dataset, most_common_words)

In [6]:
train = preprocess(train, 160)

In [45]:
train[0].iloc[:,164:170]

Unnamed: 0,misspelled,s_words,has_question,letters_per_word,punctuation_count,popularity_score
0,0.0,0.0,0,6.000000,0.000000,1.254698
1,0.0,0.0,0,5.400000,0.200000,0.509813
2,0.0,1.0,0,4.000000,0.000000,0.370827
3,1.0,0.0,0,4.000000,0.071429,-0.272843
4,4.0,0.0,1,6.250000,0.230769,0.560150
5,1.0,1.0,0,3.710526,0.078947,0.696554
6,0.0,0.0,0,4.000000,0.166667,1.050417
7,1.0,0.0,0,4.615385,0.076923,0.310543
8,1.0,0.0,0,4.478261,0.086957,-1.208735
9,3.0,0.0,0,4.075000,0.100000,1.123700


In [6]:
def feature_extraction(dataset, words):
    
    # Feature: Does the comment contain a question mark
    qmarks = np.zeros((dataset.shape[0]))
    # Feature: Normalized comment length (number of words)
    n_words = np.zeros((dataset.shape[0]))
    # Feature: Avg number of letters per words
    letters_per_word = np.zeros((dataset.shape[0]))
    # Feature: Number of punctuation signs per word (, . ! ? : ;)
    punctuation_count = np.zeros((dataset.shape[0]))
    punct = [',', '.', '!', '?', ':', ';']
    # Feature: Most common word count
    most_common_words = words
    zeros = np.zeros(shape = (dataset.shape[0], len(words)))
    word_count_features = pd.DataFrame(zeros, columns = most_common_words)    
    # Feature: Misspelled words
    misspelled_feature = np.zeros(dataset.shape[0])
    spell = SpellChecker()
    #Feature: Swear words
    swear_words = pd.read_csv("swearWords.csv")
    s_words = np.zeros(dataset.shape[0])
    
    
    # Iterate over comments
    for i in range(dataset.shape[0]):
        txt = dataset.iloc[i]["text"]
        # Iterate over words
        for w in txt:
            # most common words
            for target in most_common_words:
                if w == target:
                    word_count_features.iloc[i][target] += 1
            
            #swear words count
            for target in swear_words:
                if w == target:
                    s_words[i]+=1
            
            # punctuation count
            for x in punct:
                punctuation_count[i] += w.count(x)
                
            # question counter
            if "?" in w:
                qmarks[i] = 1
                
            # comment length
            n_words[i] += 1
            # number of letters
            letters_per_word[i] += len(w)
            
    # misspelled count   
    for i in range(dataset.shape[0]):
        new = [re.sub(r'[^a-zA-Z]', '', x) for x in dataset.iloc[i]["text"]]
        misspelled_words = spell.unknown(new)
        misspelled_feature[i] = len(misspelled_words)
    
    # Get average number of letters per word
    for i in range(dataset.shape[0]):
        letters_per_word[i] = letters_per_word[i]/n_words[i]
        
    # Get average punctuation marks per word
    for i in range(dataset.shape[0]):
        punctuation_count[i] = punctuation_count[i]/n_words[i]
                    
    # Add feature columns 
    # Most common words
    dataset = pd.concat([dataset, word_count_features], axis=1)
    # Misspelled words count
    dataset = dataset.assign(misspelled=pd.Series(misspelled_feature).values)
    # Swear words
    dataset = dataset.assign(s_words=pd.Series(s_words).values)
    # Question marks
    dataset = dataset.assign(has_question=pd.Series(qmarks).values.astype(int))
    # Avg letters per word
    dataset = dataset.assign(letters_per_word=pd.Series(letters_per_word).values)
    # Punctuation per word
    dataset = dataset.assign(punctuation_count=pd.Series(punctuation_count).values)
    #Add bias term
    ones = np.ones((dataset.shape[0]))
    dataset= dataset.assign(bias = pd.Series(ones).values)
    #Drop text column
    dataset = dataset.drop(["text"], axis=1)
    #Move y value to the end
    dataset = dataset[["bias"] +[c for c in dataset if c not in ["popularity_score"]] + ["popularity_score"]]
    return dataset

In [8]:
train, words = preprocess(train,5)
train = train.values.astype(float)
validation = feature_extraction(validation, words).values.astype(float)
test = feature_extraction(test, words).values.astype(float)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [33]:
def MSE(X, y, w):
    return np.mean((X @ w - y)**2)
X = validation[:,0:4]
y = validation[:,-1]


[[ 1.          0.          0.         ...  0.2         1.
   0.84333697]
 [ 1.          0.          0.         ...  0.5         1.
   0.89400237]
 [ 1.          2.          0.         ...  0.1         1.
   3.42605184]
 ...
 [ 1.          0.          0.         ...  0.          1.
   0.65148906]
 [ 1.          0.          0.         ...  0.15384615  1.
   1.01984666]
 [ 1.          4.          0.         ...  0.1875      1.
  -0.74624472]]


In [19]:
from textblob import TextBlob
from nltk.corpus import stopwords
nltk.download('stopwords')
def sentiment(dataset, words):
    l = reduce(operator.concat, dataset["text"])
    most_common_words = [word for word, word_count in Counter(l).most_common(words)]
    stop = stopwords.words('english')
    for i in range(dataset.shape[0]):
        txt = dataset.iloc[i]["text"]
        for w in txt:
            if w in stop:
                txt.remove(w)
    #remove most common words
            if w in most_common_words:
                txt.remove(w)
    #Correct misspelled
           # w.correct()
    #Lemmatization
            #w.lemmatize()
    #Sentiment analysis
        polarity, subjectivity = txt.sentiment()
        
    train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )
    return dataset

[nltk_data] Downloading package stopwords to /Users/Jenny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from GD import gradient_descent
import time
def Linear_regression(X, y, method = 0, w_0 = None, alpha_0 = 1, b = 1, eps = 1e-06):

    # local variables
    p = X.shape[1]
    optim_w = np.zeros(p)

    # computes the optimal weights using the closed form solution
    if(method == 0):
        X_T = X.T
        b = X_T @ y
        A = X_T @ X
        
        optim_w = np.linalg.solve(A,b)

        return optim_w

    # computes the optimal weights using gradient descent
    else:
        if w_0 is None:
            w_0 = np.zeros(p)

        optim_w = gradient_descent(X, y, w_0, alpha_0, b, eps)

        return optim_w


In [31]:
def gradient_descent(X, y, w_0, alpha_0, b, eps):

    # local variables
    i = 0 # iterations performed
    alpha = alpha_0 # step size
    prev_w = w_0 # weight of previous iteration
    current_w = w_0 # weight of current iteration

    # precomputes terms used in the gradient descent update
    X_T = X.T
    crossprod_X = X_T @ X
    y_term = X_T @ y

    # performs gradient descent until stopping condition reached
    while True:

        # updates the step size
        alpha = alpha_0/(1 + b*i)

        # updates the weights
        prev_w = current_w
        current_w = current_w - 2 * alpha * (crossprod_X @ current_w - y_term)

        # updates the iteration number
        i = i + 1

        # checks the stopping condition
        if np.linalg.norm(current_w - prev_w) < eps:
            break

    # returns the optimal weights
    return current_w

from sympy.solvers import solve
def Linear_regression(X, y, method = 0, w_0 = None, alpha_0 = 1, b = 1, eps = 1e-06):

    # local variables
    p = X.shape[1]
    optim_w = np.zeros(p)

    # computes the optimal weights using the closed form solution
    if(method == 0):
        X_T = X.T
        b = X_T @ y
        A = X_T @ X
        
        optim_w = np.linalg.inv(A)@b 
        #optim_w = np.linalg.solve(A,b)



        return optim_w

    # computes the optimal weights using gradient descent
    else:
        if w_0 is None:
            w_0 = np.zeros(p)

        optim_w = gradient_descent(X, y, w_0, alpha_0, b, eps)

        return optim_w

In [32]:
X = train[:,:-1]
y = train[:,-1]

start_cf = time.time()
linreg = Linear_regression(X, y, method = 0)
end_cf = time.time()

time_cf = end_cf - start_cf

print(linreg, time_cf)

[ 8.45983041e-01  3.75533780e-01 -1.09554785e+00 -2.33466789e-01
 -3.34379529e-03 -4.06777600e-04 -7.57469048e-03  8.38157340e-03
  2.98382428e-02 -3.79098679e-03  6.19035359e-02 -1.00117708e-01
 -5.17723460e-03  2.13392348e-02] 0.00510096549987793


In [30]:
start_gd = time.time()
gd = Linear_regression(X, y, method = 1, alpha_0 = 1e-06, b = 0, eps = 1e-11)
end_gd = time.time()
time_gd = end_gd - start_gd
print(gd, time_gd)

[ 8.45983040e-01  3.75533780e-01 -1.09554781e+00 -2.33466789e-01
 -3.34379529e-03 -4.06777542e-04 -7.57469053e-03  8.38157348e-03
  2.98382427e-02 -3.79098682e-03  6.19035355e-02 -1.00117708e-01
 -5.17723461e-03  2.13392350e-02] 1.6073899269104004


(10000, 14)