In [1]:
import json
import pandas as pd
import numpy as np
import re
from collections import Counter
from functools import reduce
import operator
from spellchecker import SpellChecker
import nltk

#Load data in json format and store in a dataframe
with open("proj1_data.json") as fp:
    data = json.load(fp)
    df = pd.DataFrame(data)

In [2]:
#Convert true-false to 1-0
df["is_root"] = df["is_root"].astype(int)
#Convert all text to lower cases
df["text"]= [x.lower() for x in df["text"]]
#Parse text where there's a space
df["text"]= [x.split() for x in df["text"]]

In [3]:
#First 10000 data points as training set
train = df.iloc[0:10000,:]
#10000 to 11000 as validation set
validation = df.iloc[10000:11000,:]
#Last 1000 as test set
test = df.iloc[11000:12000,:]
#Display the first 10 points of the training data
train.head(10)

Unnamed: 0,children,controversiality,is_root,popularity_score,text
0,0,0,0,1.254698,"[its, raining, sideways]"
1,0,0,0,0.509813,"[wheel, of, time, reader, confirmed!]"
2,0,0,1,0.370827,"[the, jungle, book, of, pussy]"
3,0,0,0,-0.272843,"[i'm, just, making, this, thread, since, there..."
4,0,0,1,0.56015,"[hi, there,, looks, like, you're, wanting, to,..."
5,0,0,1,0.696554,"[when, there, is, a, line, to, exit, a, store,..."
6,0,0,1,1.050417,"[it, is, none, of, their, business.]"
7,1,0,1,0.310543,"[infinite, ammo, and, the, absurd, number, of,..."
8,1,1,0,-1.208735,"[if, you're, trying, to, argue, for, a, claim,..."
9,1,0,0,1.1237,"[oh, god, it's, all, my, cousins, from, the, s..."


In [5]:
def preprocess(dataset,words):
    
    # Feature: Does the comment contain a question mark
    qmarks = np.zeros((dataset.shape[0]))
    # Feature: Normalized comment length (number of words)
    n_words = np.zeros((dataset.shape[0]))
    # Feature: Avg number of letters per words
    letters_per_word = np.zeros((dataset.shape[0]))
    # Feature: Number of punctuation signs per word (, . ! ? : ;)
    punctuation_count = np.zeros((dataset.shape[0]))
    punct = [',', '.', '!', '?', ':', ';']
    # Feature: Most common word count
    l = reduce(operator.concat, dataset["text"])
    most_common_words = [word for word, word_count in Counter(l).most_common(words)]
    zeros = np.zeros(shape = (dataset.shape[0], words))
    word_count_features = pd.DataFrame(zeros, columns = most_common_words)    
    # Feature: Misspelled words
    misspelled_feature = np.zeros(dataset.shape[0])
    spell = SpellChecker()
    #Feature: Swear words
    swear_words = pd.read_csv("swearWords.csv")
    s_words = np.zeros(dataset.shape[0])
    
    
    # Iterate over comments
    for i in range(dataset.shape[0]):
        txt = dataset.iloc[i]["text"]
        # Iterate over words
        for w in txt:
            # most common words
            for target in most_common_words:
                if w == target:
                    word_count_features.iloc[i][target] += 1
            
            #swear words count
            for target in swear_words:
                if w == target:
                    s_words[i]+=1
            
            # punctuation count
            for x in punct:
                punctuation_count[i] += w.count(x)
                
            # question counter
            if "?" in w:
                qmarks[i] = 1
                
            # comment length
            n_words[i] += 1
            # number of letters
            letters_per_word[i] += len(w)
            
    # misspelled count   
    for i in range(dataset.shape[0]):
        new = [re.sub(r'[^a-zA-Z]', '', x) for x in dataset.iloc[i]["text"]]
        misspelled_words = spell.unknown(new)
        misspelled_feature[i] = len(misspelled_words)
    
    # Get average number of letters per word
    for i in range(dataset.shape[0]):
        letters_per_word[i] = letters_per_word[i]/n_words[i]
        
    # Get average punctuation marks per word
    for i in range(dataset.shape[0]):
        punctuation_count[i] = punctuation_count[i]/n_words[i]
    
    # Normalize word length (divide by average)
    n_words = n_words/np.mean(n_words)
                
    # Add feature columns 
    # Most common words
    dataset = pd.concat([dataset, word_count_features], axis=1)
    # Misspelled words count
    dataset = dataset.assign(misspelled=pd.Series(misspelled_feature).values)
    # Swear words
    dataset = dataset.assign(s_words=pd.Series(s_words).values)
    # Question marks
    dataset = dataset.assign(has_question=pd.Series(qmarks).values.astype(int))
    # Number of words
    dataset = dataset.assign(n_words=pd.Series(n_words).values)
    # Avg letters per word
    dataset = dataset.assign(letters_per_word=pd.Series(letters_per_word).values)
    # Punctuation per word
    dataset = dataset.assign(punctuation_count=pd.Series(punctuation_count).values)
                
    return dataset

In [None]:
train = preprocess(train,5)
validation = preprocess(validation, 5)
test = preprocess(test,5)

In [19]:
from textblob import TextBlob
from nltk.corpus import stopwords
nltk.download('stopwords')
def sentiment(dataset, words):
    l = reduce(operator.concat, dataset["text"])
    most_common_words = [word for word, word_count in Counter(l).most_common(words)]
    stop = stopwords.words('english')
    for i in range(dataset.shape[0]):
        txt = dataset.iloc[i]["text"]
        for w in txt:
            if w in stop:
                txt.remove(w)
    #remove most common words
            if w in most_common_words:
                txt.remove(w)
    #Correct misspelled
           # w.correct()
    #Lemmatization
            #w.lemmatize()
    #Sentiment analysis
        polarity, subjectivity = txt.sentiment()
        
    train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )
    return dataset

[nltk_data] Downloading package stopwords to /Users/Jenny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
from sympy.solvers import solve
def Linear_regression(X, y, method = 0, w_0 = None, alpha_0 = 1, b = 1, eps = 1e-06):

    # local variables
    p = X.shape[1]
    optim_w = np.zeros(p)

    # computes the optimal weights using the closed form solution
    if(method == 0):
        X_T = X.T
        b = X_T @ y
        A = X_T @ X
        
        optim_w = np.linalg.inv(A).dot(b)

        return optim_w

    # computes the optimal weights using gradient descent
    else:
        if w_0 is None:
            w_0 = np.zeros(p)

        optim_w = gradient_descent(X, y, w_0, alpha_0, b, eps)

        return optim_w

In [38]:
def gradient_descent(X, y, w_0, alpha_0, b, eps):

    # local variables
    i = 0 # iterations performed
    alpha = n_0 # step size
    prev_w = w_0 # weight of previous iteration
    current_w = w_0 # weight of current iteration

    # precomputes terms used in the gradient descent update
    X_T = X.T
    crossprod_X = X_T @ X
    y_term = X_T @ y

    # performs gradient descent until stopping condition reached
    while True:

        # updates the step size
        alpha = n_0/(1 + b*i)

        # updates the weights
        prev_w = current_w
        current_w = current_w - 2 * alpha * (crossprod_X @ current_w - y_term)

        # updates the iteration number
        i = i + 1

        # checks the stopping condition
        if np.norm(current_w - prev_w) < eps:
            break

    # returns the optimal weights
    return current_w

In [43]:
X = train.drop(["popularity_score","text"], axis=1)
y = train["popularity_score"]
linreg = Linear_regression(X, y, method = 0, w_0 = None, alpha_0 = 0.1, b = 1, eps = 1e-06)
linreg

array([ 4.33768243e+01,  5.27350628e+02, -7.15620197e-01,  4.06140834e+01,
       -9.37555583e+01,  1.15952807e+01,  3.69383762e+00,  7.65041510e+01,
        7.05733811e+01,  8.40381180e+01, -1.87684890e+02,  4.47554672e+01,
        1.55978201e+02,  2.58963108e-02,  6.56293811e-02,  1.24272171e-01,
        3.18137307e-01,  5.82797811e-02,  3.90278112e-02, -5.08026246e+01,
        8.85137063e+01, -7.64224492e+00, -3.51753457e+00, -6.85903178e+01,
       -8.00458461e+01, -9.08274214e+01,  1.91486547e+02, -4.80595959e+01,
       -1.52640296e+02,  1.18021895e-02, -2.91371672e-02,  3.01110970e-02,
       -1.13103770e-02, -5.49825020e-02, -8.70566682e-02,  4.88318424e-02,
        1.83402413e-02,  1.74819006e-02,  1.04053895e-04,  1.65199119e-02,
       -3.94150722e-02, -9.51721196e-02, -6.83505481e-02,  3.68610550e-02,
        7.02168227e-02, -3.85142764e-02,  8.05563146e-03, -9.18425274e-03,
        1.32054775e-02, -2.15889063e-03, -5.22676288e-02,  7.83873578e-03,
        3.15621728e-02,  

In [None]:
preprocess(train,10)