In [None]:
# Base
import numpy as np
import pandas as pd
import json
import re
import string
from os import listdir
import math
import timeit
import datetime

# Natural Language Processing
import nltk
from nltk.stem.snowball import EnglishStemmer # load the stemmer module from NLTK
stemmer = EnglishStemmer() # Get an instance of SnowballStemmer for English

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

### Text Preprocessing

In [None]:
def load_json(fileList,filePath):
    """Load the ``Reviews`` entry of several JSON files into one DataFrame.

    Parameters
    ----------
    fileList : iterable of str
        File names to read; each is appended to ``filePath`` by plain string
        concatenation, so ``filePath`` must already end with a separator.
    filePath : str
        Directory prefix for every entry of ``fileList``.

    Returns
    -------
    pandas.DataFrame
        The per-file frames stacked in ``fileList`` order. The index is not
        reset, so index labels coming from different files may repeat.
        An empty DataFrame is returned when ``fileList`` is empty.

    Raises
    ------
    KeyError
        If a file's top-level JSON object has no ``'Reviews'`` key.
    """
    frames = []
    for file in fileList:
        # errors="ignore" silently drops bytes that cannot be decoded, so a
        # few badly encoded characters do not abort the whole load.
        with open(filePath+file, errors = "ignore") as handle:
            data = json.load(handle)

        # Keep only the reviews; the rest of the JSON document is discarded.
        frames.append(pd.DataFrame.from_dict(data['Reviews']))

    # pd.concat refuses an empty list, so mirror the old "nothing loaded" result.
    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of growing a DataFrame inside the loop.
    return pd.concat(frames)

In [None]:
def clean_words(text):
    """Tokenize ``text`` and normalize every token.

    Processing order:
      1. split with NLTK's Treebank word tokenizer,
      2. lowercase,
      3. delete every character that is not ``0-9`` or ``a-z``,
      4. drop tokens that became empty,
      5. replace each run of digits with the placeholder ``NUM``,
      6. pass each token through the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()

    # Convert to lowercase
    tokens = [tok.lower() for tok in tokenizer.tokenize(text)]

    # Normalize: strip punctuation and any other non-alphanumeric character
    tokens = [re.sub(r'[^0-9a-z]', "", tok) for tok in tokens]

    # Tokens that were pure punctuation are now empty strings; drop them
    tokens = [tok for tok in tokens if tok != '']

    # Collapse digit runs into a single placeholder (raw string: "\d" is an
    # invalid escape sequence in a normal string literal)
    tokens = [re.sub(r"\d+", "NUM", tok) for tok in tokens]

    # Stem words
    return [stemmer.stem(tok) for tok in tokens]

In [None]:
# Remove Stop Words (From 1-grams)
def removeStopwords(text, stopwordList):
    """Return the tokens of ``text`` that are not contained in ``stopwordList``.

    Order and duplicates of the surviving tokens are kept; membership is
    decided with the ``in`` operator on ``stopwordList``.
    """
    kept = []
    for token in text:
        if token in stopwordList:
            continue
        kept.append(token)
    return kept

### NLP

In [None]:
def getTermFreq(textList):
    """Count total occurrences of every word across all rows of ``textList``.

    ``textList`` is an iterable of token sequences; the result maps each word
    to the number of times it appears in the whole collection.
    """
    counts = {}
    for word in (w for tokens in textList for w in tokens):
        counts[word] = counts.get(word, 0) + 1
    return counts

In [None]:
def getDocFreq(textlist):
    """Count, for every word, the number of rows of ``textlist`` containing it.

    Each row is reduced to its set of distinct words first, so a word that is
    repeated inside one row still contributes only 1 for that row.
    """
    doc_counts = {}
    for tokens in textlist:
        for word in set(tokens):
            doc_counts[word] = doc_counts.setdefault(word, 0) + 1
    return doc_counts

In [None]:
# Unigram Language Model
def genUniLM(TF):
    """Build a unigram language model table from term frequencies.

    Parameters
    ----------
    TF : dict
        Maps each word to its total count in the training corpus.

    Returns
    -------
    pandas.DataFrame
        Indexed by word and sorted by descending count, with columns
        ``TF`` (the count) and ``p`` (the count divided by the sum of all
        counts, i.e. the maximum-likelihood unigram probability).

    Side effects
    ------------
    Prints the sum of the ``p`` column as a sanity check (expected to be 1).
    """
    u_theta = pd.DataFrame.from_dict(TF, orient = "index", columns = ['TF'])

    # Total number of words in the training corpus
    nWords = u_theta['TF'].sum()

    # Probability of each word = term frequency / total words. Divide the
    # 'TF' column explicitly rather than the whole frame.
    u_theta['p'] = u_theta['TF'] / nWords

    # DataFrame.sort no longer exists in pandas; sort_values is the replacement.
    u_theta = u_theta.sort_values('TF', ascending = False)

    # Check that probability sums to 1
    print("Total Probability: ",u_theta['p'].sum())
    return u_theta

In [None]:
def calc_pSmoothAdditive(tokenList, u_theta, d):
    """Additive (add-``d``) smoothed unigram probabilities.

    The vocabulary is the training vocabulary (the index of ``u_theta``)
    extended with every token of ``tokenList`` that is not in it; those
    unseen tokens get a count of 0. Each word then receives

        p(w) = (count(w) + d) / (N + d * V)

    where ``N`` is the sum of the training counts and ``V`` the size of the
    extended vocabulary, so the returned probabilities sum to 1.

    Parameters
    ----------
    tokenList : iterable
        Tokens of the document being scored.
    u_theta : pandas.DataFrame
        Language model table indexed by word with a ``TF`` count column.
    d : number
        Smoothing constant added to every count.

    Returns
    -------
    pandas.Series
        Named ``p``, indexed by the training words followed by the unseen
        words in order of first appearance in ``tokenList``.
    """
    counts = u_theta['TF']

    # Unseen words in first-seen order, so the result is reproducible
    # (iterating a set of strings varies with hash randomisation).
    known = set(counts.index)
    unseenWords = list(dict.fromkeys(t for t in tokenList if t not in known))

    if unseenWords:
        # Series.append was removed from pandas; concat is the replacement.
        counts = pd.concat([counts, pd.Series(0, index = unseenWords)])

    # Unseen words add to the vocabulary size only. They have zero
    # occurrences, so they must not be added to the token total N.
    nWords_train = counts.sum()
    vSize_train = len(counts)

    # Smoothing is applied even when nothing is unseen, so every document is
    # scored with the same formula.
    pSmooth = (counts + d) / (nWords_train + d * vSize_train)
    return pSmooth.rename('p')

In [None]:
def create_countVectors(tokens):
    """Return a dict mapping each distinct token to its number of occurrences."""
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts

In [1]:
def createScaledTFIDFvectors(textlist, dataDF):
    """Build one sub-linearly scaled TF-IDF vector per review.

    For a term with in-document count ``c`` and document frequency ``df``
    the weight is

        (1 + ln(c)) * (1 + ln(n_doc / df))

    Parameters
    ----------
    textlist : sequence of token sequences
        ``textlist[i]`` holds the tokens of the i-th row of ``dataDF``.
    dataDF : pandas.DataFrame
        Must have a ``ReviewID`` column; rows are matched to ``textlist``
        by position.

    Returns
    -------
    dict
        ``{ReviewID: {term: weight}}``. If a ``ReviewID`` occurs more than
        once, the later row overwrites the earlier one.
    """
    # Positional access: the removed `.ix` indexer was label-based on integer
    # indexes, which is ambiguous when index labels repeat.
    review_ids = dataDF['ReviewID']

    # Raw term counts per review
    doc_TF = {}
    for i in range(len(dataDF)):
        doc_TF[review_ids.iloc[i]] = create_countVectors(textlist[i])

    # Document frequencies over the whole corpus and number of documents
    DF = getDocFreq(textlist)
    n_doc = len(doc_TF)

    # Every term counted above comes from a row of `textlist`, so it always
    # has a document frequency and no fallback branch is needed.
    scaled_TFIDF = {}
    for review, counts in doc_TF.items():
        scaled_TFIDF[review] = {
            term: (1 + math.log(count)) * (1 + math.log((n_doc/DF[term])))
            for term, count in counts.items()
        }

    return scaled_TFIDF

### Cosine Similarity

In [None]:
def calc_norm(vecDict):
    """Euclidean (L2) length of a sparse vector stored as ``{key: weight}``.

    Only the dict's values are used; an empty dict yields ``0.0``.
    """
    sum_of_squares = 0
    for weight in vecDict.values():
        sum_of_squares += weight**2
    return math.sqrt(sum_of_squares)

In [None]:
def calc_dotProd(vecDict1,vecDict2):
    """Dot product of two sparse vectors stored as ``{key: weight}`` dicts.

    Only keys present in both dicts contribute; with no shared key the
    result is ``0``.
    """
    shared = set(vecDict1.keys() & vecDict2.keys())
    acc = 0
    for key in shared:
        acc += vecDict1[key] * vecDict2[key]
    return acc

In [None]:
def calc_cosineDist(vecDict1,vecDict2):
    """Cosine of the angle between two sparse ``{key: weight}`` vectors.

    Note that, despite the name, the value computed is the cosine
    *similarity* (dot product divided by the product of the norms), not a
    distance.

    Returns ``0.0`` when either vector has zero length (for example an empty
    dict), instead of raising ``ZeroDivisionError``.
    """
    denom = calc_norm(vecDict1) * calc_norm(vecDict2)
    # A zero-length vector has no direction; treat it as sharing nothing.
    if denom == 0:
        return 0.0
    return calc_dotProd(vecDict1,vecDict2) / denom

### Other

In [None]:
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Elements are counted in a single pass. When several elements share the
    highest count, the one that appears first in ``lst`` is returned.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # dicts keep insertion order and max() returns the first maximal key, so
    # ties resolve to the earliest element; max() on an empty dict raises
    # ValueError.
    return max(counts, key=counts.get)