In [1]:
#!python3 -m spacy download en_core_web_lg
# warning: takes 80 years on public wifi
# !pip install spacy

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from nltk.stem.wordnet import WordNetLemmatizer

from typing import List, Set

import spacy

from nltk import FreqDist

from math import log

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
def remove_tags(text: str) -> str:    
    return re.sub('&lt;/?.*?&gt;', '', text)

def remove_special_chars_and_digits(text: str) -> str:
    return re.sub('(\\d|\\W)+', '', text)

def remove_punctuation(text: str) -> str:
    return re.sub('[^a-zA-Z]', '', str(text))

In [31]:
def apply_lemmatization(in_text):
    # Lemmatization
    lem = WordNetLemmatizer()
    word_list = nltk.word_tokenize(in_text)
    print(word_list)
    output = ' '.join([lem.lemmatize(w, "j") for w in word_list])
    return output

In [32]:
def get_lemmas(text: str, stopwords: Set[str]) -> List[str]:
    initial = [remove_tags(remove_special_chars_and_digits(remove_punctuation(x.lemma_.lower()))) for x in nlp(text)]
    return [x for x in initial if x not in stopwords]

In [33]:
stop_words = stopwords.words('english')

def clean_please(some_data):
    """
    """
    re_punk = re.compile('[%s]' % re.escape(string.punctuation))
    lemmas = apply_lemmatization(some_data)
    tokens = word_tokenize(lemmas)
    stripped_tkns = [re_punk.sub('', wxy) for wxy in tokens]
    lower = [word.lower() for word in stripped_tkns]
    st0p = [word for word in lower if not word in stop_words]

    while "" in st0p:
        st0p.remove("")
    return st0p

In [34]:
TDM = pd.read_csv("../week2/document_matrix.csv") #term_document_matrix

In [35]:
reviews = pd.read_csv("/Users/kerry/Projects/msds453/random_nlp/assign0/toxicAvenger/reviews.csv", sep="|",
                     lineterminator=">", engine="c")

In [36]:
stg = ""
for rev in reviews.review:
    stg += rev
    
stg = stg.translate(str.maketrans('', '', string.punctuation))
stg = stg.replace("\n", " ").replace("  ", " ")

In [37]:
stg = " ".join([word.lower() for word in stg.split(" ")])

In [38]:
words

['film',
 'believe',
 'one',
 'favorate',
 'film',
 'time',
 'also',
 'kinda',
 'another',
 'one',
 'childhood',
 'gem',
 'well',
 'like',
 'teen',
 'gem',
 'ill',
 'explain',
 'kinda',
 'gradual',
 'build',
 'first',
 'discover',
 'franshise',
 'cartoon',
 'show',
 'toxic',
 'crusader',
 'think',
 'cool',
 'even',
 'know',
 'time',
 'loosely',
 'inspiredbase',
 'movie',
 'toxic',
 'avenger',
 'see',
 'film',
 'lot',
 'latter',
 'teenager',
 '',
 'ill',
 'admit',
 'blow',
 'away',
 'entirely',
 'expect',
 'make',
 'well',
 'outrageously',
 'fun',
 'funny',
 'watch',
 'adult',
 'even',
 'fun',
 'funn',
 'also',
 'one',
 'film',
 'actually',
 'inspire',
 'yes',
 'kid',
 'write',
 'get',
 'entertainment',
 'business',
 'film',
 'really',
 'turn',
 'thing',
 'head',
 'let',
 'alone',
 'turn',
 'notch',
 'violence',
 'effect',
 'comidy',
 'superhero',
 'genere',
 '',
 'content',
 'wild',
 'expressionism',
 'film',
 'common',
 'place',
 'time',
 'defenantly',
 'tv',
 'show',
 'south',
 'park

In [39]:
words = get_lemmas(stg, stop_words)

In [40]:
"violence" in words

True

In [41]:
"violen" in words

False

In [42]:
fdist1 = FreqDist(words)

In [43]:
TDM.rename(columns={"Unnamed: 0":"terms"}, inplace=True)

In [44]:
#df['sum']=df.sum(axis=1) # jk
TDM['n_doc_freq_of_term'] = TDM.astype(bool).sum(axis=1)

In [45]:
TDM.set_index("terms")

Unnamed: 0_level_0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,Document 10,n_doc_freq_of_term
terms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cult,0,0,2,0,4,0,0,0,1,0,4
toxic,3,9,12,5,23,4,11,6,2,3,11
gor,0,0,0,0,0,0,0,0,0,0,1
violen,0,0,0,0,0,0,0,0,0,0,1
humilat,0,0,0,0,0,0,0,0,0,0,1
horror,0,0,0,1,7,1,0,5,2,1,7
prank,0,0,1,0,1,0,0,0,0,0,3
blind,0,2,1,4,2,0,2,0,1,1,8
comidy,3,0,0,0,0,0,0,0,0,0,2
girlfriend,0,0,0,1,2,0,3,0,0,0,4


In [17]:
def TF_IDF(value, idx):
    """
    value is a series object passed with axis=1
    """
    idf = log(10/value.n_doc_freq_of_term)+1
    tf = value[f'Document {idx}']/len(fdist1)
    return idf*tf

In [18]:
abc = {'terms':TDM.terms,
       'Document 1':TDM.apply(lambda x: TF_IDF(x, '1'), axis=1),
       'Document 2':TDM.apply(lambda x: TF_IDF(x, '2'), axis=1),
       'Document 3':TDM.apply(lambda x: TF_IDF(x, '3'), axis=1),
       'Document 4':TDM.apply(lambda x: TF_IDF(x, '4'), axis=1),
       'Document 5':TDM.apply(lambda x: TF_IDF(x, '5'), axis=1),
       'Document 6':TDM.apply(lambda x: TF_IDF(x, '6'), axis=1),
       'Document 7':TDM.apply(lambda x: TF_IDF(x, '7'), axis=1),
       'Document 8':TDM.apply(lambda x: TF_IDF(x, '8'), axis=1),
       'Document 9':TDM.apply(lambda x: TF_IDF(x, '9'), axis=1)}

In [19]:
df = pd.DataFrame(abc)

In [20]:
df.set_index('terms', inplace=True)

In [21]:
df

Unnamed: 0_level_0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9
terms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cult,0.0,0.0,0.002465,0.0,0.004929,0.0,0.0,0.0,0.001232
toxic,0.001745,0.005236,0.006982,0.002909,0.013381,0.002327,0.0064,0.003491,0.001164
gor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
violen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
humilat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
horror,0.0,0.0,0.0,0.000872,0.006107,0.000872,0.0,0.004362,0.001745
prank,0.0,0.0,0.001417,0.0,0.001417,0.0,0.0,0.0,0.0
blind,0.0,0.001573,0.000787,0.003146,0.001573,0.0,0.001573,0.0,0.000787
comidy,0.005034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
girlfriend,0.0,0.0,0.0,0.001232,0.002465,0.0,0.003697,0.0,0.0


In [22]:
df.idxmax()

Document 1      film
Document 2     toxic
Document 3     toxic
Document 4     blind
Document 5     toxic
Document 6     toxic
Document 7     toxic
Document 8    horror
Document 9      film
dtype: object

In [23]:
terms = list(TDM['terms'])
terms.remove("film")
terms.remove("toxic")

In [24]:
TDM.set_index("terms", inplace=True)

In [25]:
TDM2 = TDM.drop(index=["film", "toxic"], axis=0)

In [26]:
abc2 = {'terms':terms,
       'Document 1':TDM2.apply(lambda x: TF_IDF(x, '1'), axis=1),
       'Document 2':TDM2.apply(lambda x: TF_IDF(x, '2'), axis=1),
       'Document 3':TDM2.apply(lambda x: TF_IDF(x, '3'), axis=1),
       'Document 4':TDM2.apply(lambda x: TF_IDF(x, '4'), axis=1),
       'Document 5':TDM2.apply(lambda x: TF_IDF(x, '5'), axis=1),
       'Document 6':TDM2.apply(lambda x: TF_IDF(x, '6'), axis=1),
       'Document 7':TDM2.apply(lambda x: TF_IDF(x, '7'), axis=1),
       'Document 8':TDM2.apply(lambda x: TF_IDF(x, '8'), axis=1),
       'Document 9':TDM2.apply(lambda x: TF_IDF(x, '9'), axis=1)}

In [27]:
df2 = pd.DataFrame(abc2)

In [28]:
df2.set_index('terms', inplace=True)

In [29]:
df2.idxmax()

Document 1        comidy
Document 2         mayor
Document 3          cult
Document 4         blind
Document 5        horror
Document 6        horror
Document 7    girlfriend
Document 8        horror
Document 9        horror
dtype: object