# HW02: Tokenization

Remember that these homework assignments are graded on completion. **You can skip one section without losing credit.**

In [0]:
#Import the AG news dataset (same as hw01)
#Download them from here 
#!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv

import pandas as pd
import nltk
# The AG News train.csv has no header row: without header=None pandas would
# use the first article as column names and silently drop it from the data.
df = pd.read_csv('train.csv', header=None, names=["label", "title", "lead"])

# Numeric class ids used in the csv -> human-readable topic names.
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
    """Look up the topic name for class id `x` in `label_map` (KeyError if unknown)."""
    topic_name = label_map[x]
    return topic_name
# Replace the numeric class ids by their names.
df["label"] = df["label"].map(replace_label)

# One text field per article: title, a single space, then the lead paragraph.
title_and_space = df["title"] + " "
df["text"] = title_and_space + df["lead"]
df.head()

## Preprocess Text

In [0]:
import spacy

# Fixed seed so that "Restart & Run All" always analyses the same 50 articles.
# .copy() gives dfs its own data, so the columns added below never touch df.
dfs = df.sample(50, random_state=42).copy()
nlp = spacy.load('en_core_web_sm')

##TODO use spacy to split the documents in the sampled dataframe (dfs) in sentences and tokens

# Each cell of "tokenized" holds the parsed spaCy Doc for that article.
dfs["tokenized"] = dfs["text"].apply(nlp)

##TODO print the first sentence of the first document in your sample

first_doc = dfs.iloc[0]["tokenized"]
# .sents is consumed lazily; only the first sentence is needed here.
print (next(iter(first_doc.sents)))

In [0]:
##TODO create a new column with tokens in lowercase (x.lower()), without punctuation tokens (x.is_punct) nor stopwords (x.is_stop)

def tokenize(x):
    """Return the lowercased lemmas of the content tokens in `x`.

    `x` is either a raw string (parsed with `nlp`) or an already parsed
    document / iterable of tokens, which is used as-is so that text that was
    parsed before does not have to be parsed again.

    Stopwords, punctuation, digit-only tokens and whitespace-only tokens are
    dropped.
    """
    doc = nlp(x) if isinstance(x, str) else x
    return [
        w.lemma_.lower()
        for w in doc
        # whitespace tokens (e.g. runs of spaces) carry no lexical content
        if not (w.is_stop or w.is_punct or w.is_digit or w.is_space)
    ]
# Store the cleaned-up token list of every sampled article.
dfs["preprocessed"] = dfs["text"].apply(tokenize)

##TODO print the tokens (x.lemma_) and the tags (x.tag_ ) of the first sentence of the first document (doc.sents)
# Take only the first sentence; an empty document yields nothing to print.
first_sentence = next(iter(dfs.iloc[0]["tokenized"].sents), ())
for tok in first_sentence:
    print (tok.lemma_, tok.tag_)

### Noun Chunks

In [0]:
##TODO print the first 20 noun chunks in your sample corpus (doc.noun_chunks)
from itertools import chain, islice

# Lazily concatenate the noun chunks of all documents and stop after 20.
# (A `break` inside the nested loop would only leave the inner loop, so every
# remaining document would still be visited.)
all_chunks = chain.from_iterable(doc.noun_chunks for doc in dfs["tokenized"])
for counter, chunk in enumerate(islice(all_chunks, 20), start=1):
    print (counter, chunk)


### Named Entities

Let's compute the ratio of tokens inside named entities that start with a capital letter. For example, if "University of Chicago" is a named entity, "University" and "Chicago" are capitalized but "of" is not, so the ratio is 2/3.

In [0]:
##TODO print the ratio of tokens being part of a named entity span starting with a capital letter (doc.ents)

# doc.ents yields entity *spans*; the ratio is defined over the tokens inside
# those spans ("University of Chicago" -> 2 of 3), so iterate one level deeper.
num_capitalized, n = 0, 0
for doc in dfs["tokenized"]:
    for ent in doc.ents:
        for token in ent:
            if token.text[:1].isupper():
                num_capitalized += 1
            n += 1

# Guard against a sample in which no entity was recognised at all.
print (num_capitalized / n if n else 0.0)

In [0]:
##TODO print the ratio of capitalized tokens not being part of a named entity span
# e.g. "The dog barks" = 1/3; 3 tokens, only "The" is capitalized

# Collect every token that carries no entity label, then count the capitalized ones.
non_entity_tokens = [
    tok
    for parsed in dfs["tokenized"]
    for tok in parsed
    if not tok.ent_type_
]
n = len(non_entity_tokens)
num_capitalized = sum(1 for tok in non_entity_tokens if tok.text[0].isupper())
print (num_capitalized / n)

In [0]:
##TODO print the ratio of capitalized tokens not being a named entity and not being the first token in a sentence
# e.g. "The dog barks" = 0; 3 tokens, "The" is capitalized but the starting token of a sentence, no other tokens are capitalized.

num_capitalized, n = 0, 0
# Empty placeholder so the name always exists (and never holds a stale value
# from an earlier run) even if no sentence in the sample qualifies.
error_analysis = []

for doc in dfs["tokenized"]:
    for sent in doc.sents:
        for i, token in enumerate(sent):
            # skip sentence-initial tokens and tokens inside a named entity
            if i == 0 or token.ent_type_:
                continue
            if token.text[:1].isupper():
                num_capitalized += 1
                error_analysis = sent # keep one sentence where we have a capitalized token which is not the start of the sentence
            n += 1

# Guard against a sample without any eligible token.
print (num_capitalized / n if n else 0.0)

In [0]:
# Show the kept sentence next to the entity label of each of its tokens
# (an empty label means the token is not part of a named entity).
token_entity_pairs = [(t, t.ent_type_) for t in error_analysis]
print (error_analysis, token_entity_pairs)

Give an example of a capitalized token in the data which is neither a named entity nor at the start of a sentence. What could be the reason the token is capitalized (one sentence)?

## Term Frequencies

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=0.01, 
                        max_df=0.9,  
                        max_features=1000,
                        stop_words='english',
                        use_idf=True, # the new piece
                        ngram_range=(1,2))

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

# The vectorizer expects strings, so glue the preprocessed tokens back together.
dfs["input_TFIDF"] = dfs["preprocessed"].apply(" ".join)
X_tfidf = tfidf.fit_transform(dfs["input_TFIDF"])
print (X_tfidf.shape)

##TODO using the whole sample, produce a word cloud with bigrams for each label using tfidf frequencies
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# old versions that do not have get_feature_names_out() yet.
if hasattr(tfidf, "get_feature_names_out"):
    vocab = list(tfidf.get_feature_names_out())
else:
    vocab = tfidf.get_feature_names()
print (vocab[:10])

# With ngram_range=(1,2) the vocabulary mixes unigrams and bigrams; bigrams are
# the entries that contain a space.
is_bigram = [" " in term for term in vocab]
doc_labels = dfs["label"].to_numpy()

for label in sorted(set(doc_labels)):
    # summed tf-idf weight of every term over the documents of this label
    label_freqs = np.asarray(X_tfidf[doc_labels == label].sum(axis=0)).ravel()
    fdict = {
        term: freq
        for term, freq, keep in zip(vocab, label_freqs, is_bigram)
        if keep and freq > 0
    }
    if not fdict:
        # WordCloud cannot draw an empty frequency table
        print ("no bigrams found for label", label)
        continue
    wordcloud = WordCloud().generate_from_frequencies(fdict)
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(label)
    plt.show()


## Hash Vectorizer

In [0]:
from sklearn.feature_extraction.text import HashingVectorizer
from eli5.sklearn import InvertableHashingVectorizer

hv = HashingVectorizer(n_features=5000)

##TODO print the first 10 features produced by the hash vectorizer
X_hash = hv.fit_transform(dfs['input_TFIDF'])

# Fit eli5's wrapper around the hashing vectorizer on the same documents so
# that feature names can be listed for it.
ivec = InvertableHashingVectorizer(hv)
inverted_hv = ivec.fit(dfs['input_TFIDF'])

# Only the entries that are lists are printed; everything else is skipped.
list_features = [
    feature
    for feature in inverted_hv.get_feature_names()
    if isinstance(feature, list)
]
for feature in list_features[:10]:
    print (feature)
        

## Supervised Feature Selection

In [0]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2


##TODO compute the number of words per document (excluding stopwords)
##TODO get the most predictive features of the number of words per document using first f_class and then chi2
import numpy as np

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# old versions that do not have get_feature_names_out() yet.
if hasattr(tfidf, "get_feature_names_out"):
    vocab = list(tfidf.get_feature_names_out())
else:
    vocab = tfidf.get_feature_names()


def _top_features(scores, names, k=10):
    """Return the `k` entries of `names` with the highest `scores`, best first."""
    scores = np.asarray(scores, dtype=float)
    # argsort places NaN last, so after reversing NaN would rank first:
    # push any NaN score to the very bottom instead.
    scores = np.where(np.isnan(scores), -np.inf, scores)
    best_first = np.argsort(scores)[::-1][:k]
    return [names[i] for i in best_first]


# NOTE(review): the TODO above mentions the number of words per document, while
# this cell uses the class label as the target -- confirm which one is intended.
Y = dfs['label']
# never ask for more features than the tf-idf matrix has columns
top_k = min(10, X_tfidf.shape[1])

# f_class
select = SelectKBest(f_classif, k=top_k)
X_new = select.fit_transform(X_tfidf, Y)
# highest score = most predictive (a plain ascending argsort would list the worst)
print (_top_features(select.scores_, vocab, top_k))

# chi2
select = SelectKBest(chi2, k=top_k)
X_new = select.fit_transform(X_tfidf, Y)
print (_top_features(select.scores_, vocab, top_k))



Are the results different? What could be a reason for this? 

## Huggingface Tokenizers

In [0]:
# we use the distilbert tokenizer
from transformers import DistilBertTokenizerFast

# let's instantiate a tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

##TODO tokenize the sentences in the sampled dataframe (dfs) using the DistilBertTokenizer

dfs["huggingface_tokenizer"] = dfs["text"].apply(tokenizer.tokenize)

##TODO what is the type/token ratio from this tokenizer (number_of_unique_token_types/number_of_tokens)?

# Flatten once; both statistics below are computed over the same token stream.
all_tokens = [token for doc in dfs["huggingface_tokenizer"] for token in doc]
tokens = len(all_tokens)
types = set(all_tokens)

# Guard against an empty sample.
print (len(types) / tokens if tokens else 0.0)

##TODO what is the amount of subword tokens returned by the huggingface tokenizer? hint: each subword token starts with "##"
# Word-continuation pieces are marked with a double hash. Testing for a single
# "#" would also count literal "#" characters in the text as subwords.
num_subwords = sum(1 for token in all_tokens if token.startswith("##"))

# `tokens` counts every piece the tokenizer returned (subwords included).
print ("number of subwords:", num_subwords, "number of tokens:", tokens)