In [1]:
import os
# The relative paths and the `src` / `resources` imports used further down are
# resolved from the project root, which is presumably one level above the
# directory this notebook starts in.
# Only step up when `src` is not already visible from the current directory,
# so that re-running this cell does not keep climbing the directory tree.
if not os.path.isdir("src"):
    os.chdir("..")
os.getcwd()

'/Users/Louis/ml_projects/ToxPost'

In [2]:
import random

In [3]:
from src.data.make_dataset import load_data
from src.features.clean_corpus import clean_corpus
from src.features.shrink_corpus import shrink_corpus
from src.features.embed_corpus import embed_corpus
from resources.glove.load_embedding import load_embedding

In [4]:
# Step 1: read the raw dataset from disk with the project's loader.
raw_data_path = "./data/raw/data.csv"
raw_data = load_data(raw_data_path, header=True, id=True)

# Split every datapoint into its first element (kept as the raw feature)
# and its second element (kept as the label).
raw_features = [row[0] for row in raw_data]
labels = [row[1] for row in raw_data]

In [22]:
# Let's look at a few typical datapoints.
# A dedicated, seeded generator keeps the sample identical across
# "Restart & Run All" without touching the global `random` state.
example_rng = random.Random(42)

# Random.sample already returns a list of distinct positions, so no extra
# comprehension is needed. It raises ValueError if raw_data has fewer than
# 10 entries.
indices = example_rng.sample(range(len(raw_data)), 10)
example_corpus = [raw_features[index] for index in indices]

In [23]:
# Show every sampled comment as one space-joined line. The two extra newline
# characters appended here, plus the one print adds itself, give the same
# spacing as printing the comment and then printing "\n" separately.
for comment in example_corpus:
    print(" ".join(comment) + "\n\n")

" I added the ""highly"" reference to help the reader understand what that kind of insurance is best suited for. Not all level of HO insurance are applicable to all types of domicile. An adjective does not necessarily mean that an sentence is no longer neutral. "


" Hi, Calpez. I really need you to back up these assertions, because as far as I can tell, none of these teams exist at all. The main problem is that none of these places are ""nations"" of any kind, so they can't possibly have ""national"" teams. Just a few examples: Australian Indigenous national football team: 2 hits at Google (both Wikipedia) Christmas Island national football team: 15 hits, but only from Wikipedia and Wikipedia mirrors Cocos (Keeling) Islands national football team: 5 hits. All Wikipedia and Wikipedia mirrors I could go on, but I'd be wasting my time. If you want to save this section, please start looking for reliable sources to back up this information. And please bear in mind that Wikipedia has a poli

```
Next, we'll clean up the data:

1. tokenize  
2. remove numbers, links, punctuation and articles
3. remove stopwords as defined in nltk.corpus.stopwords
4. replace words using a custom list  

The cleaned-up data can be found in the data/cleaned folder.
```


In [24]:
# Run the project's cleaning step (imported from src.features.clean_corpus)
# on the sampled comments; the result is indexed per comment further down.
cleaned_corpus = clean_corpus(example_corpus)

100%|██████████| 10/10 [00:00<00:00, 650.31it/s]


In [25]:
# Walk the raw and cleaned corpora in lockstep instead of indexing with a
# hard-coded range(0, 10), so this cell keeps working when the number of
# sampled comments changes. zip stops at the shorter of the two sequences.
for raw_comment, cleaned_comment in zip(example_corpus, cleaned_corpus):
    print("the comment \n\n{}\n\nwas cleaned to \n\n{}\n\n\n".format(
        " ".join(raw_comment), " ".join(cleaned_comment)))

the comment 

" I added the ""highly"" reference to help the reader understand what that kind of insurance is best suited for. Not all level of HO insurance are applicable to all types of domicile. An adjective does not necessarily mean that an sentence is no longer neutral. "

was cleaned to 

added highly reference help reader understand kind insurance best suited level insurance applicable types domicile adjective necessarily mean sentence longer neutral



the comment 

" Hi, Calpez. I really need you to back up these assertions, because as far as I can tell, none of these teams exist at all. The main problem is that none of these places are ""nations"" of any kind, so they can't possibly have ""national"" teams. Just a few examples: Australian Indigenous national football team: 2 hits at Google (both Wikipedia) Christmas Island national football team: 15 hits, but only from Wikipedia and Wikipedia mirrors Cocos (Keeling) Islands national football team: 5 hits. All Wikipedia and Wi

Next, we'll shrink each comment to a target size of 20 (the value passed to shrink_corpus below), keeping only the words with the top TF-IDF scores:

In [26]:
# Shrink each cleaned comment with the project's shrink_corpus helper.
# NOTE(review): 20 is presumably the target size per comment - confirm its
# exact meaning against src.features.shrink_corpus.
shrunken_corpus = shrink_corpus(cleaned_corpus, 20)

100%|██████████| 10/10 [00:00<00:00, 404.64it/s]


In [27]:
# Pair every cleaned comment with its shrunken counterpart directly rather
# than indexing with a hard-coded range(0, 10), so the cell does not depend
# on the sample size. zip stops at the shorter of the two sequences.
for cleaned_comment, shrunken_comment in zip(cleaned_corpus, shrunken_corpus):
    print("the comment \n\n{}\n\nwas shrunken to \n\n{}\n\n\n".format(
        " ".join(cleaned_comment), " ".join(shrunken_comment)))

the comment 

added highly reference help reader understand kind insurance best suited level insurance applicable types domicile adjective necessarily mean sentence longer neutral

was shrunken to 

added highly reference help reader understand kind insurance best suited level insurance applicable types domicile adjective necessarily mean sentence longer neutral



the comment 

calpez really need back assertions far tell none teams exist main problem none places nations kind cant possibly national teams examples australian indigenous national football team hits google wikipedia christmas island national football team hits wikipedia wikipedia mirrors cocos keeling islands national football team hits wikipedia wikipedia mirrors could wasting time want save section please start looking reliable sources back information please bear mind wikipedia policy original research cant claim teams exist wikipedia pages thats things work 桜ん坊

was shrunken to 

back none teams exist none places cant 

In [28]:
# Next, we'll embed the comments using the GloVe embedding:

In [29]:
# Location of the pre-trained GloVe vectors; the file name indicates the
# 25-dimensional Twitter set.
embedding_path = "./resources/glove/glove.twitter.27B.25d.txt"
# Load the vectors with the project's loader (resources.glove.load_embedding).
embedding = load_embedding(embedding_path)
# NOTE(review): dim is handed to embed_corpus in the next cell. It is 10 while
# the file name says 25d - confirm how embed_corpus uses this value.
dim = 10

In [30]:
# Embed the shrunken comments using the loaded GloVe data and the dim value
# set above; the result is indexed per comment when it is printed below.
embedded_corpus = embed_corpus(shrunken_corpus, embedding, dim)

100%|██████████| 10/10 [00:00<00:00, 14084.30it/s]


In [31]:
# Pair every shrunken comment with its embedding directly rather than
# indexing with a hard-coded range(0, 10), so the cell does not depend on the
# sample size. zip stops at the shorter of the two sequences.
for shrunken_comment, embedded_comment in zip(shrunken_corpus, embedded_corpus):
    print("the comment \n\n{}\n\nwas embedded to \n\n{}\n\n\n".format(
        " ".join(shrunken_comment), embedded_comment))

the comment 

added highly reference help reader understand kind insurance best suited level insurance applicable types domicile adjective necessarily mean sentence longer neutral

was embedded to 

[array([ 0.33589556, -0.6934941 , -0.55123194,  0.5656964 , -0.08577663,
        1.03598595,  0.91414217, -0.24772415, -0.19354427, -0.46324612]), array([ 0.90326558,  1.16440205,  0.30820981, -0.86696218, -1.30613961,
       -0.04159566,  1.92028224, -0.25762592, -0.50523927, -0.03105853]), array([ 1.2692879 , -0.39140384, -0.84175361,  0.070821  ,  0.37517738,
        0.12873042,  1.02138875, -0.26801452,  0.63498432,  0.10781987]), array([-1.2214756 , -0.55900778, -0.88013851, -0.90231593, -1.28526901,
       -0.22621875, -0.91164833,  0.37540989, -0.02753832, -0.46636515]), array([ 1.99012222, -0.68346508, -1.69175078,  0.02783333,  0.16369376,
        0.22956155,  0.47568318, -0.06967941, -0.02918084,  0.04606007]), array([-1.66334158,  0.96554522, -0.63985326, -1.46771457, -0.20116229