# Imports

In [34]:
import re
import pickle
import gensim
import numpy as np

from pprint import pprint
from gensim.models import KeyedVectors
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.summarization.textcleaner import *
from gensim.parsing.preprocessing import remove_stopwords, strip_non_alphanum, strip_multiple_whitespaces, strip_tags, split_alphanum

# Utils

In [12]:
# Load word2vec-format vectors from a gzipped binary file into the global
# `model`; run_pos, syn_vecs.create_synonyms and the "Main" cells read it.
model = KeyedVectors.load_word2vec_format('./models/model.bin.gz', binary = True)

# Functions

In [54]:
def run_pos(data):
    """
        Turn raw text into per-sentence lists of in-vocabulary tokens

        Stop words are removed from the whole text first, the result is
        split into sentences, each sentence is tokenized with
        simple_preprocess, and tokens missing from the global `model`
        vocabulary are dropped.

        Paras:
            data: text to tokenize
        Returns:
            list with one token list per sentence (a token list may be empty)
    """
    without_stopwords = remove_stopwords(data)

    tokenized = []
    for sentence in sent_tokenize(without_stopwords):
        words = simple_preprocess(sentence)
        tokenized.append([word for word in words if word in model.vocab])
    return tokenized

In [9]:
# Lower-case contraction / ordinal -> expansion table used by remove_contractions.
_CONTRACTIONS = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why has",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "y'all": "you all",
    # Was "you had / you would", which injected a literal " / " into the text;
    # aligned with the other "'d" entries ("he'd", "they'd", ...).
    "you'd": "you would",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
    "1st": "first",
    "1 st": "first",
    "2nd": "second",
    "2 nd": "second",
    "3rd": "third",
    "3 rd": "third",
}


def _build_contraction_regex(table):
    """Compile one case-insensitive pattern matching any key of `table` as a whole token."""
    alternatives = []
    # Longest keys first so "shouldn't've" wins over its prefix "shouldn't".
    for key in sorted(table, key=len, reverse=True):
        # Accept the typographic apostrophe (U+2019) wherever the key has "'".
        pieces = [re.escape(piece) for piece in key.split("'")]
        alternatives.append("['\u2019]".join(pieces))
    # The look-arounds stop matches inside longer tokens ("21st", "hi'd").
    return re.compile(
        r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)",
        re.IGNORECASE,
    )


_CONTRACTION_RE = _build_contraction_regex(_CONTRACTIONS)


def _expand_contraction(match):
    """Return the expansion for one regex match, keeping a leading capital."""
    found = match.group(0)
    expansion = _CONTRACTIONS.get(found.lower().replace("\u2019", "'"))
    if expansion is None:
        # Exotic case-folding match with no table entry: leave the text alone.
        return found
    if found[0].isupper():
        expansion = expansion[0].upper() + expansion[1:]
    return expansion


def remove_contractions(text):
    """
        Expands English contractions and a few ordinals ("1st", "2 nd", ...)

        Matching is case-insensitive, only fires on whole tokens (so "21st"
        is left untouched), prefers the longest contraction ("shouldn't've"
        over "shouldn't"), and accepts both ' and the typographic
        apostrophe. A capitalised contraction yields a capitalised
        expansion ("Don't" -> "Do not").

        Paras:
            text: raw text data
        Returns:
            text with every known contraction replaced by its expansion
    """
    return _CONTRACTION_RE.sub(_expand_contraction, text)


# Classes

In [57]:
class syn_vecs():
    """
        Builds a synonym list for each category word from a word-vector model

        Paras:
            model: object exposing most_similar(positive=[...]) and a
                   `vocab` container (e.g. gensim 3.x KeyedVectors)
    """

    def __init__(self, model):
        self.model = model

    def create_synonyms(self, categories):
        """
            Expands every category word into [word, neighbour, ...]

            Neighbours are the first items of the pairs returned by
            self.model.most_similar(positive=[word]), kept only when they
            are in self.model.vocab. The category word itself is always
            placed first.

            Paras:
                categories: iterable of category words
            Returns:
                list with one synonym list per category, in input order
        """
        synonyms = []
        for word in categories:
            # Use the model handed to the constructor rather than a global
            # that happens to be named `model`.
            neighbours = self.model.most_similar(positive = [word])
            vecs = [pair[0] for pair in neighbours if pair[0] in self.model.vocab]
            vecs.insert(0, word)
            synonyms.append(vecs)
        return synonyms

In [84]:
class classify_sentences():
    """
        Assigns each sentence to the category whose synonym list scores the
        highest model.n_similarity against the sentence's tokens

        Paras:
            tokenized_sents: sentence strings (accepted for compatibility;
                             not used by the constructor)
            tokens: one token list per sentence
            vecs: one synonym list per category
            model: object exposing n_similarity(words_a, words_b)
    """

    def __init__(self, tokenized_sents, tokens, vecs, model):
        self.model = model
        self.similarity_mat = np.zeros((len(tokens), len(vecs)))

    def similarity_matrix(self, tokenized_sents, synonym_vecs):
        """
            Fills self.similarity_mat with sentence-vs-category similarities

            Paras:
                tokenized_sents: one token list per sentence (despite the
                                 name, classify_sents passes `tokens` here)
                synonym_vecs: one synonym list per category
            Returns:
                None; the result is stored in self.similarity_mat with
                shape (number of sentences, number of categories)
        """
        # Re-allocate on every call so the matrix always matches the inputs,
        # even when they differ in size from those given to __init__.
        self.similarity_mat = np.zeros((len(tokenized_sents), len(synonym_vecs)))
        for i, token in enumerate(tokenized_sents):
            if len(token) == 0:
                # A sentence without tokens cannot be scored; leave the row
                # at 0 instead of passing an empty list to n_similarity.
                continue
            for j, vector in enumerate(synonym_vecs):
                self.similarity_mat[i, j] = self.model.n_similarity(vector, token)

    def classify_sents(self, tokenized_sents, tokens, synonym_vecs, categories=None):
        """
            Groups sentences by their best-matching category

            Paras:
                tokenized_sents: sentence strings, aligned with `tokens`
                tokens: one token list per sentence
                synonym_vecs: one synonym list per category
                categories: category names aligned with `synonym_vecs`;
                            defaults to the first word of each synonym list
            Returns:
                dict mapping every category name to its list of sentences;
                sentences whose token list is empty are left out
            Raises:
                ValueError: tokenized_sents and tokens differ in length
        """
        if categories is None:
            # Avoid depending on a global `categories`; the first entry of
            # each synonym list is used as that list's category name.
            categories = [vec[0] for vec in synonym_vecs]
        if len(tokenized_sents) != len(tokens):
            raise ValueError(
                "tokenized_sents and tokens must have the same length "
                "(%d != %d)" % (len(tokenized_sents), len(tokens))
            )

        self.similarity_matrix(tokens, synonym_vecs)
        if self.similarity_mat.size == 0:
            # No sentences or no categories: argmax over an empty axis would raise.
            return {category: [] for category in categories}

        y_pred = np.argmax(self.similarity_mat, axis = 1)

        classifications = {}
        for i, category in enumerate(categories):
            classifications[category] = [
                sent
                for sent, sent_tokens, label in zip(tokenized_sents, tokens, y_pred)
                if label == i and len(sent_tokens) > 0
            ]

        return classifications

# Main

In [143]:
# Candidate category words; only those present in the model vocabulary are kept.
categories = [
    label
    for label in ("flavor", "scent", "mental_illness", "romance", "Len_kutel")
    if label in model.vocab
]

In [2]:
# Sample input: five short sentences; the first one is repeated at the end.
data = "I love eating spicy hand pulled noodles. I also like to buy perfumes. I suffer from clinical depression. But, I really love my wife. I love eating spicy hand pulled noodles."

In [145]:
# Normalise the sample text. NOTE: clean_text is defined in a later cell
# (under "Try stuff"), so that cell has to be executed before this one.
clean_data = clean_text(data)

In [146]:
# One list of in-vocabulary tokens per sentence of the cleaned text.
tokens = run_pos(clean_data)

In [147]:
# Sentence strings of the cleaned text (not tokens, despite the name).
# NOTE(review): assumed to line up one-to-one with `tokens`, which run_pos
# derives from a stop-word-stripped copy of the same text -- confirm.
tokenized_sents = sent_tokenize(clean_data)

In [133]:
# Synonym builder wrapping the loaded word-vector model.
vector = syn_vecs(model)

In [134]:
# One list per category: the category word followed by its nearest in-vocabulary neighbours.
synonyms = vector.create_synonyms(categories)

In [135]:
# The constructor sizes the similarity matrix as len(tokens) x len(synonyms).
classifier = classify_sentences(tokenized_sents, tokens, synonyms, model)

In [136]:
# Returns a dict mapping each category to its sentences; the notebook displays it as the cell output.
classifier.classify_sents(tokenized_sents, tokens, synonyms)

{'flavor': ['I love eating spicy hand pulled noodles.',
  'I love eating spicy hand pulled noodles.'],
 'scent': ['I also like to buy perfumes.'],
 'mental_illness': ['I suffer from clinical depression.'],
 'romance': ['But,  i really love my wife.']}

**Try stuff**

In [103]:
# Typographic punctuation mapped to ASCII so it survives the non-ASCII strip
# in clean_text and apostrophes are still there when contractions are expanded.
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
    "\u2026": "...",
})


def clean_text(data):
    """
        Normalises raw review text into a single cleaned string

        Steps, in order: strip markup tags and repeated whitespace, map
        typographic quotes/ellipsis to ASCII, delete remaining characters
        above U+009F, expand contractions, separate digit/letter runs,
        turn dot runs ("..", "...") and ".," into ", ", tidy punctuation
        next to parentheses and before full stops, split into sentences,
        drop sentences of two words or fewer, upper-case the first letter
        of each kept sentence, and put exactly one space after each comma.

        Paras:
            data: raw text
        Returns:
            cleaned text with the kept sentences joined by single spaces
    """
    data = strip_tags(data)
    data = strip_multiple_whitespaces(data)
    # Must happen before the strip below, otherwise "wasn’t" collapses to
    # "wasnt" and is never expanded.
    data = data.translate(_ASCII_PUNCTUATION)
    # NOTE(review): the upper bound \x9f keeps the C1 control range;
    # \x7f (plain ASCII) may have been intended -- confirm.
    data = re.sub(r"[^\x00-\x9f]", r"", data)
    data = remove_contractions(data)
    data = split_alphanum(data)
    data = re.sub(r"\.\.+|\.+\,", r", ", data)
    data = re.sub(r"\W+\)", r")", data)
    data = re.sub(r"\(\W+", r"(", data)
    data = re.sub(r"\s+\.", ".", data)
    data = replace_abbreviations(data)
    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest of the sentence ("I" -> "i", proper nouns lost).
    sentences = [
        sent[:1].upper() + sent[1:]
        for sent in split_sentences(data)
        if len(sent.split()) > 2
    ]
    data = " ".join(sentences)
    # Exactly one space after every comma; joining the comma-split pieces
    # with ", " doubled the space wherever one was already present.
    data = re.sub(r",\s*", ", ", data)
    return data

In [104]:
data = "Friends, stoners, red-eyed countrymen, lend me your ears; for I bring unto thee a tale of the Blue Dream... T’was a calm April night, 2014 it was, and I had eagerly purchased an eighth of some pungent Blue Dream. It’s abundance of sugary trichomes, paired with the thick density of the bud was enough to bring a tear to your eye. I enthusiastically ground up the cheeba, packed a generous bowl and went to town. Eight minutes and a bowl later, I was beginning to assume that my herb wasn’t all that strong…but then it hit me like a 150-ton locomotive of euphoria. “Whoooa” was the only thing that I could say, as I looked at everything around the living room. Everything looked as if it were lagging behind by a few frames, and this cerebral adventure lasted for the first few minutes…but just when I thought that Blue Dream had shown me everything there was to experience about her, her sativa effects began to kick in. All of a sudden, I felt as if I was briskly cruising on a warm cloud, which was followed by an amazing burst of energy. Folks let me tell you, if you’d ever like to find out how an eagle feels when it spreads its majestic wings and takes to the air at 80 mph., this strain is a kickass tool to take you there. Finally, when all of your euphoric energy has been expended, Blue Dream ends her experience with a mellow cruise induced by her indica side. Call in at Jimmy John’s and order 12 sandwiches, fire up Netflix, and take it easy on the couch until you slowly begin to melt into the furniture, because you're going to start to drift off into your happy place; and as soon as you reach that critical point of relaxation, you’re going to sleep like a sloth on twelve doses of Ambien. 
Folks, I guess the moral of the story here is that Blue Dream is an outstanding and pleasurable strain that is fun for cannabis enthusiasts anywhere on the experience spectrum; from the novice user who is looking to have an easy-going yet memorable experience, to the seasoned smoker who owns a laser pointer and a cat, and anybody in between; but my review alone can’t depict the exquisite effects that Blue Dream has to offer. Roll up a liberal amount of Blue Dream, spark it up, and let her take you on a spectacular trip; you’ll be thankful you did when your mind is blissfully floating through the heavens 15.5 %abd. This is number 1 bs ( khabib time ma man!)  ."