In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
import unicodedata

from ast import literal_eval

from gensim.models import Word2Vec
import gensim
# Report the installed gensim version alongside the notebook's results.
# NOTE(review): the label is spelled "Genism" (sic); it is left unchanged here so
# that it still matches the recorded cell output.
print("Genism Version: ", gensim.__version__)

Genism Version:  4.3.0


In [2]:
# Sample customer prompts used to exercise the text-preprocessing steps below.
# Questions 11 and 12 are variations of questions 1 and 3 respectively.
question1 = "I want to split a bottle of wine with my friends for a birthday. Were all most likely eating something different for dinner."
question2 = "Tonight I will be eating pasta at the Italian restaurant for dinner. I don't like sweet wines."
question3 = "I like Cabernets from Napa Valley"
question4 = "I'm looking to try something different for my dinner tonight. I'm having a white fish and tend to go for acidic white wines."
question5 = "I don't know what I want."
question6 = "Do you have a red wine that is light? Not too heavy."
question7 = "Do you have any heavy flavorful red wines?"
question8 = "I am looking for a dry white wine that's easy to drink."
question9 = "Wine is for snobs"
question10 = "I am not normally a wine drinker. What do you recommend?"
question11 = "I want to split a bottle of red wine with my friends for a birthday. Were all most likely eating something different for dinner."
question12 = "I like Cabernet Sauvignon s from Napa Valley"

# All prompts, in numeric order, one per line so additions are easy to review.
questions = [
    question1,
    question2,
    question3,
    question4,
    question5,
    question6,
    question7,
    question8,
    question9,
    question10,
    question11,
    question12,
]

In [3]:
# Echo every prompt with a 1-based label.
for number, prompt in enumerate(questions, start=1):
    print(f"Question {number} : {prompt}")

Question 1 : I want to split a bottle of wine with my friends for a birthday. Were all most likely eating something different for dinner.
Question 2 : Tonight I will be eating pasta at the Italian restaurant for dinner. I don't like sweet wines.
Question 3 : I like Cabernets from Napa Valley
Question 4 : I'm looking to try something different for my dinner tonight. I'm having a white fish and tend to go for acidic white wines.
Question 5 : I don't know what I want.
Question 6 : Do you have a red wine that is light? Not too heavy.
Question 7 : Do you have any heavy flavorful red wines?
Question 8 : I am looking for a dry white wine that's easy to drink.
Question 9 : Wine is for snobs
Question 10 : I am not normally a wine drinker. What do you recommend?
Question 11 : I want to split a bottle of red wine with my friends for a birthday. Were all most likely eating something different for dinner.
Question 12 : I like Cabernet Sauvignon s from Napa Valley


In [4]:
# One row per prompt, held in a single 'question' column.
questions_df = pd.DataFrame({'question': questions})

In [5]:
# Punctuation characters to strip from tokens; "_" is excluded from the strip set
# so underscore-joined terms stay intact.
punc = string.punctuation
punc_less_underscore = "".join(ch for ch in punc if ch != "_")

# NLTK's English stop words extended with two extra terms.
stop_words = [*stopwords.words('english'), "wine", 'vinho']

def strip_accents(STR):
    """Return STR with combining accent marks removed.

    The text is NFD-normalized, then every character whose Unicode category
    is 'Mn' is dropped; all other characters are kept in order.
    """
    decomposed = unicodedata.normalize('NFD', STR)
    kept_chars = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

def preprocess_text(text):
    """
    Normalize a raw string into a space-separated string of lemmas.

    Steps, in order:
       -- Strips accents (strip_accents) and replaces any remaining non-ASCII
          character with a space.
       -- Lower-cases the text and turns tabs/newlines into spaces.
       -- Tokenizes with word_tokenize, then removes punctuation from every token.
             -- NOTE the underscore is kept for things like acidity_category.
       -- Drops tokens found in the module-level `stop_words`.
       -- Tags the remaining tokens with the universal POS tagset and lemmatizes
          each one (VERB -> 'v', ADJ -> 'a', ADV -> 'r', anything else -> 'n').
       -- Drops tokens that are "" or " " and joins the rest with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The processed tokens joined by a single space ("" if nothing survives).

    NOTE(review): stop words are removed *before* lemmatization, so an inflected
    form whose lemma is a stop word survives (the recorded outputs show
    "wines" coming out as "wine"). That behavior is intentionally preserved here
    because later steps match phrases against the processed text.
    """
    # WordNet POS codes for the universal tags that have one; the rest fall back to noun.
    wordnet_pos = {'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}

    text = strip_accents(text)
    text = re.sub(r'[^\x00-\x7F]', ' ', text)  # removes any remaining non-ascii chars
    text = text.lower().replace("\t", " ").replace("\n", " ")

    re_punct = re.compile('[%s]' % re.escape(punc_less_underscore))
    stripped_tokens = [re_punct.sub('', token) for token in word_tokenize(text)]

    # Set lookup keeps the stop-word filter linear in the number of tokens.
    stop_set = set(stop_words)
    kept_tokens = [token for token in stripped_tokens if token not in stop_set]

    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, wordnet_pos.get(tag, 'n'))
        for word, tag in nltk.tag.pos_tag(kept_tokens, 'universal')
    ]

    # Tokens emptied by punctuation stripping are discarded here, in one pass,
    # after tagging/lemmatizing (same point in the pipeline as before).
    return " ".join(lemma for lemma in lemmas if lemma not in ("", " "))

In [6]:
# Clean every raw question, then keep a whitespace-split token list next to it.
questions_df['processed_question_str'] = questions_df['question'].map(preprocess_text)
questions_df['processed_question_tokens'] = questions_df['processed_question_str'].map(str.split)

In [7]:
# Display the raw questions next to their processed string and token list.
# NOTE(review): the saved output below lists rows 0-9 only, while `questions`
# holds 12 entries -- the output appears to be stale; re-run the cell to refresh it.
questions_df

Unnamed: 0,question,processed_question_str,processed_question_tokens
0,I want to split a bottle of wine with my frien...,want split bottle friend birthday likely eat s...,"[want, split, bottle, friend, birthday, likely..."
1,Tonight I will be eating pasta at the Italian ...,tonight eat pasta italian restaurant dinner nt...,"[tonight, eat, pasta, italian, restaurant, din..."
2,I like Cabernets from Napa Valley,like cabernet napa valley,"[like, cabernet, napa, valley]"
3,I'm looking to try something different for my ...,look try something different dinner tonight wh...,"[look, try, something, different, dinner, toni..."
4,I don't know what I want.,nt know want,"[nt, know, want]"
5,Do you have a red wine that is light? Not too ...,red light heavy,"[red, light, heavy]"
6,Do you have any heavy flavorful red wines?,heavy flavorful red wine,"[heavy, flavorful, red, wine]"
7,I am looking for a dry white wine that's easy ...,look dry white easy drink,"[look, dry, white, easy, drink]"
8,Wine is for snobs,snob,[snob]
9,I am not normally a wine drinker. What do you ...,normally drinker recommend,"[normally, drinker, recommend]"


In [8]:
# Load the scored bigram table; the "bigram" column is parsed with literal_eval
# so each cell becomes a Python object rather than its string representation.
filtered_bigram = pd.read_csv(
    '../data/filtered_bigrams.csv',
    converters={"bigram": literal_eval},
)

In [9]:
# Peek at the first rows of the bigram table (columns shown: bigram, pmi).
filtered_bigram.head()

Unnamed: 0,bigram,pmi
0,"(hong, kong)",17.14411
1,"(1 3545217 ambassador, petrolhead)",17.028632
2,"(1 1260249 robbie, priddle)",17.028632
3,"(antao, vaz)",17.027633
4,"(1 15397767 aaron, blazer)",16.88777


In [10]:
# Sanity check: show the row whose bigram is ('cabernet', 'sauvignon').
filtered_bigram.loc[filtered_bigram['bigram'] == ('cabernet', 'sauvignon')]

Unnamed: 0,bigram,pmi
777,"(cabernet, sauvignon)",8.694494


In [11]:
# (rows, columns) of the bigram table.
filtered_bigram.shape

(4944, 2)

In [12]:
# Space-joined phrase for every bigram, skipping pairs in which BOTH parts are
# two characters or shorter.
# NOTE(review): a pair is kept when either part is longer than two characters;
# confirm that "either" (rather than "both") is the intended rule.
bigrams = [
    ' '.join(pair)
    for pair in filtered_bigram['bigram'].values
    if not (len(pair[0]) <= 2 and len(pair[1]) <= 2)
]

def replace_ngram(x, ngrams=None, verbose=True):
    """Join known multi-word phrases in `x` with underscores.

    Each phrase is applied in list order. A phrase is only replaced where it is
    not glued to a letter or digit on either side, so "red wine" no longer
    matches inside "hundred wineries". An underscore still counts as a boundary,
    which lets overlapping phrases chain into longer n-grams (e.g.
    "try something" + "something different" -> "try_something_different").

    Parameters
    ----------
    x : str
        Text to rewrite (typically an already preprocessed question).
    ngrams : iterable of str, optional
        Space-separated phrases to join. Defaults to the module-level `bigrams`.
    verbose : bool, default True
        When True, print the text before and after replacement (the original
        behavior); pass False to silence the output.

    Returns
    -------
    str
        `x` with every matched phrase's whitespace replaced by underscores.
    """
    if ngrams is None:
        ngrams = bigrams
    if verbose:
        print(x)
    for gram in ngrams:
        # Cheap substring screen: a phrase that is not a substring cannot match,
        # so the regex is only built for the few candidate phrases.
        if gram not in x:
            continue
        joined = '_'.join(gram.split())
        # [^\W_] is "letter or digit"; the lookarounds reject mid-word matches
        # while still allowing "_" next to the phrase.
        pattern = r'(?<![^\W_])' + re.escape(gram) + r'(?![^\W_])'
        # A function replacement keeps `joined` literal (no backslash expansion).
        x = re.sub(pattern, lambda _match: joined, x)
    if verbose:
        print(x)
    return x

In [13]:
# Rewrite the processed strings so that known bigrams are underscore-joined.
# NOTE(review): `processed_question_tokens` was derived from this column before
# this step, so it does not contain the joined n-grams -- confirm that is intended.
questions_df['processed_question_str'] = questions_df['processed_question_str'].map(replace_ngram)

want split bottle friend birthday likely eat something different dinner
want split bottle friend birthday likely eat something_different dinner
tonight eat pasta italian restaurant dinner nt like sweet wine
tonight eat pasta italian_restaurant dinner nt like sweet wine
like cabernet napa valley
like cabernet napa_valley
look try something different dinner tonight white fish tend go acidic white wine
look try_something_different dinner tonight white_fish tend go acidic white_wine
nt know want
nt know want
red light heavy
red light heavy
heavy flavorful red wine
heavy flavorful red wine
look dry white easy drink
look dry white easy_drink
snob
snob
normally drinker recommend
normally drinker recommend
want split bottle red friend birthday likely eat something different dinner
want split bottle red friend birthday likely eat something_different dinner
like cabernet sauvignon napa valley
like cabernet_sauvignon napa_valley


In [14]:
# Alternative export that keeps only the raw question and its processed string:
#questions_df[['question', 'processed_question_str']].to_csv("../data/questions.csv", index=False)
# Write every column of questions_df to CSV, without the index.
questions_df.to_csv("../data/questions.csv", index=False)