# Clean Wine Comments

In [1]:
import pandas as pd
import regex as re
import nltk
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import collections
from collections import Counter

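# One-time NLTK data downloads (uncomment on a fresh environment).
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')  # corpus used by WordNetLemmatizer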

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup

In [2]:
wine = pd.read_csv('../data/wine_50comments.csv')
# wine.drop(columns=['Unnamed: 0', 'created_utc'], inplace=True)
# wine.head()

In [3]:
wine.dtypes


Unnamed: 0      int64
subreddit      object
body           object
created_utc     int64
dtype: object

In [None]:
wine.shape

In [None]:
wine['body'][0]

In [None]:
# word in body of subreddit 
words = wine['body']

In [None]:
# this will count the number of tokens in the wine['body'] before any cleaning

def word_count(series):
    '''Join every entry of ``series`` into one lower-cased, HTML-free string.

    Despite the name, this returns text rather than a number: callers split
    the result on whitespace to count or tally the tokens.

    Parameters
    ----------
    series : iterable of str
        Text entries, e.g. a DataFrame column. Entries that are not strings
        (such as NaN for a missing value) are skipped.

    Returns
    -------
    str
        All entries lower-cased, joined by single spaces, with HTML markup
        removed by BeautifulSoup.
    '''
    # Join with spaces instead of str(list): the list repr glues brackets,
    # quotes and commas onto tokens and turns real newlines into the two
    # characters "\n", which skews both the token count and word frequencies.
    joined_text = ' '.join(w.lower() for w in series if isinstance(w, str))

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(joined_text, 'html.parser').get_text()

len(word_count(wine['body']).split())

# 1718955 words before cleaning
# (figure recorded from an earlier run; re-run this cell to refresh it)

In [None]:
# put stopwords here: may want to append to 'english' 

# NLTK's English stop words plus filler words specific to this corpus.
# Other candidates considered but currently left in: 'oz', 'wine', 'good'.
extra_stop_words = ['like', 'one']
stop_words = stopwords.words('english') + extra_stop_words

In [None]:
# modified from NLP1 lesson from Matt Bremms GA DSI 11

# Instantiate the models 
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def status_words(status):
    '''Clean the text of one comment and return it as a single string.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case, split on whitespace, drop stop words, then lemmatize what
    is left. Intended to be applied row by row, e.g.
    ``wine['body'].map(status_words)``.

    Parameters
    ----------
    status : str
        Raw text of a single comment. Non-string values (e.g. NaN or None
        for a missing body) are treated as empty text.

    Returns
    -------
    str
        Lower-cased, lemmatized tokens joined by single spaces; ``''`` when
        nothing survives cleaning.
    '''
    # A missing body is not markup text; return an empty document rather
    # than letting one bad row abort the whole .map() call.
    if not isinstance(status, str):
        return ''

    # Remove HTML. The parser is named explicitly so results do not depend
    # on which optional parsers are installed.
    review_text = BeautifulSoup(status, 'html.parser').get_text()

    # Replace anything that is not an ASCII letter with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)

    # Tokenize without an official tokenizer.
    words = letters_only.lower().split()

    # Uses the notebook-level stop_words list; a set makes the membership
    # test in the comprehension below fast.
    stops = set(stop_words)

    # Drop stop words, then lemmatize the rest.
    # (Swap in stemmer.stem(w) here to compare against stemming.)
    meaningful_words = [lemmatizer.lemmatize(w) for w in words if w not in stops]

    return ' '.join(meaningful_words)

In [None]:
wine.head(2)

In [None]:
# # Test out some lines to see what the lemmatizer is doing 

# Whitespace-split one raw comment and lemmatize each token as-is
# (no lower-casing or punctuation stripping) to inspect the lemmatizer alone.
sample_comment = wine['body'][9876]
original_text = sample_comment.split()

lemmatizer = WordNetLemmatizer()
lemmy = list(map(lemmatizer.lemmatize, original_text))

In [None]:
# Compare tokens to lemmatized version. 
# NLP1 Danielle Medellin (she/her) - NYC

# Print only the (token, lemma) pairs that the lemmatizer actually changed.
# The earlier bare list(zip(...)) was not the last expression of the cell,
# so it was built and discarded without being displayed.
for token, lemma in zip(original_text, lemmy):
    if token != lemma:
        print(token, lemma)

In [None]:
# # Test out some lines to see what the stemmer is doing 
# original_text = wine['body'][34].split()

# stem_words = [stemmer.stem(w) for w in original_text]

# # Compare tokens to lemmatized version. 
# # NLP1 Danielle Medellin (she/her) - NYC

# list(zip(original_text, stem_words))

# for a, b in zip(original_text, stem_words):
#     if a != b:
#         print(a,b)

In [None]:
# add a column onto DF with clean data
# I want to keep the original data as well for comparison

wine['body_clean'] = wine['body'].map(status_words)

In [None]:
# Original 
wine['body'][0]

In [None]:
# Same line clean 
wine['body_clean'][0]

In [None]:
# https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
# Top 10 most common words from cleaned data 

# word_count() returns one long string; split it on whitespace and tally tokens.
clean_tokens = word_count(wine['body_clean'])
count = Counter(clean_tokens.split())
# Ten most frequent (word, count) pairs; reused by the plotting cell.
top_wine = count.most_common(10)
top_wine

In [None]:
# visually look at this 

# Unzip the (word, count) pairs into parallel tuples for plotting.
names, values = zip(*top_wine)

fig, ax = plt.subplots(figsize=(9, 6))
ax.barh(names, values)
# barh draws the first bar at the bottom; flip the axis so the most
# frequent word sits at the top of the chart.
ax.invert_yaxis()
ax.set_title('Top 10 Wine Words', fontsize=18)
ax.set_xlabel('Occurrences in cleaned comments', fontsize=14)
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=16);

In [None]:
top_wine

In [None]:
# Top 10 most common words from original data 

# Use raw_* names so the uncleaned tallies do not overwrite clean_tokens
# and count, which hold the cleaned-data results.
raw_tokens = word_count(wine['body'])
raw_count = Counter(raw_tokens.split())
raw_count.most_common(10)

In [None]:
# this is for 20K pull 
# wine.to_csv('../data/clean_wine.csv', index=None)

# save changes to_csv for next notebook 
# comment out to prevent accidental override

In [None]:
# this is for 50K pull

# wine.to_csv('../data/clean_50wine.csv', index=None)

## Look at the cleaning step by step to see how it is working beginning to end

In [None]:
wine.loc[400:410]

In [None]:
# Step 1: parse one raw comment. The parser is named explicitly so the
# result does not depend on which optional parsers are installed.
ex = BeautifulSoup(wine['body'][498], 'html.parser')
ex

In [None]:
ex1 = ex.get_text()
ex1

In [None]:
ex2 = re.sub('[^a-zA-Z]', ' ', ex1)
ex2

In [None]:
ex3 = ex2.lower().split()
ex3

In [None]:
stops = set(stop_words)
ex4 = [lemmatizer.lemmatize(w) for w in ex3 if w not in stops]
ex4