In [1]:
import pandas as pd
import regex as re
import nltk
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import collections
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline

from bs4 import BeautifulSoup


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jobethschroeter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jobethschroeter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import both datasets

In [2]:
cocktails = pd.read_csv('../data/cocktails_comments.csv')
cocktails.drop(columns=['Unnamed: 0', 'created_utc'], inplace=True) 
cocktails.head()

Unnamed: 0,subreddit,body
0,cocktails,"Mine didn’t pop up, but it came away from the ..."
1,cocktails,Yes it is.
2,cocktails,Are you still able to discern the Bombay Sapph...
3,cocktails,"Damn, just ran out of gin and curacao. I'm gue..."
4,cocktails,Jesus... you have a bottle of Blanton's and yo...


In [3]:
cocktails['body'][0]

'Mine didn’t pop up, but it came away from the side of the glass, and you could spin it around for fun. Eventually melted, and I drank it. \n\nIt did change the flavor of the drink as you go which is pretty neat.'

In [4]:
cocktails.isnull().sum()
# no missing values

subreddit    0
body         0
dtype: int64

In [5]:
cocktails.shape

(20000, 2)

In [6]:
wine = pd.read_csv('../data/wine_comments.csv')
wine.drop(columns=['Unnamed: 0', 'created_utc'], inplace=True)
wine.head()

Unnamed: 0,subreddit,body
0,wine,QQ - how long did you let it breathe for befor...
1,wine,I think this is a serious question for some pe...
2,wine,Definitely put Staglin in this mix. Such an am...
3,wine,"Oh I just tried vaping some fragrant, lemony c..."
4,wine,I mean this is why Parker points start at 50 r...


In [7]:
wine['body'][0]

'QQ - how long did you let it breathe for before you started drinking? Did you decant or choose not to?'

In [8]:
wine.isnull().sum()
# no missing values

subreddit    0
body         0
dtype: int64

### Clean the Body

In [None]:
# stop_words = stopwords.words('english')
# # stop_words.append('like')
# # stop_words.append('one')
# stop_words

In [9]:
# NLP1 lesson from Matt Bremms GA DSI 11

# then put this into X_train, y_train
# I need to combine both dataframes 
# then into CountVectorizer

# Instantiate the models 
lemmatizer = WordNetLemmatizer()


def status_words(status):
    
    review_text = BeautifulSoup(status).get_text()
    # Removed HTLM
    
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # Removed Non Letter
    
    words = letters_only.lower().split()
    # Tokenize without official tokenizer
    
    stops = set(stopwords.words('english'))
#     stops = set(stop_words)
    # Remove Stopwords
    
    meaningful_words = [lemmatizer.lemmatize(w) for w in words if w not in stops]
    # list

    return(' '.join(meaningful_words))

In [14]:
# map the clean text onto the DataFrame
cocktails['body_clean'].copy = cocktails['body'].map(status_words)

In [22]:
cocktails.to_csv('../data/clean_cocktail.csv', index=None)

In [16]:
wine['body_clean'] = wine['body'].map(status_words)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

https://redd.it/8enkkp" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document 

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [11]:
# Examine New Dataframes
cocktails.isnull().sum()

subreddit     0
body          0
body_clean    0
dtype: int64

In [12]:
cocktails['body'][192]

'Do it!'

In [13]:
cocktails['body_clean'][192]

''

In [None]:
wine['body'][0]

In [None]:
wine['body_clean'][0]

In [None]:
# this will count the number of tokens in the df['body'] before any cleaning

def token_counts(series):
    list_tokens = [w.lower() for w in series]
    string_tokens = str(list_tokens)
    tokens = BeautifulSoup(string_tokens).get_text()
    return tokens

In [None]:
initial_tokens = token_counts(cocktails['body'])
print('Inital Cocktail', len(initial_tokens))

initial_tokens = token_counts(cocktails['body_clean'])
print('Clean Cocktail', len(initial_tokens))

print('='*22)
initial_tokens = token_counts(wine['body'])
print('Initial Wine', len(initial_tokens))

initial_tokens = token_counts(wine['body_clean'])
print('Clean Wine', len(initial_tokens))

In [None]:
def top_10(series):
    clean_tokens = token_counts(series)
    count = Counter(clean_tokens.split())
    return count.most_common(10)