In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
import Class_replace_impute_encode as rie
import Class_regression as reg
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# my_analyzer replaces both the preprocessor and tokenizer
# it also replaces stop word removal and ngram constructions

def my_analyzer(s):
    """Turn one raw document string into a list of normalized terms.

    Steps, in order: lower-case the text, turn commas into sentence breaks,
    tokenize with NLTK, drop artifact tokens, map synonyms, remove stop
    words / punctuation / pronouns / numbers / one-character tokens, then
    lemmatize tokens whose POS tag maps to a WordNet POS and stem the rest.

    Parameters
    ----------
    s : str
        One document.

    Returns
    -------
    list of str
        The normalized terms, in document order.
    """
    # Synonym List
    # NOTE(review): the lookup below is done one token at a time, so the
    # two-word keys 'air bag' and 'seat belt' can never match, and values
    # such as 'would not' become a single token containing a space.
    syns = {'veh': 'vehicle', 'car': 'vehicle', 'chev':'cheverolet', \
              'chevy':'cheverolet', 'air bag': 'airbag', \
              'seat belt':'seatbelt', "n't":'not', 'to30':'to 30', \
              'wont':'would not', 'cant':'can not', 'cannot':'can not', \
              'couldnt':'could not', 'shouldnt':'should not', \
              'wouldnt':'would not', }

    # Preprocess String s
    s = s.lower()
    s = s.replace(',', '. ')

    # Tokenize, then drop tokens containing '*' and known artifact tokens
    artifacts = {"''", "``", 'description', 'dtype', 'object', "'s"}
    tokens = [word.replace(',', '') for word in word_tokenize(s)]
    tokens = [word for word in tokens
              if '*' not in word and word not in artifacts]

    # Map synonyms
    tokens = [syns.get(word, word) for word in tokens]

    # Remove stop words (a set gives constant-time membership tests)
    pronouns = ['i', 'he', 'she', 'it', 'him', 'they', 'we', 'us', 'them']
    stop = set(stopwords.words('english'))
    stop.update(string.punctuation)      # each punctuation character
    stop.update(['..', '...'])
    stop.update(pronouns)
    filtered_terms = [word for word in tokens
                      if word not in stop
                      and len(word) > 1
                      and not word.replace('.', '', 1).isnumeric()
                      and not word.replace("'", '', 2).isnumeric()]

    # Lemmatization & Stemming
    # Lemmatization needs a WordNet POS; terms whose tag has no WordNet
    # equivalent are stemmed instead.
    stemmer = SnowballStemmer("english")
    wn_tags = {'N':wn.NOUN, 'J':wn.ADJ, 'V':wn.VERB, 'R':wn.ADV}
    wnl = WordNetLemmatizer()
    stemmed_tokens = []
    for term, tag in pos_tag(filtered_terms, lang='eng'):
        wn_pos = wn_tags.get(tag[0])
        if wn_pos is None:
            # Explicit lookup instead of a bare `except:` so that real
            # lemmatizer errors are no longer silently turned into stems.
            stemmed_tokens.append(stemmer.stem(term))
        else:
            stemmed_tokens.append(wnl.lemmatize(term, pos=wn_pos))
    return stemmed_tokens

# Further Customization of Stopping and Stemming using NLTK
def my_preprocessor(s):
    """Lower-case one document and replace each comma with '. '.

    Parameters
    ----------
    s : str
        One document (the vectorizer sends one string at a time).

    Returns
    -------
    str
        The preprocessed text.
    """
    # The debug print was removed: it emitted one line per document.
    return s.lower().replace(',', '. ')
    
def my_tokenizer(s):
    """Tokenize one document with NLTK and drop artifact tokens.

    Commas are stripped from every token. Tokens that contain '*' and the
    tokens "''", "``", 'description' and 'dtype' are removed.

    Parameters
    ----------
    s : str
        One (already preprocessed) document.

    Returns
    -------
    list of str
        The remaining tokens, in document order.
    """
    unwanted = {"''", "``", 'description', 'dtype'}
    tokens = [word.replace(',', '') for word in word_tokenize(s)]
    # The old test `word.find('*') != True` compared an index with True
    # (== 1), so it only dropped tokens whose '*' was at position 1.
    return [word for word in tokens
            if '*' not in word and word not in unwanted]

In [3]:
# Widen the pandas display column width so long review text is not truncated
pd.set_option('display.max_colwidth', 32000)
# Read the California Cabernet wine reviews
df = pd.read_excel("CaliforniaCabernet.xlsx")
# Setup simple constants
n_docs     = len(df['description'])
n_samples  = n_docs
m_features = None
s_words    = 'english'
ngram = (1,2)

# Setup reviews in list 'discussions' (one string per review; missing
# descriptions become the string 'nan', as with the former "%s" formatting)
discussions = [str(text) for text in df['description'].iloc[:n_samples]]

In [4]:
# Build the term-count-by-review matrix with the custom analyzer.
# NOTE(review): the scikit-learn docs say ngram_range only applies when
# `analyzer` is not a callable, so it presumably has no effect here - confirm.
cv = CountVectorizer(
    analyzer=my_analyzer,
    ngram_range=ngram,
    max_features=m_features,
    min_df=2,
    max_df=0.95,
)
tf = cv.fit_transform(discussions)

print("\nVectorizer Parameters\n", cv, "\n")


Vectorizer Parameters
 CountVectorizer(analyzer=<function my_analyzer at 0x000002084C6457B8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.95,
        max_features=None, min_df=2, ngram_range=(1, 2), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None) 



In [5]:
n_topics        = 9
max_iter        =  5
learning_offset = 20.
learning_method = 'online'
# LDA for TF-IDF x Doc Matrix
# A TF-IDF matrix can be derived from an existing term-count matrix with a
# TfidfTransformer; its parameters are printed for reference only.
tfidf_transformer = TfidfTransformer()
print("\nTF-IDF Parameters\n", tfidf_transformer.get_params(),"\n")
# Here the TF-IDF matrix is built directly from the text instead, so it is
# computed once and `tf_idf` always names the matrix.
tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=m_features,
                             analyzer=my_analyzer, ngram_range=ngram)
tf_idf = tfidf_vect.fit_transform(discussions)
print("\nTF_IDF Vectorizer Parameters\n", tfidf_vect, "\n")

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                learning_method=learning_method,
                                learning_offset=learning_offset,
                                random_state=12345)
# Only the fitted model is needed here; the document-topic matrix is
# obtained later with lda.transform().
lda.fit(tf_idf)
print('{:.<22s}{:>6d}'.format("Number of Reviews", tf_idf.shape[0]))
print('{:.<22s}{:>6d}'.format("Number of Terms",     tf_idf.shape[1]))
print("\nTopics Identified using LDA with TF_IDF")
# Take the vocabulary from the vectorizer the LDA input was built with.
# Newer scikit-learn releases provide get_feature_names_out(); older ones
# only have get_feature_names().
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tf_features = list(tfidf_vect.get_feature_names_out())
else:
    tf_features = tfidf_vect.get_feature_names()
max_words = 15
topic_description=[]
for topic_idx, topic in enumerate(lda.components_):
    # Highest-weighted terms first
    top_terms = " ".join(tf_features[i]
                         for i in topic.argsort()[:-max_words - 1:-1])
    # Store the terms themselves rather than slicing a fixed-width prefix
    # off the printed message, which breaks for two-digit topic numbers.
    topic_description.append(top_terms)
    print("Topic #%d: %s" % (topic_idx, top_terms))
    print()


TF-IDF Parameters
 {'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': False, 'use_idf': True} 


TF_IDF Vectorizer Parameters
 TfidfVectorizer(analyzer=<function my_analyzer at 0x000002084C6457B8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.95,
        max_features=None, min_df=2, ngram_range=(1, 2), norm='l2',
        preprocessor=None, smooth_idf=True, stop_words=None,
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) 

Number of Reviews..... 13135
Number of Terms.......  6263

Topics Identified using LDA with TF_IDF
Topic #0: wine flavor tannin black blackberry cabernet currant oak year fruit cherry dry rich show ripe

Topic #1: barely wait sweaty bay overpower weave chile front tongue create funky drop generosity acceptable underbelly

Topic #2: meet coconut tightly party wound lend fade s

In [6]:
# Split each topic's space-separated term string into a list of terms.
# Entries that are already lists are left alone, so re-running this cell
# no longer fails with "list has no attribute split".
topic_description[:] = [
    terms.split(' ') if isinstance(terms, str) else terms
    for terms in topic_description
]

In [7]:
# Assign every review to its highest-probability topic.
temp = lda.transform(tf_idf)          # document x topic weights
# The column is named 'Topic' because the pivot tables and the data
# dictionary in later cells refer to 'Topic', not 'Topic#'.
temp1 = pd.DataFrame({'Topic': temp.argmax(axis=1)}, index=df.index)
# Drop any 'Topic' column left by a previous run so the cell can be re-run.
df = df.drop('Topic', axis=1, errors='ignore').join(temp1)

In [8]:
# Preview the first rows of df
df.head()

Unnamed: 0,Review,description,year,points,price,winery,Region,Topic
0,1,"This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.",,96,235.0,Heitz,Napa,0
1,17,"This blockbuster, powerhouse of a wine suggests blueberry pie and chocolate as it opens in the glass. On the palate, it's smooth and seductively silky, offering complex cedar, peppercorn and peppery oak seasonings amidst its dense richness. It finishes with finesse and spice.",,95,325.0,Hall,Napa,4
2,48,"Blended with 9% Malbec, 9% Cabernet Franc and 5% Petit Verdot, this is a perennial classic for the winery, the sister brand of Cuvaison. Juicy in cherry and cassis, it sustains big, pillowy tannins and tar, suggesting more time for the fruit to match up with the structure. Drink through 2020.",,90,60.0,Brandlin,Napa,5
3,68,"From the producer's monumental Atlas Peak vineyard, this is a tightly wound, solidly constructed mountain Cab, blended with a handful of Petit Verdot. Tobacco, black tea and a sliver of coconut intermingle around a medium-bodied whole that will benefit from cellaring, through 2021.",,91,85.0,Michael Mondavi Family Estate,Napa,0
4,70,"A juiciness of cherry and vanilla spark the opening of this wine, a celebration of the vintage, appellation and in this case, fruit-forwardness of the variety. With a backbone of oak and cedar, it has smooth tannins and medium weight, finishing in mocha chocolate. Drink now through 2022.",,91,60.0,Provenance Vineyards,Napa,4


In [9]:
# Aggregate points and price per topic (pivot_table's default aggregation),
# then join each topic's term list on; both frames are indexed by topic number.
topic_terms = pd.DataFrame(topic_description)
table1 = df.pivot_table(values=['points', 'price'], index='Topic').join(topic_terms)

## Table 1 with average points, average price and 15 words of cluster description

In [10]:
# Relabel the aggregated columns. DataFrame.rename is the supported way to
# relabel columns; rename_axis with a mapping was deprecated for this use.
table1 = table1.rename(columns={'points': 'avg_points', 'price': 'avg_price'})
table1.T

Topic,0,1,2,3,4,5,6,7,8
avg_points,90.0806,84.5,87.2632,90,85.6479,89.0723,82.0909,84.5,86
avg_price,64.7879,28.4286,46.7895,65,33.7153,57.3002,24,47,33.7778
0,wine,barely,meet,punch,flavor,palate,sirah,brightness,bouquet
1,flavor,wait,coconut,expansive,blackberry,petit,petite,weedy,effort
2,tannin,sweaty,tightly,cardamom,cherry,verdot,cherry-berry,muscular,santa
3,black,bay,party,coast,dry,nose,bottling,breadth,light-bodied
4,blackberry,overpower,wound,aromatics,soft,merlot,showy,recall,elevation
5,cabernet,weave,lend,boast,drink,malbec,reduce,farm,lurk
6,currant,chile,fade,handful,wine,small,figure,opposite,loam
7,oak,front,saddle,enjoyment,sweet,franc,appropriately,cake,slate


In [11]:
# Count reviews for every Region x Topic pair; margins=True adds 'All' totals
table2 = df.pivot_table(
    values='Review',
    index='Region',
    columns='Topic',
    aggfunc='count',
    fill_value=0,
    margins=True,
)

## Table 2 with Percent of reviews in that region associated with each of 9 clusters

In [12]:
def percent_convert(x):
    """Convert each row of a count table into percentages of its 'All' total.

    Every value becomes ``round(value * 100 / row['All'], 2)``. The totals
    are taken before anything is overwritten, so the result does not depend
    on where the 'All' column sits.

    Parameters
    ----------
    x : pandas.DataFrame
        Count table with an 'All' column holding the row totals.

    Returns
    -------
    pandas.DataFrame
        The same object ``x``, updated in place as before.
    """
    totals = x['All'].copy()
    percents = x.mul(100).div(totals, axis=0).round(2)
    # Replace whole columns so integer count columns become float cleanly.
    for column in x.columns:
        x[column] = percents[column]
    return x
# Turn the Region x Topic counts in table2 into row percentages and display them
percent_convert(table2)

Topic,0,1,2,3,4,5,6,7,8,All
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
California Other,26.77,0.0,0.0,0.0,71.22,1.34,0.27,0.0,0.4,100.0
Central Coast,50.7,0.17,0.28,0.0,43.62,5.0,0.0,0.0,0.22,100.0
Central Valley,33.99,0.99,0.99,0.0,61.58,2.46,0.0,0.0,0.0,100.0
Clear Lake,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0
High Valley,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0
Lake County,50.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,100.0
Mendocino,60.0,0.0,3.33,0.0,36.67,0.0,0.0,0.0,0.0,100.0
Mendocino County,62.07,0.0,0.0,0.0,34.48,3.45,0.0,0.0,0.0,100.0
Mendocino Ridge,66.67,0.0,0.0,0.0,33.33,0.0,0.0,0.0,0.0,100.0
Mendocino/Lake Counties,56.12,0.0,0.51,0.0,42.86,0.51,0.0,0.0,0.0,100.0


## Linear Regression to predict Price of Wine based on year, points, region and Topic#

In [13]:
# Drop the columns that are not used by the regression
df1 = df.drop(columns=['winery', 'description', 'Review'])

# Attribute map passed to ReplaceImputeEncode, one entry per column:
#   [type code, tuple of values, [two counts]]
# NOTE(review): 'Region' lists only two values although the region table
# above shows more; presumably the tuple should hold every valid category.
# Verify against ReplaceImputeEncode's handling of unlisted categories.
data_dictionary = {
    'year':   [0, (1985.0, 2016.0), [7461, 0]],
    'points': [0, (80, 100), [0, 0]],
    'price':  [0, (4.0, 625.0), [50, 0]],
    'Region': [2, ('California Other', 'South Coast'), [0, 0]],
    'Topic':  [2, (0, 4, 5, 8, 1, 2, 3, 6, 7), [0, 0]],
}


In [14]:
# Fill missing years with the most frequent year
most_common_year = df1['year'].mode()[0]
df1['year'] = df1['year'].fillna(most_common_year)

In [15]:
# Run df1 through ReplaceImputeEncode using the attribute map above
rie_fit = rie.ReplaceImputeEncode(
    data_map=data_dictionary,
    display=False,
)
encoded_df = rie_fit.fit_transform(df1)

In [16]:
# Price is the target; every other encoded column is a predictor
X = np.asarray(encoded_df.drop(columns='price'))
y = np.asarray(encoded_df['price'])
lr = LinearRegression()
# 10-fold cross-validation; with no `scoring` argument the estimator's own
# score method is used
lr_scores = cross_val_score(lr, X, y, cv=10)

## Cross-validated R² score using 10-fold cross validation

In [17]:
# cross_val_score used LinearRegression's default scorer (R^2), so the mean
# is a goodness-of-fit score (higher is better), not an error.
print("Cross-validated R^2 of Linear Regression Model : %.4f" % lr_scores.mean())

Cross Validated error of Linear Regression Model : 0.2937
