## Libraries

In [None]:
# !pip install -U sentence-transformers

In [4]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.decomposition import PCA

from nltk.tokenize import word_tokenize
nltk.download('punkt')

from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/haris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/haris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading Data

In [2]:
apple_df = pd.read_csv('apple_25Nov.csv')
amazon_df = pd.read_csv('amazon_25Nov.csv')
google_df = pd.read_csv('google_25Nov.csv')

In [3]:
apple_df

Unnamed: 0,Posted,Rating,Content,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized
0,2023-10-03 12:02:00,2,This is going on year 2 of having this product...,19,14,7,more than 2 years,6
1,2023-08-18 03:31:00,3,Good while it lasted. It started overheating a...,1,5,2,more than 2 years,6
2,2023-07-22 07:18:00,5,Awesome speaker especially for its size! We ...,4,1,3,more than 2 years,6
3,2023-07-13 02:28:00,5,Absolutely love them no issues at all from th...,1,2,1,more than 2 years,6
4,2023-07-08 09:27:00,5,Love this speaker! It’s so loud and the bass ...,2,2,2,more than 2 years,6
...,...,...,...,...,...,...,...,...
11237,2022-05-16 08:12:00,4,Being a subscriber to Apple Music this unit i...,25,10,10,10 months,3
11238,2022-05-15 04:32:00,5,This open box wasn’t even open it was just a s...,5,2,2,3 weeks,1
11239,2022-05-15 03:48:00,5,Absolutely love the HomePod mini! I use it for...,2,2,0,1 year,4
11240,2022-05-15 01:43:00,5,I absolutely love HomePod mini. I have two on...,18,11,6,7 months,3


In [4]:
print(len(apple_df))
print(len(amazon_df))
print(len(google_df))

11242
31646
10376


In [5]:
# pip install textblob 

In [6]:
# importing TextBlob function from textblob module 
from textblob import TextBlob
# passing the word to be pluralized as an argument to the TextBlob() function 
blobWord = TextBlob('flower')
 
# printing the plural word of the given blob word using the pluralize() function
print(blobWord.words.pluralize())

['flowers']


In [7]:
## Loading stop words
stop_words = list(stopwords.words('english'))

additional_words = ['applehome','googlehome','google','alexa','amazon','homepod','apple','siri', 'dot','pod', 'home','echo', 'amazonecho']

for word in additional_words:
    stop_words.append(word)

copy_stop_words = stop_words.copy()
for w in copy_stop_words:
#     print(w)
    blobword = TextBlob(str(w))
#     print(blobword.words.pluralize()[0])
    stop_words.append(str(blobword.words.pluralize()[0]))


## BERT BASE NLI

In [31]:
# ## BERT Model
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# model1 = SentenceTransformer('bert-large-nli-cls-token')
# model3 = SentenceTransformer('bert-large-nli-mean-tokens')

In [32]:
model4 = SentenceTransformer('bert-large-nli-cls-token')
model5 = SentenceTransformer('bert-large-nli-cls-token')

In [34]:
liwc = pd.read_csv('LIWC.csv')
set(liwc['Product'])

{'AmazonEcho', 'AppleHome', 'GoogleHome'}

In [37]:
apple_df_liwc = liwc.loc[liwc['Product'] == 'AppleHome']
amazon_df_liwc = liwc.loc[liwc['Product'] == 'AmazonEcho']
google_df_liwc = liwc.loc[liwc['Product'] == 'GoogleHome']

In [38]:
apple_df_liwc = apple_df_liwc[apple_df_liwc['OwnedFor'] != ' ']
amazon_df_liwc = amazon_df_liwc[amazon_df_liwc['OwnedFor'] != ' ']
google_df_liwc = google_df_liwc[google_df_liwc['OwnedFor'] != ' ']

print(len(apple_df_liwc))
print(len(amazon_df_liwc))
print(len(google_df_liwc))

11242
31646
10376


In [60]:
apple_df_liwc = apple_df_liwc.reset_index(drop = True)
amazon_df_liwc = amazon_df_liwc.reset_index(drop = True)
google_df_liwc = google_df_liwc.reset_index(drop = True)

In [39]:
## Data Template - we get our data in this format

# sentences = ['This framework generates embeddings for each input sentence',
#     'Sentences are passed as a list of string.',
#     'The quick brown fox jumps over the lazy dog.']


def cleaning_reviews(df,stop_words):
    sentences_apple = df['Content'].tolist()

    ## lowering sentences
    reviews = [sentences.lower() for sentences in sentences_apple]

    filtered_reviews = []
    Llen = []
    
    
    for rev in reviews:
        ## removing non-alphaumeric
        rev = re.sub('\W+',' ', rev)

        word_tokens = word_tokenize(rev)
        filtered_sentence = []
        
        count = 0
        
        ## Remove stopwords
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
                count += 1

        filtered_sentence = ' '.join(filtered_sentence)
        Llen.append(count)
        filtered_reviews.append(filtered_sentence)
        
    return filtered_reviews, Llen


In [40]:
## Now create top 100 bigrams
## ngram_range needs to be a tuple (1,2) for Uni and bigrams, (2,2) for just bigrams
def ngram_df(reviews,n_gram_range):
    count_vectorizer = CountVectorizer(ngram_range=n_gram_range)

  # Fit the CountVectorizer on the documents and transform them into N-gram matrices
    ngram_matrix = count_vectorizer.fit_transform(reviews)
    
    ## Finding counts of each bigram
    bigram_counts = ngram_matrix.sum(axis=0)

    ngram_matrix_dense = ngram_matrix.toarray()

    vocab_ngram = list(count_vectorizer.get_feature_names_out())

    ngram_df = pd.DataFrame(ngram_matrix_dense, columns=vocab_ngram)

    return ngram_df, bigram_counts

In [41]:
## Make embeddings based on the model provided

def make_embeddings(model, selected_reviews):
    sentence_embeddings = model.encode(selected_reviews)
    embeds=[]
    for sentence, embedding in zip(selected_reviews, sentence_embeddings):
        # print("Sentence:", sentence)
        # print("Embedding:", embedding)
        embeds.append(embedding)
        # print("")
        
    return embeds

# print(len(embeds))

In [42]:
## Selecting n_top ngrams

def top_ngrams_df(selected_reviews, n_top, n_gram_range):
    
    ## makes a df of ngrams, and find counts of each ngram
    df, bi_counts = ngram_df(selected_reviews,n_gram_range)
    bigrams = df.columns
    
    counts = np.array(bi_counts).flatten()

    bigrams_dict = dict(zip(bigrams, counts))
    
    bigrams_dict_list = sorted(bigrams_dict.items(), key = lambda x:x[1], reverse = True)[0:n_top]
    
    top_bigrams = []
    for gram in bigrams_dict_list:
        top_bigrams.append(gram[0])
        
    df = df.loc[:, df.columns.isin(top_bigrams)]
    
    return df 

In [62]:
def merging_embeds_ngrams_pos(model, df,df_liwc, stop_words, top_bigram_count, n_gram_range):
    
    
    ## getting cleaned reviews
    selected_reviews, Llen = cleaning_reviews(df,stop_words)
    
#     print(selected_reviews)
    ## making embeddings
    embeds = make_embeddings(model,selected_reviews)
    embeddings_df = pd.DataFrame(embeds)
    
    ## Making ngram df for top n counts
    ngram_df = top_ngrams_df(selected_reviews, top_bigram_count, n_gram_range)
#                              selected_reviews
    
    combined_df = pd.concat([embeddings_df, ngram_df], axis = 1)
    
    for col_liwc in df_liwc.columns[5:]:
        combined_df[col_liwc] = df_liwc[0:len(combined_df)][col_liwc]    
    
    pos_cols = ['NOUN_count', 'VERB_count', 'ADJ_count','OwnedFor','OwnedCategorized']
    ## Ading pos counts to daataframes
    for col in pos_cols:
        combined_df[col] = df[0:len(combined_df)][col]
        
    combined_df['review_len'] = Llen
    combined_df['Rating'] = df['Rating']
    
    
    cols_list = []
    ## converting colnames to string as embeddings give integer col names
    for col in combined_df.columns:
        cols_list.append(str(col))
        
    combined_df.columns = cols_list
    
    return combined_df
    
    

In [43]:
# apple_df_liwc.columns[5:]

Index(['WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
       'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
       'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
       'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
       'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend',
       'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat',
       'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
       'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve',
       'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture',
       'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home',
       'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent',
       'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC',
       'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP'],
      dtype='o

In [None]:
# apple_df_liwc['Comma']

## Processing Dataframes

### BERT Mean

In [63]:
apple_processed_BERT_mean = merging_embeds_ngrams_pos(model, apple_df,apple_df_liwc,stop_words, 100, (2,2))
apple_processed_BERT_mean

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,-0.158468,-0.439611,0.853581,-0.538286,-0.102484,-0.495698,-0.631310,-0.336134,0.848012,-0.620194,...,0.0,0.00,0.0,19,14,7,more than 2 years,6,43,2
1,-0.186794,-0.766404,1.157155,-1.119730,0.105233,-0.556254,-0.458684,-0.698586,-0.657113,0.782858,...,0.0,0.00,0.0,1,5,2,more than 2 years,6,10,3
2,0.010278,-0.581126,0.858493,0.427046,-0.857490,-1.103230,-0.574962,-0.633358,-0.520994,-0.388783,...,0.0,0.00,0.0,4,1,3,more than 2 years,6,8,5
3,-0.473216,-0.694514,0.407750,0.040162,-0.775717,-0.663174,-0.080214,0.116128,0.143263,-0.570184,...,0.0,0.00,0.0,1,2,1,more than 2 years,6,5,5
4,-0.118972,-0.502742,0.340337,0.553690,-1.073774,-1.113690,-0.744634,-0.646628,0.594307,0.251025,...,0.0,0.00,0.0,2,2,2,more than 2 years,6,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11237,-0.576978,-0.043660,0.186282,-0.263439,-0.040494,-0.256193,-0.161123,-0.979091,0.452203,-0.167423,...,0.0,0.00,0.0,25,10,10,10 months,3,42,4
11238,-1.046488,-0.376314,0.276193,-0.479360,-0.866527,-0.484462,0.063963,0.343758,-0.351352,-0.475854,...,0.0,3.08,0.0,5,2,2,3 weeks,1,11,5
11239,-0.907599,-0.494132,0.168148,0.644229,-1.047821,-0.994493,-0.245592,0.573410,0.128813,-0.216985,...,0.0,0.00,0.0,2,2,0,1 year,4,5,5
11240,0.121507,0.021508,0.184715,0.364868,-0.215259,-0.511104,-0.091738,-0.138234,0.900011,-0.440557,...,0.0,0.00,0.0,18,11,6,7 months,3,38,5


In [64]:
amazon_processed_BERT_mean = merging_embeds_ngrams_pos(model, amazon_df, amazon_df_liwc, stop_words, 100, (2,2))
amazon_processed_BERT_mean

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,-0.513297,-0.308858,0.526673,-0.023726,-0.140831,-0.247335,0.086016,-0.791743,0.410506,-0.937912,...,9.09,0.0,0.0,2,2,2,7 months,3,5,5
1,-0.180467,-0.379853,0.564663,-0.044409,-0.247112,-0.593197,-0.426389,0.282266,-0.668291,-0.233489,...,0.00,0.0,0.0,0,1,0,1.5 years,5,4,5
2,0.172752,-0.611308,-0.345173,-0.351720,-0.509771,-0.981686,0.186226,0.249284,0.434012,0.474398,...,3.70,0.0,0.0,2,6,2,10 months,3,12,5
3,-0.859924,-0.344483,0.951276,-0.908926,-0.257145,-0.550500,-0.104277,-0.375770,0.228479,-0.695640,...,0.00,0.0,0.0,5,4,0,1 year,4,11,5
4,-0.704675,-0.650758,1.024529,-0.469243,-0.332236,-0.549799,0.120132,-0.982889,0.914332,-0.520519,...,9.09,0.0,0.0,1,1,2,1.5 years,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31641,-0.832855,-0.233134,0.136406,0.217502,-0.632584,-0.844102,0.011074,-0.305169,-0.049922,0.126451,...,8.33,0.0,0.0,2,3,2,3 weeks,1,4,5
31642,-0.875283,-0.220690,0.064456,0.615256,-0.608861,-0.947127,0.208773,-0.805821,-0.275637,-0.210173,...,0.00,0.0,0.0,0,2,2,2 weeks,1,5,5
31643,-1.096967,-0.480100,0.887669,-0.073598,-0.745181,-0.744432,0.331998,-0.046960,-0.218125,-0.967949,...,0.00,0.0,0.0,5,2,1,1 month,2,6,5
31644,0.282827,-0.191884,0.645876,-0.484937,-0.342067,-0.177655,-0.784974,-0.017233,0.266659,0.123763,...,0.00,0.0,0.0,2,4,1,7 months,3,7,5


In [65]:
google_processed_BERT_mean = merging_embeds_ngrams_pos(model, google_df,google_df_liwc,stop_words, 100, (2,2))
google_processed_BERT_mean

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,-0.779565,-0.926940,0.973392,-0.050409,-0.201227,-0.862517,-0.614672,-0.423510,0.823083,-0.591712,...,0.00,0.0,0.0,7,6,1,more than 2 years,6,17,3
1,-0.064655,-0.588466,0.811885,-0.573566,-0.388068,-0.393929,-0.736215,-0.422256,0.180595,0.304754,...,0.00,0.0,0.0,3,6,2,1.5 years,5,13,5
2,0.053550,-0.440856,1.099830,-0.541663,-0.044520,-0.358902,-0.163585,-1.019400,-0.163759,0.748017,...,4.00,0.0,0.0,17,6,6,more than 2 years,6,34,4
3,-0.600825,-1.273891,0.240493,0.271028,-0.572578,-0.636670,-0.438937,-0.461204,0.744259,-0.142931,...,0.00,0.0,0.0,4,3,3,1.5 years,5,10,5
4,0.203512,-0.377288,0.651243,-0.190404,-0.873383,-1.049136,-0.387723,-0.311397,-0.074188,-0.067937,...,0.00,0.0,0.0,2,1,3,1 year,4,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10371,-0.430505,-0.039394,-0.040874,0.098835,0.054938,-0.386315,-0.124647,-0.412220,0.836161,-0.966618,...,7.69,0.0,0.0,4,6,1,1 month,2,13,4
10372,-0.935373,-0.868338,0.363158,-0.143107,-0.164336,-0.098602,-0.415622,-0.809009,0.364562,1.143269,...,0.00,0.0,0.0,1,1,4,2 months,2,9,4
10373,-0.162951,-0.555421,0.797093,-0.611321,0.063156,-0.294918,-0.717150,-0.303369,0.135705,0.397984,...,7.14,0.0,0.0,5,4,2,1 month,2,12,5
10374,-0.404358,-0.558255,0.677919,-0.501117,-0.096864,-0.847526,-0.029185,-0.629467,0.390242,-0.879607,...,2.00,0.0,0.0,2,2,1,3 weeks,1,7,5


### BERT Large

In [66]:
apple_processed_df_bert_large = merging_embeds_ngrams_pos(model1,apple_df,apple_df_liwc, stop_words, 100, (2,2))
apple_processed_df_bert_large

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,0.983310,0.661395,-0.005628,0.391475,0.599468,0.247196,-0.222022,-0.675306,1.326272,0.905621,...,0.0,0.00,0.0,19,14,7,more than 2 years,6,43,2
1,0.839777,0.546538,0.238667,0.423015,0.588731,-0.199983,0.761129,-0.460239,-0.040675,-0.389172,...,0.0,0.00,0.0,1,5,2,more than 2 years,6,10,3
2,1.205966,-0.820295,0.260041,0.282514,1.324543,1.304266,-0.897646,0.671183,1.109110,0.383356,...,0.0,0.00,0.0,4,1,3,more than 2 years,6,8,5
3,0.149299,-1.045238,0.882931,-0.172439,1.004971,0.572282,0.149080,0.606893,1.298057,-0.173340,...,0.0,0.00,0.0,1,2,1,more than 2 years,6,5,5
4,0.838390,-0.624167,-0.001048,0.621148,1.219351,1.190395,-0.657419,-0.120823,1.470320,0.367549,...,0.0,0.00,0.0,2,2,2,more than 2 years,6,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11237,0.431758,0.887711,-0.322128,-0.372409,1.138864,0.628092,0.167041,0.174815,0.932777,0.277480,...,0.0,0.00,0.0,25,10,10,10 months,3,42,4
11238,0.090965,-0.300278,0.422950,-0.572750,0.049459,-0.537889,0.469333,-0.961600,0.296112,0.995747,...,0.0,3.08,0.0,5,2,2,3 weeks,1,11,5
11239,-0.137958,-0.738065,0.862632,-0.567497,0.655615,0.090100,0.591886,1.233527,1.033565,0.100217,...,0.0,0.00,0.0,2,2,0,1 year,4,5,5
11240,0.375920,0.260230,0.174079,0.486693,0.451693,0.135253,-1.521774,0.598302,0.640672,-0.134451,...,0.0,0.00,0.0,18,11,6,7 months,3,38,5


In [67]:
amazon_processed_df_bert_large = merging_embeds_ngrams_pos(model1,amazon_df,amazon_df_liwc, stop_words, 100, (2,2))
amazon_processed_df_bert_large

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,-0.523634,-0.575519,0.122008,-0.523230,0.911352,0.407205,0.246260,0.535383,0.472816,0.409182,...,9.09,0.0,0.0,2,2,2,7 months,3,5,5
1,0.173083,-0.943269,0.281548,-0.276475,0.927217,0.630539,-0.132075,0.274716,0.845284,0.751249,...,0.00,0.0,0.0,0,1,0,1.5 years,5,4,5
2,-0.144786,0.504305,-0.119536,0.480758,0.653224,0.446673,-0.067394,0.755906,0.615982,-0.311062,...,3.70,0.0,0.0,2,6,2,10 months,3,12,5
3,0.639106,0.572812,0.213166,-0.350255,0.703247,0.100455,0.682191,0.071930,0.762128,0.076072,...,0.00,0.0,0.0,5,4,0,1 year,4,11,5
4,0.060956,-0.729287,0.052710,-1.142745,0.680443,0.529802,0.875053,0.395508,1.674072,-0.373728,...,9.09,0.0,0.0,1,1,2,1.5 years,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31641,-0.506462,-0.511425,0.564278,-0.881517,0.134313,0.337366,0.503670,1.434380,1.080589,0.106008,...,8.33,0.0,0.0,2,3,2,3 weeks,1,4,5
31642,-0.159817,-0.287369,0.307417,-1.021482,0.876355,0.465893,0.334390,0.627001,0.338422,0.639352,...,0.00,0.0,0.0,0,2,2,2 weeks,1,5,5
31643,0.132346,-0.446611,0.380831,-0.488010,0.874243,0.737227,-0.733379,0.978790,0.651822,0.483251,...,0.00,0.0,0.0,5,2,1,1 month,2,6,5
31644,-0.322745,0.032657,-0.112538,-0.721012,0.494801,0.550869,-0.054576,0.846498,1.121733,0.464933,...,0.00,0.0,0.0,2,4,1,7 months,3,7,5


In [68]:
google_processed_df_bert_large = merging_embeds_ngrams_pos(model1,google_df,google_df_liwc, stop_words, 100, (2,2))
google_processed_df_bert_large

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,0.920656,0.581586,0.235490,-0.168961,0.033644,-0.300076,0.621931,0.155857,0.839638,0.850075,...,0.00,0.0,0.0,7,6,1,more than 2 years,6,17,3
1,-0.338820,-0.162759,0.059124,-0.511081,0.361659,0.520024,0.061007,1.101731,1.387585,-0.272852,...,0.00,0.0,0.0,3,6,2,1.5 years,5,13,5
2,0.743301,0.480786,0.308656,0.640231,0.488483,-0.017956,0.579336,-0.272599,0.836121,0.650090,...,4.00,0.0,0.0,17,6,6,more than 2 years,6,34,4
3,-0.061119,-0.347670,-0.025990,-0.494387,1.346362,-0.148762,-0.354992,0.828807,-0.320098,0.382006,...,0.00,0.0,0.0,4,3,3,1.5 years,5,10,5
4,0.974787,-0.462810,0.807596,0.445203,1.290976,1.257783,-0.814080,0.247755,0.720234,0.424272,...,0.00,0.0,0.0,2,1,3,1 year,4,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10371,0.760643,-0.250060,-0.055046,-0.240622,0.498675,0.414672,-0.592885,0.317373,1.156722,0.993426,...,7.69,0.0,0.0,4,6,1,1 month,2,13,4
10372,0.506623,0.706060,0.596782,-0.974560,0.649936,0.146383,0.303182,1.306885,1.122338,0.376689,...,0.00,0.0,0.0,1,1,4,2 months,2,9,4
10373,0.224554,0.602828,0.193068,0.184279,1.550300,0.560154,0.028331,0.524627,1.647898,-0.121280,...,7.14,0.0,0.0,5,4,2,1 month,2,12,5
10374,0.114070,-0.119665,0.033353,-0.588963,0.988351,0.523315,0.526069,0.257438,1.291420,-0.633817,...,2.00,0.0,0.0,2,2,1,3 weeks,1,7,5


### BERT Large Mean

In [69]:
apple_processed_df_bert_large_mean = merging_embeds_ngrams_pos(model2,apple_df,apple_df_liwc, stop_words, 100, (2,2))
apple_processed_df_bert_large_mean

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,0.543742,0.495437,0.472420,0.547962,0.214573,0.064335,0.336535,-1.102422,1.119357,1.134630,...,0.0,0.00,0.0,19,14,7,more than 2 years,6,43,2
1,0.484995,0.738702,1.067772,0.409680,0.526732,-0.152106,0.902397,-0.645426,-0.266552,0.200861,...,0.0,0.00,0.0,1,5,2,more than 2 years,6,10,3
2,0.798869,-0.738452,0.850227,0.035102,0.057909,1.133019,-0.613953,0.335822,1.166332,0.609124,...,0.0,0.00,0.0,4,1,3,more than 2 years,6,8,5
3,0.255021,-1.014077,0.872595,-0.211067,-0.525143,0.693060,-0.142708,-0.302166,1.062633,-0.470972,...,0.0,0.00,0.0,1,2,1,more than 2 years,6,5,5
4,0.583098,-0.210019,0.654762,0.709561,-0.009000,0.769282,-0.674490,-0.609358,0.829383,0.646651,...,0.0,0.00,0.0,2,2,2,more than 2 years,6,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11237,0.031482,0.384180,0.332649,-0.360375,0.058106,0.407465,0.391639,0.126641,0.558432,0.782480,...,0.0,0.00,0.0,25,10,10,10 months,3,42,4
11238,-0.509443,-0.444991,0.271826,-0.536648,-0.014803,-0.309213,0.198848,-1.303937,0.570770,1.296766,...,0.0,3.08,0.0,5,2,2,3 weeks,1,11,5
11239,-0.262478,-0.443471,0.764215,-0.064093,-0.751062,-0.376116,-0.129093,0.355424,0.484182,0.160346,...,0.0,0.00,0.0,2,2,0,1 year,4,5,5
11240,-0.250179,-0.130496,0.163798,0.728203,0.371455,0.130736,-0.920657,0.536769,0.465548,0.282484,...,0.0,0.00,0.0,18,11,6,7 months,3,38,5


In [70]:
amazon_processed_df_bert_large_mean = merging_embeds_ngrams_pos(model2,amazon_df,amazon_df_liwc, stop_words, 100, (2,2))
amazon_processed_df_bert_large_mean

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,-0.387530,-0.343766,0.328157,-0.418332,-0.018955,-0.104976,0.370176,0.288072,0.521418,0.134975,...,9.09,0.0,0.0,2,2,2,7 months,3,5,5
1,-0.233446,0.176839,0.731137,-0.033977,0.359495,-0.209553,-0.359573,-0.352328,0.814820,0.772516,...,0.00,0.0,0.0,0,1,0,1.5 years,5,4,5
2,-0.590831,0.028837,-0.059291,-0.282376,-0.290941,0.551704,-0.745856,0.672127,-0.119256,-0.271754,...,3.70,0.0,0.0,2,6,2,10 months,3,12,5
3,0.134091,0.680461,0.506746,-0.026489,-0.209298,-0.155616,0.476643,0.130981,0.830065,-0.182159,...,0.00,0.0,0.0,5,4,0,1 year,4,11,5
4,0.199409,-0.149464,0.466718,-0.417564,-0.343135,0.189888,0.471262,0.005682,0.985675,-0.147950,...,9.09,0.0,0.0,1,1,2,1.5 years,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31641,-0.549887,-0.480782,0.822559,-0.504220,-0.976744,0.388145,0.398437,0.529197,0.332029,-0.473861,...,8.33,0.0,0.0,2,3,2,3 weeks,1,4,5
31642,-0.087678,-0.446432,0.533848,-0.723522,-0.287357,0.325775,0.587033,0.100868,0.077650,0.512428,...,0.00,0.0,0.0,0,2,2,2 weeks,1,5,5
31643,0.211648,-0.191998,0.408904,-0.741727,-0.174053,0.791228,-0.044261,0.730582,0.292432,0.354426,...,0.00,0.0,0.0,5,2,1,1 month,2,6,5
31644,-0.123762,-0.355554,0.049586,-0.268684,-0.423600,1.218652,-0.052844,0.347887,1.231456,0.177464,...,0.00,0.0,0.0,2,4,1,7 months,3,7,5


In [71]:
google_processed_df_bert_large_mean = merging_embeds_ngrams_pos(model2,google_df,amazon_df_liwc, stop_words, 100, (2,2))
google_processed_df_bert_large_mean

  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Apostro,Parenth,OtherP,NOUN_count,VERB_count,ADJ_count,OwnedFor,OwnedCategorized,review_len,Rating
0,0.672337,0.854248,0.773330,0.056404,-0.576963,-0.262524,0.436262,-0.529420,0.454456,1.043217,...,9.09,0.0,0.0,7,6,1,more than 2 years,6,17,3
1,-0.144254,-0.192437,0.225685,-0.360089,-0.246640,0.301679,0.048764,0.712613,1.171394,-0.475150,...,0.00,0.0,0.0,3,6,2,1.5 years,5,13,5
2,0.531002,-0.010234,0.930918,0.863019,0.845590,0.125862,0.625441,-0.829762,0.592936,0.895808,...,3.70,0.0,0.0,17,6,6,more than 2 years,6,34,4
3,-0.216513,-0.634806,0.379636,0.047578,0.484953,-0.093403,-0.586881,0.186238,-0.215680,0.526547,...,0.00,0.0,0.0,4,3,3,1.5 years,5,10,5
4,0.593953,-0.396204,1.024600,-0.234298,-0.509628,0.879982,-0.418010,-0.007302,0.586269,0.420421,...,9.09,0.0,0.0,2,1,3,1 year,4,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10371,0.152233,0.024385,0.470213,-0.081044,-0.064601,0.429439,-0.579068,-0.117074,0.633113,1.061356,...,0.00,0.0,0.0,4,6,1,1 month,2,13,4
10372,0.314987,0.967224,0.662877,-0.311762,0.165357,0.278207,0.093601,1.057282,0.895730,0.474023,...,0.00,0.0,0.0,1,1,4,2 months,2,9,4
10373,0.064644,0.593535,0.306331,0.880406,0.586682,0.754203,0.400304,0.149576,1.535935,-0.017747,...,0.00,0.0,0.0,5,4,2,1 month,2,12,5
10374,-0.042478,0.036142,0.446225,-0.648029,-0.387849,0.344451,0.686100,0.052986,0.980697,-0.216748,...,0.00,0.0,0.0,2,2,1,3 weeks,1,7,5


## Storing these Embedding as .csv to not load them again and again

In [74]:
apple_processed_BERT_mean.to_csv('apple_embeddings_df_BERT_mean.csv', index = False)

amazon_processed_BERT_mean.to_csv('amazon_embeddings_df_BERT_mean.csv', index = False)

google_processed_BERT_mean.to_csv('google_embeddings_df_BERT_mean.csv', index = False)


#######################################

apple_processed_df_bert_large_mean.to_csv('apple_embeddings_df_BERT_large_mean.csv', index = False)

amazon_processed_df_bert_large_mean.to_csv('amazon_embeddings_df_BERT_large_mean.csv', index = False)

google_processed_df_bert_large_mean.to_csv('google_embeddings_df_BERT_large_mean.csv', index = False)



#####################################

apple_processed_df_bert_large.to_csv('apple_embeddings_df_BERT_large.csv', index = False)

amazon_processed_df_bert_large.to_csv('amazon_embeddings_df_BERT_large.csv', index = False)

google_processed_df_bert_large.to_csv('google_embeddings_df_BERT_large.csv', index = False)

# Random Forrest

In [87]:
def RandomForrest_with_PCA_CV(df):
    X = df.drop(columns = ['OwnedFor','Rating'])
    y = df['Rating']
    
    X_std = StandardScaler().fit_transform(X)
#     output = np.mean(cross_val_score(X_std, y, cv = 5))
    
    pca = PCA().fit(X_std, y)
    print('currently here')
    for i in range(len(pca.explained_variance_ratio_.cumsum())):
        if (pca.explained_variance_ratio_.cumsum()[i] > 0.90):
    #         print(i)
            k = i
            break
    
    X_pca = PCA(n_components = k).fit_transform(X_std)
    
#     X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size = 0.3, random_state = 12)
    
    start = time.process_time()
#     rf = RandomForestClassifier(random_state = 42).fit(X_train, y_train)
    print('now started ')
    print(k)
    rf = RandomForestClassifier(random_state = 42)
    print('here')
    output = np.mean(cross_val_score(rf, X_pca, y, cv = 10))
    
#     pred = rf.predict(X_test)
    
#     acc = accuracy_score(y_test,pred)
#     f1 = f1_score(y_test,pred, average = 'macro')
#     prec = precision_score(y_test, pred,average = 'macro')
#     rec = recall_score(y_test, pred,average = 'macro')
    
#     features_dict = dict(zip(X.columns, rf.feature_importances_))
    
    end = time.process_time()
    
    time_taken = end - start
#     return acc,f1,prec,rec, time_taken,features_dict
    return output, time_taken

In [82]:
def RandomForrest_with_features(df):
    X = df.drop(columns = ['OwnedFor','Rating'])
    y = df['Rating']
    
    X_std = StandardScaler().fit_transform(X)
#     output = np.mean(cross_val_score(X_std, y, cv = 5))
    
    X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size = 0.3, random_state = 12)
    
    start = time.process_time()
    rf = RandomForestClassifier(random_state = 42).fit(X_train, y_train)
    
#     rf = RandomForestClassifier(random_state = 42)

#     output = np.mean(cross_val_score(rf, X_std, y, cv = 5))
    
    pred = rf.predict(X_test)
    
    acc = accuracy_score(y_test,pred)
    f1 = f1_score(y_test,pred, average = 'macro')
    prec = precision_score(y_test, pred,average = 'macro')
    rec = recall_score(y_test, pred,average = 'macro')
    
    features_dict = dict(zip(X.columns, rf.feature_importances_))
    
    end = time.process_time()
    
    time_taken = end - start
    return acc,f1,prec,rec, time_taken,features_dict
#     return output, time_taken

# BERT Mean Tokens

In [89]:
# # apple_acc_bert_mean,apple_f1_bert_mean,apple_prec_bert_mean,apple_rec_bert_mean, apple_time_bert_mean,features_dict_apple_mean = RandomForrest(apple_processed_BERT_mean)

# apple_cv_score_mean, apple_time_mean = RandomForrest_with_PCA_CV(apple_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(apple_cv_score_mean,4)*100))
# print('Time taken is ' + str(round(apple_time_mean,4)) + ' Seconds')

# print('*****************************************************/n')

# amazon_cv_score_mean, amazon_time_mean = RandomForrest_with_PCA_CV(amazon_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(amazon_cv_score_mean,4)*100))
# print('Time taken is ' + str(round(amazon_time_mean,4)) + ' Seconds')

# print('*****************************************************/n')

# google_cv_score_mean, google_time_mean = RandomForrest_with_PCA_CV(google_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(google_cv_score_mean,4)*100))
# print('Time taken is ' + str(round(google_time_mean,4)) + ' Seconds')

# print('*****************************************************/n')

In [90]:
apple_acc_bert_mean,apple_f1_bert_mean,apple_prec_bert_mean,apple_rec_bert_mean, apple_time_bert_mean,features_dict_apple_mean = RandomForrest_with_features(apple_processed_BERT_mean)

print('Accuracy is ' + str(round(apple_acc_bert_mean,4)*100))
print('F1 Score is ' + str(round(apple_f1_bert_mean,4)*100))
print('Precision is ' + str(round(apple_prec_bert_mean,4)*100))
print('Recall is ' + str(round(apple_rec_bert_mean,4)*100))
print('Time taken is ' + str(round(apple_time_bert_mean,4)) + ' Seconds')

print('*****************************************************/n')

amazon_acc_bert_mean,amazon_f1_bert_mean,amazon_prec_bert_mean,amazon_rec_bert_mean, amazon_time_bert_mean,features_dict_amazon_mean = RandomForrest_with_features(amazon_processed_BERT_mean)

print('Accuracy is ' + str(round(amazon_acc_bert_mean,4)*100))
print('F1 Score is ' + str(round(amazon_f1_bert_mean,4)*100))
print('Precision is ' + str(round(amazon_prec_bert_mean,4)*100))
print('Recall is ' + str(round(amazon_rec_bert_mean,4)*100))
print('Time taken is ' + str(round(amazon_time_bert_mean,4)) + ' Seconds')

print('*****************************************************/n')

google_acc_bert_mean,google_f1_bert_mean,google_prec_bert_mean,google_rec_bert_mean, google_time_bert_mean,features_dict_google_mean = RandomForrest_with_features(google_processed_BERT_mean)

print('Accuracy is ' + str(round(google_acc_bert_mean,4)*100))
print('F1 Score is ' + str(round(google_f1_bert_mean,4)*100))
print('Precision is ' + str(round(google_prec_bert_mean,4)*100))
print('Recall is ' + str(round(google_rec_bert_mean,4)*100))
print('Time taken is ' + str(round(google_time_bert_mean,4)) + ' Seconds')

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy is 81.03
F1 Score is 26.31
Precision is 46.75
Recall is 25.369999999999997
Time taken is 43.7958 Seconds
*****************************************************/n
Accuracy is 83.12
F1 Score is 19.57
Precision is 55.93
Recall is 20.68
Time taken is 171.2075 Seconds
*****************************************************/n
Accuracy is 79.41
F1 Score is 20.97
Precision is 49.08
Recall is 21.7
Time taken is 42.6332 Seconds


  _warn_prf(average, modifier, msg_start, len(result))


In [91]:
top_features_apple_mean = sorted(features_dict_apple_mean.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_apple_mean

[('164', 0.006655859938404499),
 ('216', 0.004843404515389105),
 ('296', 0.0046764371053631395),
 ('726', 0.004034249539700372),
 ('115', 0.004028637911025714),
 ('25', 0.004023378016462385),
 ('114', 0.003989766011096869),
 ('38', 0.003849327802701039),
 ('293', 0.00372966110405567),
 ('367', 0.0036389623059107632),
 ('244', 0.003337382856660342),
 ('713', 0.0032520364722990786),
 ('92', 0.0031708922816940815),
 ('372', 0.003036247803197854),
 ('698', 0.0030139682119988704),
 ('259', 0.0029053962226797438),
 ('298', 0.002897393249978358),
 ('342', 0.002866039471773699),
 ('253', 0.002840871323332145),
 ('643', 0.0028220966585298947),
 ('436', 0.0028142392395809597),
 ('74', 0.0027245678660506424),
 ('140', 0.0027051597609047064),
 ('407', 0.002661436342847428),
 ('378', 0.002638316912326967),
 ('620', 0.0026222956264448953),
 ('144', 0.002611097315101228),
 ('427', 0.00247785808377986),
 ('148', 0.002431669481217401),
 ('63', 0.002405902173619993),
 ('448', 0.0023636843766760753),
 ('

In [92]:
top_features_amazon_mean = sorted(features_dict_amazon_mean.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_amazon_mean

[('negate', 0.0053442822641999255),
 ('differ', 0.004708723858112266),
 ('293', 0.0037325661323156637),
 ('25', 0.00350152351968386),
 ('298', 0.0033681570492827034),
 ('164', 0.003275630699948743),
 ('77', 0.002984126245870934),
 ('296', 0.002908070656239921),
 ('362', 0.0026874697808022202),
 ('713', 0.0026705344193735424),
 ('38', 0.002661937276414778),
 ('596', 0.0026553379554884638),
 ('259', 0.0025460544238730854),
 ('cogproc', 0.002499690929330381),
 ('698', 0.0023852049083490172),
 ('bio', 0.0023147893909474027),
 ('216', 0.0020891312184546647),
 ('333', 0.002025310731381819),
 ('244', 0.0019627427055903253),
 ('427', 0.001933009278795068),
 ('tentat', 0.0019032553130806922),
 ('434', 0.0018988589076335744),
 ('440', 0.001890501127394744),
 ('63', 0.0018781539374138779),
 ('Clout', 0.0018775735388350328),
 ('114', 0.0018546022591322633),
 ('affiliation', 0.0018154577193334397),
 ('verb', 0.0017602316838526899),
 ('607', 0.0017295809757470705),
 ('304', 0.0017230625656478038),
 

In [93]:
top_features_google_mean = sorted(features_dict_google_mean.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_google_mean

[('25', 0.003665139939590966),
 ('293', 0.0035619352510940163),
 ('164', 0.0031333196833978677),
 ('698', 0.002882397257434495),
 ('113', 0.0028487384404385805),
 ('707', 0.002653208691706309),
 ('582', 0.0026393442938363854),
 ('253', 0.0025738355331458573),
 ('77', 0.0025396538080047514),
 ('427', 0.002482709384615533),
 ('713', 0.002452639634594493),
 ('618', 0.0024338029917716163),
 ('726', 0.0024202173346974157),
 ('216', 0.0024116855257090013),
 ('63', 0.002356867185812608),
 ('372', 0.0023204466872464745),
 ('259', 0.002313785753390429),
 ('296', 0.0022726096235186587),
 ('528', 0.002257884139104952),
 ('494', 0.0021726870968588317),
 ('730', 0.0021172790067178764),
 ('38', 0.0020809131784852147),
 ('596', 0.0020789476986556032),
 ('367', 0.00206138542297897),
 ('275', 0.0020597996376609663),
 ('620', 0.002049435246257355),
 ('342', 0.002047668989755273),
 ('114', 0.0020272156498460374),
 ('300', 0.0020143679981541093),
 ('56', 0.002014254761465132),
 ('376', 0.00199109592507570

##  BERT Large Mean Tokens

In [None]:
# # apple_acc_bert_mean,apple_f1_bert_mean,apple_prec_bert_mean,apple_rec_bert_mean, apple_time_bert_mean,features_dict_apple_mean = RandomForrest(apple_processed_BERT_mean)

# apple_cv_score_large_mean, apple_time_large_mean = RandomForrest_with_PCA_CV(apple_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(apple_cv_score_large_mean,4)*100))
# print('Time taken is ' + str(round(apple_time_large_mean,4)) + ' Seconds')

# print('*****************************************************/n')

# amazon_cv_score_large_mean, amazon_time_large_mean = RandomForrest_with_PCA_CV(amazon_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(amazon_cv_score_large_mean,4)*100))
# print('Time taken is ' + str(round(amazon_time_large_mean,4)) + ' Seconds')

# print('*****************************************************/n')

# google_cv_score_large_mean, google_time_large_mean = RandomForrest_with_PCA_CV(google_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(google_cv_score_large_mean,4)*100))
# print('Time taken is ' + str(round(google_time_large_mean,4)) + ' Seconds')

# print('*****************************************************/n')

In [94]:
apple_acc_bert_large_mean,apple_f1_bert_large_mean,apple_prec_bert_large_mean,apple_rec_bert_large_mean,apple_time_bert_large_mean, features_apple_large_mean = RandomForrest_with_features(apple_processed_df_bert_large_mean)

print('Accuracy is ' + str(round(apple_acc_bert_large_mean,4)*100))
print('F1 Score is ' + str(round(apple_f1_bert_large_mean,4)*100))
print('Precision is ' + str(round(apple_prec_bert_large_mean,4)*100))
print('Recall is ' + str(round(apple_rec_bert_large_mean,4)*100))
print('Time taken is ' + str(round(apple_time_bert_large_mean,4)) + ' Seconds')


print('*****************************************************/n')
amazon_acc_bert_large_mean,amazon_f1_bert_large_mean,amazon_prec_bert_large_mean,amazon_rec_bert_large_mean, amazon_time_bert_large_mean,features_amazon_large_mean = RandomForrest_with_features(amazon_processed_df_bert_large_mean)

print('Accuracy is ' + str(round(amazon_acc_bert_large_mean,4)*100))
print('F1 Score is ' + str(round(amazon_f1_bert_large_mean,4)*100))
print('Precision is ' + str(round(amazon_prec_bert_large_mean,4)*100))
print('Recall is ' + str(round(amazon_rec_bert_large_mean,4)*100))
print('Time taken is ' + str(round(amazon_time_bert_large_mean,4)) + ' Seconds')


print('*****************************************************/n')
google_acc_bert_large_mean,google_f1_bert_large_mean,google_prec_bert_large_mean,google_rec_bert_large_mean, google_time_bert_large_mean,features_google_large_mean = RandomForrest_with_features(google_processed_df_bert_large_mean)

print('Accuracy is ' + str(round(google_acc_bert_large_mean,4)*100))
print('F1 Score is ' + str(round(google_f1_bert_large_mean,4)*100))
print('Precision is ' + str(round(google_prec_bert_large_mean,4)*100))
print('Recall is ' + str(round(google_rec_bert_large_mean,4)*100))
print('Time taken is ' + str(round(google_time_bert_large_mean,4)) + ' Seconds')

Accuracy is 80.97
F1 Score is 27.33
Precision is 49.370000000000005
Recall is 26.229999999999997
Time taken is 49.8972 Seconds
*****************************************************/n
Accuracy is 83.17
F1 Score is 19.73
Precision is 61.17
Recall is 20.77
Time taken is 188.1179 Seconds
*****************************************************/n
Accuracy is 79.38
F1 Score is 22.74
Precision is 71.34
Recall is 22.61
Time taken is 46.1146 Seconds


In [95]:
top_features_apple_large_mean = sorted(features_apple_large_mean.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_apple_mean

[('164', 0.006655859938404499),
 ('216', 0.004843404515389105),
 ('296', 0.0046764371053631395),
 ('726', 0.004034249539700372),
 ('115', 0.004028637911025714),
 ('25', 0.004023378016462385),
 ('114', 0.003989766011096869),
 ('38', 0.003849327802701039),
 ('293', 0.00372966110405567),
 ('367', 0.0036389623059107632),
 ('244', 0.003337382856660342),
 ('713', 0.0032520364722990786),
 ('92', 0.0031708922816940815),
 ('372', 0.003036247803197854),
 ('698', 0.0030139682119988704),
 ('259', 0.0029053962226797438),
 ('298', 0.002897393249978358),
 ('342', 0.002866039471773699),
 ('253', 0.002840871323332145),
 ('643', 0.0028220966585298947),
 ('436', 0.0028142392395809597),
 ('74', 0.0027245678660506424),
 ('140', 0.0027051597609047064),
 ('407', 0.002661436342847428),
 ('378', 0.002638316912326967),
 ('620', 0.0026222956264448953),
 ('144', 0.002611097315101228),
 ('427', 0.00247785808377986),
 ('148', 0.002431669481217401),
 ('63', 0.002405902173619993),
 ('448', 0.0023636843766760753),
 ('

In [96]:
top_features_amazon_large_mean = sorted(features_amazon_large_mean.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_amazon_mean

[('negate', 0.0053442822641999255),
 ('differ', 0.004708723858112266),
 ('293', 0.0037325661323156637),
 ('25', 0.00350152351968386),
 ('298', 0.0033681570492827034),
 ('164', 0.003275630699948743),
 ('77', 0.002984126245870934),
 ('296', 0.002908070656239921),
 ('362', 0.0026874697808022202),
 ('713', 0.0026705344193735424),
 ('38', 0.002661937276414778),
 ('596', 0.0026553379554884638),
 ('259', 0.0025460544238730854),
 ('cogproc', 0.002499690929330381),
 ('698', 0.0023852049083490172),
 ('bio', 0.0023147893909474027),
 ('216', 0.0020891312184546647),
 ('333', 0.002025310731381819),
 ('244', 0.0019627427055903253),
 ('427', 0.001933009278795068),
 ('tentat', 0.0019032553130806922),
 ('434', 0.0018988589076335744),
 ('440', 0.001890501127394744),
 ('63', 0.0018781539374138779),
 ('Clout', 0.0018775735388350328),
 ('114', 0.0018546022591322633),
 ('affiliation', 0.0018154577193334397),
 ('verb', 0.0017602316838526899),
 ('607', 0.0017295809757470705),
 ('304', 0.0017230625656478038),
 

In [97]:
top_features_google_large_mean = sorted(features_google_large_mean.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_google_large_mean

[('333', 0.004057265407007137),
 ('779', 0.0033046673441905995),
 ('841', 0.003234627297532726),
 ('258', 0.0029796659972322315),
 ('669', 0.0025556080131940565),
 ('329', 0.0024503431075571),
 ('714', 0.0023933211653613742),
 ('606', 0.002329636759997121),
 ('131', 0.0022669601297186214),
 ('510', 0.0022576857423980733),
 ('241', 0.0022426260274349278),
 ('776', 0.002204372133256452),
 ('161', 0.0021995489798777973),
 ('496', 0.002179149861743875),
 ('179', 0.0019848087907335522),
 ('676', 0.0019504091693421308),
 ('793', 0.001939980582170239),
 ('861', 0.0019320017228440763),
 ('610', 0.001859946254536507),
 ('470', 0.001858916361436683),
 ('794', 0.0018349564456495355),
 ('270', 0.001773163350789475),
 ('125', 0.0017563587907950661),
 ('937', 0.0017557481222245303),
 ('240', 0.0017504679579134735),
 ('752', 0.0017420723814628681),
 ('865', 0.0017389265690863334),
 ('519', 0.001726238202029413),
 ('471', 0.0016528515699107391),
 ('771', 0.0016118079642940767),
 ('156', 0.001598577199

## BERT Large

In [None]:
# apple_cv_score_large, apple_time_large = RandomForrest_with_PCA_CV(apple_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(apple_cv_score_large,4)*100))
# print('Time taken is ' + str(round(apple_time_large,4)) + ' Seconds')

# print('*****************************************************/n')

# amazon_cv_score_large, amazon_time_large = RandomForrest_with_PCA_CV(amazon_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(amazon_cv_score_large,4)*100))
# print('Time taken is ' + str(round(amazon_time_large,4)) + ' Seconds')

# print('*****************************************************/n')

# google_cv_score_large, google_time_large = RandomForrest_with_PCA_CV(google_processed_BERT_mean)
# print('Cross Validation Score is  ' + str(round(google_cv_score_large,4)*100))
# print('Time taken is ' + str(round(google_time_large,4)) + ' Seconds')

# print('*****************************************************/n')

In [99]:
apple_acc_bert_large,apple_f1_bert_large,apple_prec_bert_large,apple_rec_bert_large, apple_time_bert_large,features_apple_large = RandomForrest_with_features(apple_processed_df_bert_large)

print('Accuracy is ' + str(round(apple_acc_bert_large,4)*100))
print('F1 Score is ' + str(round(apple_f1_bert_large,4)*100))
print('Precision is ' + str(round(apple_prec_bert_large,4)*100))
print('Recall is ' + str(round(apple_rec_bert_large,4)*100))
print('Time taken is ' + str(round(apple_time_bert_large,4)) + ' Seconds')


print('*****************************************************/n')

amazon_acc_bert_large,amazon_f1_bert_large,amazon_prec_bert_large,amazon_rec_bert_large, amazon_time_bert_large,features_amazon_large = RandomForrest_with_features(amazon_processed_df_bert_large)

print('Accuracy is ' + str(round(amazon_acc_bert_large,4)*100))
print('F1 Score is ' + str(round(amazon_f1_bert_large,4)*100))
print('Precision is ' + str(round(amazon_prec_bert_large,4)*100))
print('Recall is ' + str(round(amazon_rec_bert_large,4)*100))
print('Time taken is ' + str(round(amazon_time_bert_large,4)) + ' Seconds')


print('*****************************************************/n')

google_acc_bert_large,google_f1_bert_large,google_prec_bert_large,google_rec_bert_large, google_time_bert_large,features_google_large = RandomForrest_with_features(google_processed_df_bert_large)

print('Accuracy is ' + str(round(google_acc_bert_large,4)*100))
print('F1 Score is ' + str(round(google_f1_bert_large,4)*100))
print('Precision is ' + str(round(google_prec_bert_large,4)*100))
print('Recall is ' + str(round(google_rec_bert_large,4)*100))
print('Time taken is ' + str(round(google_time_bert_large,4)) + ' Seconds')

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy is 80.94
F1 Score is 26.36
Precision is 45.79
Recall is 25.69
Time taken is 49.3881 Seconds
*****************************************************/n
Accuracy is 83.13000000000001
F1 Score is 20.36
Precision is 66.64999999999999
Recall is 21.099999999999998
Time taken is 190.9297 Seconds
*****************************************************/n
Accuracy is 79.38
F1 Score is 22.97
Precision is 66.27
Recall is 22.759999999999998
Time taken is 45.5922 Seconds


In [100]:
top_features_apple_large = sorted(features_apple_large.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_apple_large

[('517', 0.004955228594413742),
 ('510', 0.004505836924417687),
 ('871', 0.0041223167901556015),
 ('7', 0.003770265197660906),
 ('828', 0.0036429098538945316),
 ('369', 0.0034381338977919652),
 ('749', 0.003401561421726106),
 ('448', 0.003134997803927302),
 ('340', 0.0029041985737662313),
 ('929', 0.0028900446791044203),
 ('668', 0.0028880734915540007),
 ('793', 0.002786147099550183),
 ('610', 0.002743283743280902),
 ('565', 0.002510070440225246),
 ('949', 0.002466290537975749),
 ('550', 0.002415799841425059),
 ('779', 0.0024106588781620157),
 ('669', 0.002390900835315595),
 ('198', 0.0023714815168222876),
 ('239', 0.002356472762097852),
 ('179', 0.0022904407614805066),
 ('649', 0.002258209818151832),
 ('991', 0.0022389247544529044),
 ('59', 0.0021637440552624123),
 ('911', 0.0021541419345338275),
 ('68', 0.002136658951271039),
 ('333', 0.0020943281167291466),
 ('163', 0.0020678450643822704),
 ('647', 0.0020588076125659634),
 ('841', 0.0020361427002062457),
 ('354', 0.00196117138688980

In [101]:
top_features_amazon_large = sorted(features_amazon_large.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_amazon_large

[('negate', 0.005223358180748212),
 ('differ', 0.00474445002441084),
 ('cogproc', 0.002538628396298205),
 ('644', 0.0025335396110180365),
 ('832', 0.002454120083691591),
 ('841', 0.0023190884036769633),
 ('852', 0.0022109081742938126),
 ('929', 0.0022103529994651626),
 ('871', 0.002082733889201047),
 ('267', 0.0019765483590792636),
 ('776', 0.0019719183517494985),
 ('861', 0.001822555438948009),
 ('949', 0.0018208602509766074),
 ('verb', 0.001795231040697074),
 ('510', 0.0017922133488334226),
 ('517', 0.0017867632845564938),
 ('Clout', 0.001778572960309962),
 ('720', 0.001740600020962703),
 ('258', 0.0017036413608232338),
 ('448', 0.0016820270390606419),
 ('210', 0.0016802013867205916),
 ('affiliation', 0.0016604304500209303),
 ('bio', 0.0016024394577741656),
 ('275', 0.0015969530953220722),
 ('8', 0.0015764214541432737),
 ('895', 0.0015760082893214705),
 ('369', 0.0015644508920755207),
 ('587', 0.0015530446516482954),
 ('669', 0.0015455829054781306),
 ('340', 0.0015220423150639957),
 

In [102]:
top_features_google_large = sorted(features_google_large.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_google_large

[('669', 0.003224202714675975),
 ('510', 0.0028659246489431654),
 ('832', 0.002839580698475774),
 ('448', 0.0028154856856362335),
 ('7', 0.002746376934251169),
 ('949', 0.002546771835232736),
 ('929', 0.0023220714845519365),
 ('340', 0.002158358492412847),
 ('644', 0.001975347083982662),
 ('841', 0.0019618190729588127),
 ('65', 0.0019264100604656279),
 ('241', 0.0019196964029105088),
 ('776', 0.001839832666721939),
 ('239', 0.0018396890784613566),
 ('22', 0.0018343809418170263),
 ('565', 0.0018332637045886163),
 ('771', 0.0018154367380137334),
 ('179', 0.0017955841755285021),
 ('517', 0.0017833015801701439),
 ('871', 0.001765431146693833),
 ('275', 0.0017635763152070425),
 ('730', 0.0017271672243834263),
 ('369', 0.001722515783384755),
 ('333', 0.001709364449166419),
 ('749', 0.0017001231977474137),
 ('217', 0.0016733520119691816),
 ('895', 0.0016314557677264178),
 ('606', 0.0016295138021404775),
 ('865', 0.0016034568594477435),
 ('331', 0.001579282815007862),
 ('610', 0.00157496837701

## TF-IDF Implementation

In [103]:
def tf_idf_df(df,df_liwc, stop_words):

    selected_tfidf_reviews,Llen = cleaning_reviews(df,stop_words)
    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = tfidf_vectorizer.fit_transform(selected_tfidf_reviews)

    tfidf_arr = tfidf_matrix.toarray()

    vocab = list(tfidf_vectorizer.get_feature_names_out())
    tf_idf_df = pd.DataFrame(tfidf_arr, columns=vocab)
    
    n_gram_df = top_ngrams_df(selected_tfidf_reviews, 100, (2,2))
    
    combined_df = pd.concat([tf_idf_df,n_gram_df], axis = 1)
    
    for col_liwc in df_liwc.columns[5:]:
        combined_df[col_liwc] = df_liwc[0:len(combined_df)][col_liwc] 
    
    pos_cols = ['VERB_count','NOUN_count','ADJ_count','OwnedFor','OwnedCategorized']

    for col in pos_cols:
        combined_df[col] = df[0:len(combined_df)][col]
    
    combined_df['review_len'] = Llen
    combined_df['Rating'] = df['Rating']
    cols_list = []

    for col in combined_df.columns:
        cols_list.append(str(col))

    combined_df.columns = cols_list
    
    return combined_df


In [105]:
apple_tfidf_ngram_df = tf_idf_df(apple_df, apple_df_liwc, stop_words)
amazon_tfidf_ngram_df = tf_idf_df(amazon_df,amazon_df_liwc, stop_words)
google_tfidf_ngram_df = tf_idf_df(google_df,google_df_liwc, stop_words)


  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']
  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']
  combined_df['review_len'] = Llen
  combined_df['Rating'] = df['Rating']


In [None]:
# apple_cv_score_tf, apple_time_tf = RandomForrest_with_PCA_CV(apple_tfidf_ngram_df)
# print('Cross Validation Score is  ' + str(round(apple_cv_score_tf,4)*100))
# print('Time taken is ' + str(round(apple_time_tf,4)) + ' Seconds')

# print('*****************************************************/n')

# amazon_cv_score_tf, amazon_time_tf = RandomForrest_with_PCA_CV(amazon_tfidf_ngram_df)
# print('Cross Validation Score is  ' + str(round(amazon_cv_score_tf,4)*100))
# print('Time taken is ' + str(round(amazon_time_tf,4)) + ' Seconds')

# print('*****************************************************/n')

# google_cv_score_tf, google_time_tf = RandomForrest_with_PCA_CV(google_tfidf_ngram_df)
# print('Cross Validation Score is  ' + str(round(google_cv_score_tf,4)*100))
# print('Time taken is ' + str(round(google_time_tf,4)) + ' Seconds')



In [195]:
apple_acc_tf,apple_f1_tf, apple_prec_tf, apple_rec_tf, apple_time_tf,features_apple_tfidf = RandomForrest_with_features(apple_tfidf_ngram_df)

print('Accuracy is ' + str(round(apple_acc_tf,4)*100))
print('F1 Score is ' + str(round(apple_f1_tf,4)*100))
print('Precision is ' + str(round(apple_prec_tf,4)*100))
print('Recall is ' + str(round(apple_rec_tf,4)*100))

print('Time taken is ' + str(round(apple_time_tf,4)) + ' Seconds')

print('*****************************************************/n')

amazon_acc_tf, amazon_f1_tf, amazon_prec_tf, amazon_rec_tf, amazon_time_tf,features_amazon_tfidf = RandomForrest_with_features(amazon_tfidf_ngram_df)

print('Accuracy is ' + str(round(amazon_acc_tf,4)*100))
print('F1 Score is ' + str(round(amazon_f1_tf,4)*100))
print('Precision is ' + str(round(amazon_prec_tf,4)*100))
print('Recall is ' + str(round(amazon_rec_tf,4)*100))

print('Time taken is ' + str(round(amazon_time_tf,4)) + ' Seconds')

print('*****************************************************/n')

google_acc_tf,google_f1_tf, google_prec_tf, google_rec_tf, google_time_tf, features_google_tfidf = RandomForrest_with_features(google_tfidf_ngram_df)


print('Accuracy is ' + str(round(google_acc_tf,4)*100))
print('F1 Score is ' + str(round(google_f1_tf,4)*100))
print('Precision is ' + str(round(google_prec_tf,4)*100))
print('Recall is ' + str(round(google_rec_tf,4)*100))

print('Time taken is ' + str(round(google_time_tf,4)) + ' Seconds')

Accuracy is 80.64
F1 Score is 20.91
Precision is 47.510000000000005
Recall is 21.560000000000002
Time taken is 19.2325 Seconds
*****************************************************/n
Accuracy is 83.13000000000001
F1 Score is 19.35
Precision is 60.75000000000001
Recall is 20.59
Time taken is 54.3962 Seconds
*****************************************************/n
Accuracy is 78.67
F1 Score is 18.029999999999998
Precision is 21.349999999999998
Recall is 20.169999999999998
Time taken is 18.8968 Seconds


In [196]:
top_features_apple_tf = sorted(features_apple_tfidf.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_apple_tf

[('review_len', 0.009034620277091617),
 ('VERB_count', 0.008557039371515984),
 ('NOUN_count', 0.007830610684558306),
 ('good', 0.007692847934132072),
 ('stars', 0.007277195294531953),
 ('WPS', 0.006579973716080085),
 ('Dic', 0.006343130667922992),
 ('affect', 0.006341342906622167),
 ('prep', 0.006090364755993284),
 ('WC', 0.005990933040532425),
 ('drives', 0.0059641901891720915),
 ('Sixltr', 0.0059496649050404275),
 ('verb', 0.0059463407550334815),
 ('great', 0.005942897108132193),
 ('function', 0.005871769333437533),
 ('sound', 0.005839342178245634),
 ('ADJ_count', 0.005833255527750975),
 ('adj', 0.005799824350830818),
 ('focuspresent', 0.005789244498546563),
 ('posemo', 0.005766227092838371),
 ('love', 0.005702614155440116),
 ('relativ', 0.005666232334215106),
 ('Analytic', 0.005651078933989155),
 ('Clout', 0.00561910544084035),
 ('cogproc', 0.005608654159867507),
 ('article', 0.005551839017796371),
 ('AllPunc', 0.005506056880397998),
 ('Authentic', 0.005486268275770692),
 ('conj', 0

In [197]:
top_features_amazon_tf = sorted(features_amazon_tfidf.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_amazon_tf

[('cogproc', 0.010878497252147555),
 ('verb', 0.010565832778072581),
 ('differ', 0.010553702008279103),
 ('Clout', 0.010138285205764817),
 ('negate', 0.009905120820430253),
 ('posemo', 0.009441017907794344),
 ('function', 0.009418623201705991),
 ('WPS', 0.009175190172861529),
 ('Analytic', 0.009031464976821594),
 ('affect', 0.008846008641352416),
 ('focuspresent', 0.008593289348130124),
 ('Sixltr', 0.008498421903425887),
 ('prep', 0.008449786974241293),
 ('Dic', 0.008423320088504136),
 ('AllPunc', 0.008381342269939819),
 ('drives', 0.008347767605125402),
 ('ppron', 0.008330217778514426),
 ('Period', 0.008175092679698525),
 ('pronoun', 0.00817177550724962),
 ('auxverb', 0.008136362162853307),
 ('ipron', 0.008106284532937576),
 ('adj', 0.008059752143489548),
 ('social', 0.007825787891316984),
 ('Authentic', 0.007802077751636775),
 ('conj', 0.00774038448874634),
 ('WC', 0.00771640700245023),
 ('relativ', 0.007670985364226602),
 ('article', 0.007501043189573227),
 ('review_len', 0.00744486

In [198]:
top_features_google_tf = sorted(features_google_tfidf.items(), key = lambda x: x[1], reverse = True)[0:100]
top_features_google_tf

[('sometimes', 0.0073096011036799845),
 ('verb', 0.007252641784253095),
 ('cogproc', 0.007210012551269113),
 ('prep', 0.0071457538603509115),
 ('WPS', 0.007121546407823655),
 ('posemo', 0.006952639544224939),
 ('review_len', 0.006930648576489614),
 ('function', 0.0068100911668355765),
 ('Analytic', 0.006805072244314985),
 ('Dic', 0.0067744045438283),
 ('pronoun', 0.006772319239921384),
 ('Clout', 0.006703012118690131),
 ('VERB_count', 0.006699090404138392),
 ('drives', 0.006645643642884344),
 ('Sixltr', 0.006601663895273946),
 ('AllPunc', 0.006551776120299729),
 ('focuspresent', 0.006398265847073929),
 ('adj', 0.006274308104221735),
 ('social', 0.006232976841200604),
 ('conj', 0.006216836513688644),
 ('WC', 0.006211429562766524),
 ('affect', 0.006201848633821324),
 ('Authentic', 0.0061958716988061826),
 ('NOUN_count', 0.006103798416758455),
 ('auxverb', 0.006042771308652453),
 ('relativ', 0.0060369010475304),
 ('ppron', 0.005975646271151488),
 ('Period', 0.005935915080422335),
 ('love'

## Topic Modeling

In [5]:

apple_df = pd.read_csv('apple_25Nov.csv')
amazon_df = pd.read_csv('amazon_25Nov.csv')
google_df = pd.read_csv('google_25Nov.csv')

In [6]:
from bertopic import BERTopic

In [7]:
## Extracting data for each category in Owned For

app_gby = apple_df.groupby(['OwnedCategorized'])

for gby, subdf in app_gby:
    print (gby)

app_0 = app_gby.get_group(0).reset_index(drop = True)
app_1 = app_gby.get_group(1).reset_index(drop = True)
app_2 = app_gby.get_group(2).reset_index(drop = True)
app_3 = app_gby.get_group(3).reset_index(drop = True)
app_4 = app_gby.get_group(4).reset_index(drop = True)
app_5 = app_gby.get_group(5).reset_index(drop = True)
app_6 = app_gby.get_group(6).reset_index(drop = True)



amz_gby = amazon_df.groupby(['OwnedCategorized'])

amz_0 = amz_gby.get_group(0).reset_index(drop = True)
amz_1 = amz_gby.get_group(1).reset_index(drop = True)
amz_2 = amz_gby.get_group(2).reset_index(drop = True)
amz_3 = amz_gby.get_group(3).reset_index(drop = True)
amz_4 = amz_gby.get_group(4).reset_index(drop = True)
amz_5 = amz_gby.get_group(5).reset_index(drop = True)
amz_6 = amz_gby.get_group(6).reset_index(drop = True)


goog_gby = google_df.groupby(['OwnedCategorized'])

goog_0 = goog_gby.get_group(0).reset_index(drop = True)
goog_1 = goog_gby.get_group(1).reset_index(drop = True)
goog_2 = goog_gby.get_group(2).reset_index(drop = True)
goog_3 = goog_gby.get_group(3).reset_index(drop = True)
goog_4 = goog_gby.get_group(4).reset_index(drop = True)
goog_5 = goog_gby.get_group(5).reset_index(drop = True)
goog_6 = goog_gby.get_group(6).reset_index(drop = True)


0
1
2
3
4
5
6


In [8]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
def get_topics(df_categorized, stop_words):
    docs,_ = cleaning_reviews(df_categorized, stop_words)
    embeddings = sentence_model.encode(docs, show_progress_bar=True)

    # Create topic model
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(docs, embeddings)
    
    return topic_model 
    

In [10]:
app0_topic_model = get_topics(app_0, stop_words)
app1_topic_model = get_topics(app_1, stop_words)
app2_topic_model = get_topics(app_2, stop_words)
app3_topic_model = get_topics(app_3, stop_words)
app4_topic_model = get_topics(app_4, stop_words)
app5_topic_model = get_topics(app_5, stop_words)
app6_topic_model = get_topics(app_6, stop_words)

#-----------------------------------------------#
amz0_topic_model = get_topics(amz_0, stop_words)
amz1_topic_model = get_topics(amz_1, stop_words)
amz2_topic_model = get_topics(amz_2, stop_words)
amz3_topic_model = get_topics(amz_3, stop_words)
amz4_topic_model = get_topics(amz_4, stop_words)
amz5_topic_model = get_topics(amz_5, stop_words)
amz6_topic_model = get_topics(amz_6, stop_words)

#-----------------------------------------------#

goog0_topic_model = get_topics(goog_0, stop_words)
goog1_topic_model = get_topics(goog_1, stop_words)
goog2_topic_model = get_topics(goog_2, stop_words)
goog3_topic_model = get_topics(goog_3, stop_words)
goog4_topic_model = get_topics(goog_4, stop_words)
goog5_topic_model = get_topics(goog_5, stop_words)
goog6_topic_model = get_topics(goog_6, stop_words)

NameError: name 'stop_words' is not defined

## Apple Topics

In [None]:
app0_topic_model.generate_topic_labels(nr_words=10)
app1_topic_model.generate_topic_labels(nr_words=10)
app2_topic_model.generate_topic_labels(nr_words=10)
app3_topic_model.generate_topic_labels(nr_words=10)
app4_topic_model.generate_topic_labels(nr_words=10)
app5_topic_model.generate_topic_labels(nr_words=10)
app6_topic_model.generate_topic_labels(nr_words=10)

In [None]:
app0_topic_model.visualize_barchart(top_n_topics=4)

In [None]:
app1_topic_model.visualize_barchart(top_n_topics=4)

In [None]:
app2_topic_model.visualize_barchart(top_n_topics=4)

In [None]:
app3_topic_model.visualize_barchart(top_n_topics=4)

In [None]:
app4_topic_model.visualize_barchart(top_n_topics=4)

In [None]:
app5_topic_model.visualize_barchart(top_n_topics=4)

In [None]:
app6_topic_model.visualize_barchart(top_n_topics=4)

## Amazon Topics

In [None]:
amz0_topic_model.generate_topic_labels(nr_words=10)
amz1_topic_model.generate_topic_labels(nr_words=10)
amz2_topic_model.generate_topic_labels(nr_words=10)
amz3_topic_model.generate_topic_labels(nr_words=10)
amz4_topic_model.generate_topic_labels(nr_words=10)
amz5_topic_model.generate_topic_labels(nr_words=10)
amz6_topic_model.generate_topic_labels(nr_words=10)

In [None]:
amz0_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
amz1_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
amz2_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
amz3_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
amz4_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
amz5_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
amz6_topic_model.visualize_barchart(top_n_topics=4)

## Google Topics

In [None]:
goog0_topic_model.generate_topic_labels(nr_words=10)
goog1_topic_model.generate_topic_labels(nr_words=10)
goog2_topic_model.generate_topic_labels(nr_words=10)
goog3_topic_model.generate_topic_labels(nr_words=10)
goog4_topic_model.generate_topic_labels(nr_words=10)
goog5_topic_model.generate_topic_labels(nr_words=10)
goog6_topic_model.generate_topic_labels(nr_words=10)

In [None]:
goog0_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
goog1_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
goog2_topic_model.visualize_barchart(top_n_topics=4)


In [None]:
goog3_topic_model.visualize_barchart(top_n_topics=4)


In [11]:
goog4_topic_model.visualize_barchart(top_n_topics=4)


NameError: name 'goog4_topic_model' is not defined

In [193]:
goog5_topic_model.visualize_barchart(top_n_topics=4)


In [194]:
goog6_topic_model.visualize_barchart(top_n_topics=4)

## Top 10 Bigram and Trigrams for each df

In [None]:
def top_ngrams_df(selected_reviews, n_top, n_gram_range):

In [160]:
apple_cleaned,_ = cleaning_reviews(apple_df,stop_words)

app_bigrams = top_ngrams_df(apple_cleaned, 10, (2,2)).columns
app_trigrams = top_ngrams_df(apple_cleaned, 10, (3,3)).columns

In [165]:
app_bigrams

Index(['easy set', 'easy use', 'good sound', 'great sound', 'small speaker',
       'smart speaker', 'sound great', 'sound quality', 'sounds great',
       'works great'],
      dtype='object')

In [166]:
app_trigrams

Index(['good sound quality', 'great little speaker', 'great sound easy',
       'great sound quality', 'great sound small', 'love sound quality',
       'sound quality amazing', 'sound quality good', 'sound quality great',
       'sound small speaker'],
      dtype='object')

In [167]:
amazon_cleaned,_ = cleaning_reviews(amazon_df,stop_words)

amz_bigrams = top_ngrams_df(amazon_cleaned, 10, (2,2)).columns
amz_trigrams = top_ngrams_df(amazon_cleaned, 10, (3,3)).columns

In [171]:
amz_bigrams

Index(['easy set', 'easy use', 'every room', 'great product', 'great sound',
       'listen music', 'play music', 'sound quality', 'works great',
       'works well'],
      dtype='object')

In [172]:
amz_trigrams

Index(['easy set use', 'easy use set', 'every room house', 'free tv purchase',
       'good sound quality', 'great sound quality', 'love easy use',
       'one every room', 'product easy use', 'sound quality great'],
      dtype='object')

In [168]:
google_cleaned,_ = cleaning_reviews(google_df,stop_words)

goog_bigrams = top_ngrams_df(google_cleaned, 10, (2,2)).columns
goog_trigrams = top_ngrams_df(google_cleaned, 10, (3,3)).columns

In [169]:
goog_bigrams

Index(['easy set', 'easy use', 'great product', 'listen music', 'nest mini',
       'play music', 'smart speaker', 'sound quality', 'works great',
       'works well'],
      dtype='object')

In [170]:
goog_trigrams

Index(['easy set easy', 'easy set use', 'easy setup use', 'every room house',
       'good sound quality', 'great sound quality', 'nest mini 2nd',
       'one every room', 'set easy use', 'sound quality good'],
      dtype='object')

In [203]:
set(apple_df['OwnedFor'])

{'1 month',
 '1 week',
 '1 year',
 '1.5 years',
 '10 months',
 '11 months',
 '2 months',
 '2 weeks',
 '3 months',
 '3 weeks',
 '4 months',
 '5 months',
 '6 months',
 '7 months',
 '8 months',
 '9 months',
 'less than 1 week',
 'more than 2 years'}

In [204]:
set(apple_df['OwnedCategorized'])

{0, 1, 2, 3, 4, 5, 6}

In [210]:
apple_df.loc[apple_df['OwnedCategorized'] == 3]['OwnedFor']

19        8 months
23        9 months
24       11 months
25       11 months
28       11 months
           ...    
11216     9 months
11218     6 months
11229     9 months
11237    10 months
11240     7 months
Name: OwnedFor, Length: 980, dtype: object

In [211]:
from sklearn.linear_model import LogisticRegression
def Logistic_with_features(df):
    X = df.drop(columns = ['OwnedFor','Rating'])
    y = df['Rating']
    
#     X_std = StandardScaler().fit_transform(X)
#     output = np.mean(cross_val_score(X_std, y, cv = 5))
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 12)
    
    start = time.process_time()
    lr = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(X_train, y_train)
    
#     rf = RandomForestClassifier(random_state = 42)

#     output = np.mean(cross_val_score(rf, X_std, y, cv = 5))
    
    pred = lr.predict(X_test)
    
    acc = accuracy_score(y_test,pred)
    f1 = f1_score(y_test,pred, average = 'macro')
    prec = precision_score(y_test, pred,average = 'macro')
    rec = recall_score(y_test, pred,average = 'macro')
    
#     features_dict = dict(zip(X.columns, rf.feature_importances_))
    
    end = time.process_time()
    
    time_taken = end - start
    return acc,f1,prec,rec, time_taken
#     return output, time_taken

In [215]:
# apple_acc,apple_f1, apple_prec, apple_rec, apple_time = Logistic_with_features(amazon_processed_df_bert_large)

In [216]:
# apple_acc

0.8301032230882662

In [249]:
temp_app_0 = apple_processed_df_bert_large.loc[apple_processed_df_bert_large['OwnedCategorized'] == 6]

In [250]:
temp_app_0 = temp_app_0.reset_index(drop = True)

In [251]:
apple_acc,apple_f1, apple_prec, apple_rec, apple_time, ft = RandomForrest_with_features(temp_app_0)

In [228]:
apple_acc ## 0

0.8232558139534883

In [232]:
apple_acc ## 1

0.7976291278577476

In [236]:
apple_acc ## 2

0.7925824175824175

In [240]:
apple_acc ## 3

0.7687074829931972

In [244]:
apple_acc ## 4

0.7819548872180451

In [248]:
apple_acc ## 5

0.7936507936507936

In [252]:
apple_acc ## 6

0.6060606060606061

In [253]:
apple_df_liwc.columns[5:]

Index(['WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
       'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
       'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
       'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
       'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend',
       'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat',
       'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
       'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve',
       'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture',
       'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home',
       'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent',
       'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC',
       'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP'],
      dtype='o

In [254]:
from sklearn.metrics import ndcg_score