In [17]:
import pandas as pd
import polars as pl
import numpy as np

In [18]:
train_df = pl.read_csv("/Users/jmadu1/Documents/DS_Projects/yelp_review_polarity_csv/train.csv")
train_df

1,"Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff. It seems that his staff simply never answers the phone. It usually takes 2 hours of repeated calling to get an answer. Who has time for that or wants to deal with it? I have run into this problem with many other doctors and I just don't get it. You have office workers, you have patients with medical needs, why isn't anyone answering the phone? It's incomprehensible and not work the aggravation. It's with regret that I feel that I have to give Dr. Goldberg 2 stars."
i64,str
2,"""Been going to …"
1,"""I don't know w…"
1,"""I'm writing th…"
2,"""All the food i…"
1,"""Wing sauce is …"
1,"""Owning a drivi…"
1,"""This place is …"
2,"""Before I final…"
2,"""I drove by yes…"
1,"""After waiting …"


In [23]:
#train_df = train_df.set_axis(['target', 'reviews'], axis=1)
#train_df

In [24]:
train_df = train_df.rename(mapping = {"1": "target", "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.":"reviews"})
train_df

SchemaFieldNotFoundError: 1

# DL context

## load data

In [25]:
sentences = train_df['reviews']
y = train_df['target']

In [26]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from sklearn.model_selection import train_test_split

def load_data(percentage_of_sentences=None):

    train_sentences, test_sentences, y_train, y_test = train_test_split(sentences, y, test_size=0.3)

    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        assert(percentage_of_sentences> 0 and percentage_of_sentences<=100)

        len_train = int(percentage_of_sentences/100*len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]

        len_test = int(percentage_of_sentences/100*len(test_sentences))
        test_sentences, y_test = test_sentences[:len_test], y_test[:len_test]

    X_train = [text_to_word_sequence(_) for _ in train_sentences]
    X_test = [text_to_word_sequence(_) for _ in test_sentences]

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)

## convert loaded data into components to feed into RNN

In [34]:
!pip freeze | grep gensim

gensim==4.2.0


In [38]:
from gensim.models import Word2Vec

word2vec = Word2Vec(sentences=X_train, vector_size=60, min_count=10, window=10)

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])

    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []

    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)

    return embed

# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)


# Pad the training and test embedded sentences
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=200)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=200)

## check embeddings

In [31]:
for X in [X_train_pad, X_test_pad]:
    assert type(X) == np.ndarray
    assert X.shape[-1] == word2vec.wv.vector_size


assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)

## baseline model

In [32]:
from sklearn.metrics import accuracy_score

unique, counts = np.unique(y_train, return_counts=True)
counts = dict(zip(unique, counts))
print('Number of labels in train set', counts)

y_pred = 0 if counts[0] > counts[1] else 1

print('Baseline accuracy: ', accuracy_score(y_test, [y_pred]*len(y_test)))


Number of labels in train set {1: 19586, 2: 19613}


KeyError: 0

In [4]:
#train_df = train_df.drop(columns=["target"])

# ML context

***from here i have three options:***



1. cut my dataset in half because the full size of it is too much for the function to process
2. modify the function to take the dataset in, in chunks
3. do this all from a cloud platform so that running commands doesn't make my laptop pack up

In [4]:
train_df_1 = train_df.iloc[1:186666]
train_df_2 = train_df.iloc[186666:]

train_df_1

Unnamed: 0,target,reviews
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,2,All the food is great here. But the best thing...
4,1,Wing sauce is like water. Pretty much a lot of...
5,1,Owning a driving range inside the city limits ...
...,...,...
186661,1,Happy hour in the bar is a steal if you can fa...
186662,2,Truly the most beautiful restaurant I have din...
186663,2,My ablsolute favorite restaurant in Las Vegas....
186664,2,"Nice interior with warm read colors, high ceil..."


In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode

In [7]:
def clean(text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    
    unaccented_string = unidecode.unidecode(lowercased) # remove accents
    
    tokenized = word_tokenize(unaccented_string) # Tokenize
    
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    
    return " ".join(words_only)

train_df_1['clean_text'] = train_df_1['reviews'].apply(clean)

train_df_1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_1['clean_text'] = train_df_1['reviews'].apply(clean)


Unnamed: 0,target,reviews,clean_text
1,1,I don't know what Dr. Goldberg was like before...,i don t know what dr goldberg was like before ...
2,1,I'm writing this review to give you a heads up...,i m writing this review to give you a heads up...
3,2,All the food is great here. But the best thing...,all the food is great here but the best thing ...
4,1,Wing sauce is like water. Pretty much a lot of...,wing sauce is like water pretty much a lot of ...
5,1,Owning a driving range inside the city limits ...,owning a driving range inside the city limits ...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range = (1,1), 
                             min_df=0.01, 
                             max_df = 0.05).fit(train_df_1.clean_text)

ngram_range is the lower and upper boundary of the range of n-values for different n-grams to be extracted. min_df and max_df are the thresholds as percentages of a vocabulary's document frequency to ignore. we are fitting the vectorizer model to the cleaned version of the training dataset.

In [9]:
vectors = pd.DataFrame(vectorizer.transform(train_df_1.clean_text).toarray(),
                       columns = vectorizer.get_feature_names_out())
vectors.head()

Unnamed: 0,able,above,absolutely,across,actual,add,added,afternoon,ago,ahead,...,wynn,yeah,year,yelp,yes,yet,young,yourself,yum,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.155048,0.0,0.0,0.326018,0.0,0.0,0.0,0.0


In [10]:
sum_tfidf = vectors.sum(axis = 0)
sum_tfidf

able          1077.261046
above          593.254408
absolutely    1171.546406
across         827.982614
actual         374.840962
                 ...     
yet           1032.427903
young          480.115928
yourself       723.044510
yum            530.978946
yummy          844.501830
Length: 855, dtype: float64

In [11]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
train_df_1["target_encoded"] =  le.fit_transform(train_df_1.target)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_1["target_encoded"] =  le.fit_transform(train_df_1.target)


In [12]:
train_df_1

Unnamed: 0,target,reviews,clean_text,target_encoded
1,1,I don't know what Dr. Goldberg was like before...,i don t know what dr goldberg was like before ...,0
2,1,I'm writing this review to give you a heads up...,i m writing this review to give you a heads up...,0
3,2,All the food is great here. But the best thing...,all the food is great here but the best thing ...,1
4,1,Wing sauce is like water. Pretty much a lot of...,wing sauce is like water pretty much a lot of ...,0
5,1,Owning a driving range inside the city limits ...,owning a driving range inside the city limits ...,0
...,...,...,...,...
186661,1,Happy hour in the bar is a steal if you can fa...,happy hour in the bar is a steal if you can fa...,0
186662,2,Truly the most beautiful restaurant I have din...,truly the most beautiful restaurant i have din...,1
186663,2,My ablsolute favorite restaurant in Las Vegas....,my ablsolute favorite restaurant in las vegas ...,1
186664,2,"Nice interior with warm read colors, high ceil...",nice interior with warm read colors high ceili...,1


In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

naivebayes = MultinomialNB()

X_bow = vectorizer.fit_transform(train_df_1.clean_text)

cv_nb = cross_validate(
    naivebayes,
    X_bow,
    train_df_1.target_encoded,
    scoring = "accuracy"
)

round(cv_nb['test_score'].mean(),2)

0.81

In [47]:
#import pickle

# Export Pipeline as pickle file
#with open("pipeline.pkl", "wb") as file:
    #pickle.dump(pipeline, file)

# Load Pipeline from pickle file
#my_pipeline = pickle.load(open("pipeline.pkl","rb"))

In [28]:
#tfidf_list = [(word, sum_tfidf[word]) 
              #for word, idx in vectorizer.vocabulary_.items() 
              #if word in vectorizer.vocabulary_.keys() ]
#tfidf_list

In [29]:
#sorted_tfidf_list =sorted(tfidf_list, key = lambda x: x[1], reverse=True)
#sorted_tfidf_list