In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
pd.options.display.max_rows = 20

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cols = ['productId','userId','profileName','helpfulness','score','time','summary','text']
#df = pd.DataFrame(columns = cols)
content = []
encoding = 'latin-1'
with open("/content/drive/MyDrive/Colab Notebooks/finefoods_training.txt","rb")as foods: 
    lines = foods.readlines()
    #print(lines[1])
    for i in lines: 
        temp = i.decode(encoding)
        content.append(temp)
#content

In [4]:
content_grouped = []
temp = []
for i in content: 
    if i!= '\n': 
        temp.append(i)
    else:
        content_grouped.append(temp)
        # restore temp
        temp = []
#content_grouped

In [5]:
for i in content_grouped: 
    if len(i)!= 8: 
        print("here is the failure")
        print(len(i))
        print(i)

In [6]:
# since there is some sample do not contain len(i) = 8, I need to delete that row for further data frame transform
for i in content_grouped: 
    if len(i)!= 8: 
        content_grouped.remove(i)


In [7]:
# re-arrange the data as the data frame format
cleaned_data = []
for i in content_grouped:
    temp = []
    for j in i: 
        in_list = j.split(":")
        temp.append(in_list[1][1:-1])
    cleaned_data.append(temp)
#cleaned_data


In [8]:
df = pd.DataFrame(cleaned_data,columns = cols)
#df

In [9]:
df.dropna(inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   productId    2000 non-null   object
 1   userId       2000 non-null   object
 2   profileName  2000 non-null   object
 3   helpfulness  2000 non-null   object
 4   score        2000 non-null   object
 5   time         2000 non-null   object
 6   summary      2000 non-null   object
 7   text         2000 non-null   object
dtypes: object(8)
memory usage: 140.6+ KB


In [11]:
print(len(df["productId"].unique()))

362


Therefore, produce Id is not unique

In [12]:
print(len(df["userId"].unique()))

1947


Therefore, userID is not unique

In [13]:
df["score_int"] = df["score"].astype(float).astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   productId    2000 non-null   object
 1   userId       2000 non-null   object
 2   profileName  2000 non-null   object
 3   helpfulness  2000 non-null   object
 4   score        2000 non-null   object
 5   time         2000 non-null   object
 6   summary      2000 non-null   object
 7   text         2000 non-null   object
 8   score_int    2000 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 156.2+ KB


what I need to do: 

1. give label (score >= 4 is 1 and, score<4 is 0 )

2. data cleaning: 

    1. clean duplicate
    
    2. spacy clean stop and Text Normalization using spacy

3. Bag of Word

In [14]:
df["label"] = 0 # set to negative
for index,row in df.iterrows(): 
    if row["score_int"] >= 4: 
        df.at[index,"label"] = 1 # if score is equal or larger than 4, replace the value with 1 (true)
#df
    

In [15]:
df["label"].value_counts()

1    1557
0     443
Name: label, dtype: int64

In [16]:
sorted_data=df.sort_values('productId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
#sorted_data

In [17]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
#sorted_data_new["token_text"] = ""
sorted_data.insert(1, "token_text", "")
nlp_s = en_core_web_sm.load()
nlp = English()
#filtered_sentence =[]
for index,row in sorted_data.iterrows(): 
    lemma_word1 = []
    doc_s = nlp_s(row["text"])
    for token in doc_s:
      lemma_word1.append(token.lemma_)
    temp_sentence = ' '.join(lemma_word1)
    doc = nlp(temp_sentence)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    filtered_sentence =[] 
    for word in token_list:
        tex = nlp.vocab[word]
        if tex.is_stop == False:
            filtered_sentence.append(word) 
    sorted_data.at[index, "token_text"] = filtered_sentence
    
#sorted_data    



In [18]:
data_pos = sorted_data[sorted_data["label"] == 1].sample(n = 400)
data_neg = sorted_data[sorted_data["label"] == 0].sample(n = 400)
data_set = pd.concat([data_pos, data_neg])


In [19]:
X = data_set["token_text"]
X_str = []
for i in X:
  X_str.append(' '.join(i)) 
Y = data_set["label"]
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer = CountVectorizer()
vect = vectorizer.fit(X_str)
X_vectorized = vect.transform(X_str)
#X_vectorized.toarray()

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_vectorized, Y, test_size = 0.3, random_state = 42)

In [21]:
model = GaussianNB()
model.fit(x_train.toarray(), y_train)

GaussianNB()

In [22]:
predictions = model.predict(x_test.toarray())
#print(predictions)
print('AUC score: ', roc_auc_score(y_test, predictions ,average=None))

AUC score:  0.5744347826086956
