# 讀取評論資料和欄位處理

In [1]:
# read json file

import json
import pandas as pd

with open('./Employee Review about their organization.json') as file:
    contents = file.read()
    contents = eval(contents)
    
df_list = []
for i in range(len(contents)):
    row_list = []
    row = contents[i]
    row_list.append(row["ReviewTitle"])
    row_list.append(row["CompleteReview"])
    row_list.append(row["URL"])
    row_list.append(row["Rating"])
    row_list.append(row["ReviewDetails"])
    df_list.append(row_list)
    
df = pd.DataFrame(data = df_list, columns = ['ReviewTitle', 'CompleteReview', 'URL', 'Rating', 'ReviewDetails'])

In [2]:
# 欄位處理

df['Company'] = df.URL.str.split('/')[:].str[4]
df['isCurrentEmployee'] = df.ReviewDetails.apply(lambda x: 0 if 'Former Employee' in x else 1)
rating = df.pop('Rating')
df['Rating'] = rating
df.drop(columns = ['URL', 'ReviewDetails'], inplace = True)
df.head()

Unnamed: 0,ReviewTitle,CompleteReview,Company,isCurrentEmployee,Rating
0,Productive,"Good company, cool workplace, work load little...",Reliance-Industries-Ltd,1,3.0
1,Stressful,1. Need to work on boss's whims and fancies 2....,Reliance-Industries-Ltd,0,3.0
2,Good Company for Every employee,"Good company for every Engineers dream, Full M...",Reliance-Industries-Ltd,0,5.0
3,Productive,I am just pass out bsc in chemistry Typical da...,Reliance-Industries-Ltd,1,5.0
4,Non productive,Not so fun at work just blame games Target pe...,Reliance-Industries-Ltd,0,1.0


In [3]:
# 有 14 多萬筆評論
len(df)

145209

In [4]:
df['Rating'] = df['Rating'].astype(float)
df['Rating'].mean()

4.053660585776364

In [5]:
mean = df['Rating'].mean()
df['Rating'].iloc[df['Rating'] < mean] = 0
df['Rating'].iloc[df['Rating'] >= mean] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [6]:
df['Rating']

0         0.0
1         0.0
2         1.0
3         1.0
4         0.0
         ... 
145204    0.0
145205    0.0
145206    0.0
145207    0.0
145208    1.0
Name: Rating, Length: 145209, dtype: float64

In [7]:
df_copy = df.copy()

In [8]:
from sklearn.model_selection import train_test_split
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

#先抽 10 %
X_other, X_use, y_other, y_use = train_test_split(X, y, test_size = 0.1, random_state = 8, stratify = y)

In [9]:
len(X_use)

14521

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_use, y_use, test_size = 0.1, random_state = 8, stratify = y_use)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state = 8, stratify = y)

In [11]:
X_train['id'] = X_train.index
X_train

Unnamed: 0,ReviewTitle,CompleteReview,Company,isCurrentEmployee,id
56902,na,boring work life\r learnt nothing\r management...,Cognizant-Technology-Solutions,1,56902
142877,Good place for learning,"good exposure to mnc environment,. better unde...",IBM,0,142877
732,Productive and Fun Environment,Each individual has target and if they complet...,Maersk,0,732
51585,Good place for work,Work culture is very good. Job security was go...,Cognizant-Technology-Solutions,1,51585
22881,Have Assigend task ofr vessel recorn.,I have to prepare the load list for the vessel...,Maersk,1,22881
...,...,...,...,...,...
105019,great place to work at,spent 2.5 years in accenture and loved everyda...,Accenture,0,105019
4199,Productive and fun place,§ Designed and developed an ETL Framework for ...,UnitedHealth-Group,0,4199
54943,Was Good Company but recently many changes,Was Good Company but recently many changes hap...,Cognizant-Technology-Solutions,1,54943
108121,Exposure to business consulting,Consulting as well as solutions - biggest stre...,Accenture,0,108121


# 文本前處理

In [12]:
#沿用 PA1 的內容
def syllables(word):
    #當字尾是e時音節數會減一
    if word.endswith("e"):
        word.replace("e", "")
    
    #當字裡包含a e i o u y時，會增加音節
    word = word.replace("a", ",")
    word = word.replace("e", ",")
    word = word.replace("i", ",")
    word = word.replace("o", ",")
    word = word.replace("u", ",")
    word = word.replace("y", ",")
    
    count = word.count(",")
    return count

In [13]:
#沿用 PA1 的內容
def tokenization(text):
    punctuation = [",", "'", ".", "?", "!", "(", ")", ":", "`", "``", "%", "{", "}", ";", "*", "#", "+", "$", 
                   "\\", "/", "_", "=", "^", "&", "<", ">", "@", "|", "[", "]"]
    for i in punctuation:
        text = text.replace(i, " ")
    
    #新增移除數字
    numbers = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "0"]
    for j in numbers:
        text = text.replace(j, " ")
    
    text = text.replace("\n", " ")
    text = text.replace("\t", " ") 
    text = text.replace("\r", " ") 
    text = text.replace('"', "")
    
    #檢查 hyphens
    if "-" in text:
        t = text.count("-")
        
        #如果那個字的 hyphen 數小於2時，就會進入音節數的判斷
        if t < 2:
            count_syllables_word = text.split("-")
            s = syllables(count_syllables_word[0]) #計算第一個字音節
            
            #如果那個字的音節數小於2時，就會把字合併
            if(s < 2): 
                text = text.replace("-", "") #ex. co-worker --> coworker  
            else:
                text = text.replace("-", " ") #ex. semi-supervised --> semi supervised
        else:
            text = text.replace("-", " ") #ex. state-of-art --> state of art

    
    text = text.split(" ")
    text = list(filter(None, text)) #remove null
    
    return text

In [14]:
from nltk.stem.porter import *
import nltk
from nltk.corpus import stopwords
stemmer = PorterStemmer()
stopword_list = set(stopwords.words('english'))

docID = []
t_index = []
term = []
df = []
tf = []

for i in range(len(X_train)):
    token_list = []    

    token_list.extend(tokenization(X_train['CompleteReview'].iloc[i].encode('utf-8', 'ignore').decode('ascii', 'ignore')))
    
    #lower case
    for j in range (len(token_list)):
        if(isinstance(token_list[j], str)): #only string has to lower case 
            token_list[j] = token_list[j].lower()  
    
    #stemming
    new_token_list = [stemmer.stem(token) for token in token_list]    
    
    #stopword removal
    filtered_tokens = [token for token in new_token_list if token not in stopword_list] 
    
    #remove strange token
    filtered_tokens = [token for token in filtered_tokens if len(token) < 50]
    
    #紀錄 doc_i 中的 docID, term, tf
    for t in filtered_tokens: 
        docID.append(X_train['id'].iloc[i])
        term.append(t)
        tf.append(filtered_tokens.count(t)) #tf 的平均是用原始檔案中的字數去除

In [15]:
#step_one: 在這個步驟中將 term 跟 docID 取出
step_one = []
for i in range(len(term)):
    tmp = [term[i], str(docID[i])] #[apple, 1]
    step_one.append(tmp)

In [16]:
#step_two: 這個步驟中將 term 按照字母順序排列，並去除重複的字及記錄 df

import numpy as np
#sort alphabetically
step_two = sorted(step_one, key=lambda l:l[0])

step_two = np.array(step_two) #list to numpy

step_two = np.unique(step_two, axis=0) #去除同一個 docID 中的相同字
step_two = step_two.T
t, df = np.unique(step_two[0], return_counts=True) #計算不同檔案中的相同字數量
print(t)
print(df)

['aa' 'aaa' 'abandon' ... 'zone' 'zoveili' '~']
[ 3  1  1 ... 18  1  1]


In [17]:
my_dic ={}
my_dic["term"] = t
my_dic["df"] = df
my_dic["t_index"] = [i for i in range (1, len(t)+1)] #給定 term index (從1開始)

In [18]:
#把字典的字和對應到的文件 id 取出，方便之後找字對應到的 class
dict_ids = {}
c = 0
for t in step_two[0]:
    if t in term:
        if t not in dict_ids:
            dict_ids[t] = list()
        dict_ids[t].append(step_two[1,c])      
        
    c+=1

In [19]:
#寫入 dictionary.txt 中
dic = open("dictionary.txt", "w")
dic.write("t_index term df")
for i in range (len(my_dic["term"])):
    dic.write("\n" + str(my_dic["t_index"][i]).encode('utf-8','ignore').decode("utf-8") + " " + str(my_dic["term"][i]).encode('utf-8','ignore').decode("utf-8") + " " + str(my_dic["df"][i]).encode('utf-8','ignore').decode("utf-8"))
    
dic.close()

In [20]:
#將 docID, term, tf, t_index, df 放到同一個 dataframe，以方便計算 tf-idf
#dataframe for tf
import pandas as pd
tf_dataframe = pd.DataFrame(list(zip(docID, term, tf)), columns =['docID', 'term', 'tf'])

In [21]:
tf_dataframe

Unnamed: 0,docID,term,tf
0,56902,bore,1
1,56902,work,1
2,56902,life,1
3,56902,learnt,1
4,56902,noth,1
...,...,...,...
292444,97627,age,1
292445,97627,veri,1
292446,97627,difficult,1
292447,97627,get,1


In [22]:
#dataframe for dictionary
dict_df = pd.read_csv('dictionary.txt', delimiter = " ", keep_default_na=False)

In [23]:
#join tf_dataframe with dataframes
df3 = tf_dataframe.join(dict_df.set_index('term'), on='term')

In [24]:
df3

Unnamed: 0,docID,term,tf,t_index,df
0,56902,bore,1,935,19
1,56902,work,1,9135,9456
2,56902,life,1,4818,1959
3,56902,learnt,1,4757,686
4,56902,noth,1,5613,151
...,...,...,...,...,...
292444,97627,age,1,181,23
292445,97627,veri,1,8821,3264
292446,97627,difficult,1,2186,172
292447,97627,get,1,3447,1253


In [25]:
#先計算 tf-idf
N = len(X_train)
tfidf = df3['tf'] * np.log10(N / df3['df'])
df3 = df3.assign(tf_idf = tfidf)

In [26]:
#將 tf-idf 轉為 unit vector
new_df3 = df3.copy()
for i in (X_train['id']):
    #完整 vector space 大小
    matrix = df3.shape[0]   
    vector_space = np.zeros(matrix)
    rows = df3.loc[df3['docID'] == i]
    
    #取出對應的 docID 的 vector 後算距離
    vector_space[rows['t_index']] = rows['tf_idf']
    new_tfidf = rows.iloc[:, 5] / np.linalg.norm(vector_space)
    
    #改成 unit vector
    new_df3.loc[new_df3['docID'] == i, 'tf_idf'] = new_tfidf

In [27]:
new_df3

Unnamed: 0,docID,term,tf,t_index,df,tf_idf
0,56902,bore,1,935,19,0.304721
1,56902,work,1,9135,9456,0.015089
2,56902,life,1,4818,1959,0.088510
3,56902,learnt,1,4757,686,0.137450
4,56902,noth,1,5613,151,0.208044
...,...,...,...,...,...,...
292444,97627,age,1,181,23,0.438715
292445,97627,veri,1,8821,3264,0.095956
292446,97627,difficult,1,2186,172,0.299542
292447,97627,get,1,3447,1253,0.162181


# Normalized TF-IDF DataFrame 整理

In [28]:
# 字典的字
terms = sorted(set(new_df3['term']))
len(terms)

9289

In [29]:
# 評論 ID
doc_IDs = sorted(set(new_df3['docID']))
len(doc_IDs)

13065

In [30]:
# 轉成 normalized TF-IDF matrix

normalized_TFIDF_matrix = pd.DataFrame(columns = terms, index = doc_IDs)

for i in doc_IDs:
    
    terms_of_docID = new_df3[new_df3['docID'] == i].term.values
    
    for t in terms:
        if t not in terms_of_docID:
            normalized_TFIDF_matrix.at[i, t] = 0.
        else:
            normalized_TFIDF_matrix.at[i, t] = new_df3[(new_df3['docID'] == i) & (new_df3['term'] == t)].tf_idf.values[0]

normalized_TFIDF_matrix

KeyboardInterrupt: 

# 降維：SVD

In [None]:
# SVD (10 個 topic)

from sklearn.decomposition import TruncatedSVD

model_SVD = TruncatedSVD(n_components = 10)
vectors_SVD = model_SVD.fit_transform(normalized_TFIDF_matrix)

In [None]:
vectors_SVD

In [None]:
df_copy

In [None]:
y = [df_copy[i,'Rating'] for i in doc_IDs]
X_train_vectors, X_val_vectors, y_train, y_val = train_test_split(vectors_SVD, y, test_size = 0.1, random_state = 8)

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_vectors, y_train_vectors)
y_pred = rf.predict(X_val_vectors)

In [None]:
print(classification_report(y_val_vectors, y_pred))

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train_vectors, y_train_vectors)
y_pred = xgb.predict(X_val_vectors)

In [None]:
print(classification_report(y_val_vectors, y_pred))

In [None]:
from sklearn.svm import SVC

clf = SVC(kernel = 'linear')
clf.fit(X_train_vectors, y_train_vectors)
y_pred = clf.predict(X_val_vectors)

In [None]:
print(classification_report(y_val_vectors, y_pred))

# TF 向量處理

In [None]:
# 轉成 TF 向量

TF_matrix = pd.DataFrame(columns = terms, index = doc_IDs)

for i in doc_IDs:
    
    terms_of_docID = new_df3[new_df3['docID'] == i].term.values
    
    for t in terms:
        if t not in terms_of_docID:
            TF_matrix.at[i, t] = 0
        else:
            TF_matrix.at[i, t] = new_df3[(new_df3['docID'] == i) & (new_df3['term'] == t)].tf.values[0]

# 降維：LDA

In [None]:
# LDA (10 個 topic)

from sklearn.decomposition import LatentDirichletAllocation as LDA

model_LDA = LDA(n_components = 10)
vectors_LDA = model_LDA.fit_transform(TF_matrix)

In [None]:
vectors_LDA

In [None]:
y = [df_copy[i,'Rating'] for i in doc_IDs]
X_train_vectors, X_val_vectors, y_train, y_val = train_test_split(vectors_LDA, y, test_size = 0.1, random_state = 8)

In [None]:
X_train_vectors, X_val_vectors, y_train, y_val = train_test_split(vectors_LDA, y_use, test_size = 0.1, random_state = 8)

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_vectors, y_train_vectors)
y_pred = rf.predict(X_val_vectors)

In [None]:
print(classification_report(y_val_vectors, y_pred))

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train_vectors, y_train_vectors)
y_pred = xgb.predict(X_val_vectors)

In [None]:
print(classification_report(y_val_vectors, y_pred))

In [None]:
from sklearn.svm import SVC

clf = SVC(kernel = 'linear')
clf.fit(X_train_vectors, y_train_vectors)
y_pred = clf.predict(X_val_vectors)

In [None]:
print(classification_report(y_val_vectors, y_pred))