In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences



In [2]:
import jieba
import re
import os

In [3]:
def seg_words_write(contents):
    contents_segs = []
    stopwords = getStopwords()
    f = open('contents.txt','wb')
    for content in contents:
        c = re.sub(' ', '', content)
        segs = list(jieba.lcut(c)) #Sentence segment
        sentence_segment = []
        for word in segs:
            if word not in stopwords:
                sentence_segment.append(word) 
        contents_segs.append(" ".join(sentence_segment))
        f.write(" ".join(sentence_segment).encode('utf-8')) #Write sentence segment as contents.txt
    f.close
    return contents_segs

In [4]:
def seg_words(contents):
    contents_segs = []
    stopwords = getStopwords()
    for content in contents:
        c = re.sub(' ', '', content)
        segs = list(jieba.lcut(c))
        sentence_segment = []
        for word in segs:
            if word not in stopwords:
                sentence_segment.append(word)
        contents_segs.append(" ".join(sentence_segment)) #Get sentence segment without writing
    return contents_segs

In [5]:
def getStopwords(): #Get stop words list
    stopwords = []
    with open("chineseStopWords.txt", "r") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    stopwords.append('"')
    stopwords.append('\n')
    stopwords.append('～')
    return stopwords

In [6]:
def sentence_vectorize(size,contents,model):
    contents_vector = np.zeros((len(contents),size))
    for i in range(len(contents)):
        n = 0
        segs = contents[i].split()
        vector_sum = np.zeros(size)
        for j in range(len(segs)):
            try:
                vector_sum = vector_sum + model[segs[j]] #Sum word vector
                n = n + 1
            except KeyError:
                pass
        contents_vector[i] = vector_sum/n 
    return contents_vector

In [7]:
class Word_to_Vec():
    def __init__(self,embedder = Word2Vec(),size = 0,contents_segs = []):
        self.embedder = embedder
        self.size = size
    
    def fit(self, X, seg_contents_exist = False, size = 256, min_count=5, iter=10): 
        #size is ncol of word vector, min_count represent the minimum appearence of a word be count in the model
        self.size = 256
        if(seg_contents_exist == False):
            if os.path.exists("content.txt"):
                os.remove("content.txt")
            else:
                pass
            contents_segs = seg_words_write(X)
        else:
            contents_segs = seg_words(X)
        self.embedder = Word2Vec(PathLineSentences("contents.txt"),size=size, window=10, min_count=min_count, iter=iter)
        return self
    
    def transform(self, X):
        contents_segs = seg_words(X)
        return sentence_vectorize(self.size,contents_segs,self.embedder)
    
    def fit_transform(self, X, seg_contents_exist = False, size = 256, min_count=5, iter=10):
        self.size = 256
        if(seg_contents_exist == False):
            if os.path.exists("content.txt"):
                os.remove("content.txt")
            else:
                pass
            contents_segs = seg_words_write(X)
        else:
            contents_segs = seg_words(X)
        # I still need to do word2vec by reading data from a file directly. Don't know how to handle it
        self.embedder = Word2Vec(PathLineSentences("contents.txt"),size=size, window=10, min_count=min_count, iter=iter) 
        return sentence_vectorize(self.size,contents_segs,self.embedder)

In [8]:
data = pd.read_csv("sentiment_analysis_trainingset.csv", encoding="utf-8") 
text = data['content'] #Training data

In [9]:
valdf = pd.read_csv("sentiment_analysis_validationset.csv", encoding="utf-8") 
text_val = valdf['content'] #Validation data

In [16]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import RidgeClassifier

In [11]:
transformer = Word_to_Vec()

In [12]:
X = transformer.fit_transform(text)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ZHANKE~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.876 seconds.
Prefix dict has been built succesfully.


In [13]:
X_val = transformer.transform(text_val)

In [14]:
transformer.embedder.most_similar('中饭') #Get most similar words

[('午饭', 0.8135820627212524),
 ('晚饭', 0.7785147428512573),
 ('吃晚饭', 0.6976807117462158),
 ('就近', 0.6894402503967285),
 ('办事', 0.6685695052146912),
 ('肚子饿', 0.658237874507904),
 ('早饭', 0.6402450799942017),
 ('觅食', 0.6394287347793579),
 ('顺便来', 0.6358612179756165),
 ('顺道', 0.629756510257721)]

Word2vec + RidgeClassifier

In [17]:
for i in range(20):
    j = i + 2
    y = list(data[data.columns[j]])
    y_val = list(valdf[data.columns[j]])
    clf = RidgeClassifier().fit(X, y)
    clf.fit(X, y)
    print(data.columns[j]) #col name
    print(clf.score(X_val,y_val)) # score
    print(confusion_matrix(y_val,clf.predict(X_val))) # confusion matrix

location_traffic_convenience
0.874933333333
[[11584     0     0   173]
 [  133     0     0    49]
 [  100     0     0    36]
 [ 1385     0     0  1540]]
location_distance_from_business_district
0.8296
[[11781     0     0   251]
 [   88     0     0     2]
 [   70     0     0    10]
 [ 2135     0     0   663]]
location_easy_to_find
0.821933333333
[[11257     6     0   253]
 [  414    23     0   115]
 [  251     4     0    74]
 [ 1553     1     0  1049]]
service_wait_time
0.887866666667
[[13216     7    10     4]
 [  380    60    20     5]
 [  549    11    31     8]
 [  674     8     6    11]]
service_waiters_attitude
0.695666666667
[[5455   26    4  507]
 [ 361  529   14  305]
 [ 868  136   17  809]
 [1494   38    3 4434]]
service_parking_convenience
0.940066666667
[[14042     0     0     4]
 [  168     0     0    20]
 [  186     0     0    18]
 [  503     0     0    59]]
service_serving_speed
0.8674
[[12633    13     0    31]
 [  629    81     0    95]
 [  311     5     0    52]
 [  847

Word2vec + LinearSVC

In [18]:
for i in range(20):
    j = i + 2
    y = list(data[data.columns[j]])
    y_val = list(valdf[data.columns[j]])
    clf =  LinearSVC(random_state=0,max_iter=700)
    clf.fit(X, y)
    print(data.columns[j])
    print(clf.score(X_val,y_val))
    print(confusion_matrix(y_val,clf.predict(X_val)))



location_traffic_convenience
0.882066666667
[[11445     0     0   312]
 [  124     0     0    58]
 [   92     0     0    44]
 [ 1139     0     0  1786]]




location_distance_from_business_district
0.831533333333
[[11714     0     0   318]
 [   87     0     0     3]
 [   68     0     0    12]
 [ 2039     0     0   759]]




location_easy_to_find
0.829733333333
[[11192    21     0   303]
 [  382    58     0   112]
 [  239    14     0    76]
 [ 1402     5     0  1196]]




service_wait_time
0.8902
[[13178    22    15    22]
 [  343   106    12     4]
 [  512    42    28    17]
 [  634    17     7    41]]




service_waiters_attitude
0.721333333333
[[5356   58   10  568]
 [ 211  671   18  309]
 [ 634  240   44  912]
 [1112   90   18 4749]]




service_parking_convenience
0.947866666667
[[14028     0     0    18]
 [  111     1     0    76]
 [  151     0     0    53]
 [  373     0     0   189]]




service_serving_speed
0.8756
[[12573    33     0    71]
 [  556   162     0    87]
 [  288    20     0    60]
 [  727    24     0   399]]




price_level
0.625
[[6999   62  297  139]
 [ 614  475  550  121]
 [1729  261 1229  296]
 [ 952   95  509  672]]




price_cost_effective
0.7982
[[11090     1     0   337]
 [  369     1     1    74]
 [  297     2     0    99]
 [ 1847     0     0   882]]




price_discount
0.747533333333
[[8817    0  191  254]
 [ 138    0   86   42]
 [1372    0  860  399]
 [ 956    0  349 1536]]




environment_decoration
0.739133333333
[[6975    0    4  822]
 [ 150    0    2  123]
 [ 578    0    5  729]
 [1505    0    0 4107]]




environment_noise
0.749066666667
[[9886    1    0  634]
 [ 354    7    0  120]
 [ 488    1    0  176]
 [1990    0    0 1343]]




environment_space
0.675333333333
[[8783   20   14  674]
 [ 579   38    9  148]
 [ 990   12   17  291]
 [2113   12    8 1292]]




environment_cleaness
0.7408
[[8885   14    0  697]
 [ 369   66    0  190]
 [ 397    4    0  226]
 [1985    6    0 2161]]




dish_portion
0.654133333333
[[7474   57    1  580]
 [ 854  247    2  339]
 [ 739   62    2  602]
 [1904   47    1 2089]]




dish_taste
0.680533333333
[[ 138   30  150  438]
 [  17  115  381   67]
 [  39   27 3510 2244]
 [  33    9 1357 6445]]




dish_look
0.750066666667
[[10387     1     0   370]
 [  405     3     0    47]
 [  572     0     0    89]
 [ 2265     0     0   861]]




dish_recommendation
0.8198
[[11872     0     0   211]
 [  290     0     0    45]
 [  201     0     0    86]
 [ 1870     0     0   425]]




others_overall_experience
0.7608
[[   0   12   33  240]
 [   0  811  262  210]
 [   0  256  953 2148]
 [   0   50  377 9648]]
others_willing_to_consume_again
0.718733333333
[[8533   18    0  803]
 [ 502   38    0   37]
 [ 335    6    0   54]
 [2461    3    0 2210]]


