In [67]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os
from tqdm import tqdm, tnrange, tqdm_notebook

DF = pd.DataFrame
arr = np.array

import re
import string
import operator

# 读取数据集

In [68]:
def loadDataSet(filePath):
    """Read a tab-separated review file into token lists and labels.

    Every line is split on tabs. Field 0 is the label, which is upper-cased
    and stripped. Field 2 is the review text: it is lower-cased, every
    "<sssss>" marker is replaced by a space, the result is stripped and then
    split on single spaces (so consecutive spaces produce empty-string tokens).

    Returns:
        (articles, labels): two parallel lists; ``articles[i]`` is the token
        list of line ``i`` and ``labels[i]`` is its label string.
    """
    with open(filePath, 'r', encoding="utf-8") as handle:
        rows = [raw_line.split("\t") for raw_line in handle.readlines()]

    articles = []
    labels = []
    for fields in rows:
        raw_label = fields[0]
        raw_text = fields[2]
        # Labels are normalised to upper case, review text to lower case.
        cleaned_text = raw_text.lower().replace("<sssss>", " ").strip()
        labels.append(raw_label.upper().strip())
        articles.append(cleaned_text.split(" "))
    return articles, labels
        

# Load both splits. The paths are built with os.path.join so the notebook also
# runs outside Windows; the previous ".\\data\\..." literals only resolve there.
train_articles, train_labels = loadDataSet(os.path.join("data", "MulLabelTrain.ss"))
test_articles, test_labels = loadDataSet(os.path.join("data", "MulLabelTest.ss"))

len(train_articles), len(test_articles)

(62522, 8671)

In [69]:
# Show the first tokenized review followed by its label.
print(train_articles[0], "\n\n", train_labels[0])

['we', 'went', 'on', 'a', 'sunday', 'around', '11am', 'we', 'got', 'seated', 'right', 'away', '.', '', '', 'the', 'menu', 'is', 'trying', 'to', 'be', 'like', 'a', 'new-american', 'style', 'type', 'of', 'menu', '.', '', '', 'i', 'ordered', 'chicken', 'and', 'waffles', 'and', 'the', 'bo', 'ordered', 'a', 'breakfast', 'burrito', '.', '', '', 'our', 'meals', 'took', 'around', '25', 'minutes', 'to', 'come', 'out', 'and', 'my', 'boyfriend', 'even', 'commented', 'saying', 'how', 'all', 'the', 'people', 'sitting', 'next', 'to', 'us', 'no', 'one', 'had', 'a', 'plate', 'in', 'front', 'of', 'them', '.', '', '', 'sunday', 'must', 'have', 'been', 'a', 'new', 'employee', 'training', 'day', 'because', 'there', 'was', 'so', 'many', 'kids', 'wandering', 'around', 'and', 'around', 'the', 'restaurant', 'with', 'no', 'real', 'task', 'at', 'hand', '.', '', '', 'the', 'coffee', '...', 'the', 'coffee', 'was', 'good', '.', '', '', 'then', ',', 'finally', 'our', 'food', 'came', 'out', '.', '', '', 'my', 'chick

# 去除停用词

In [70]:
# Stop-word list: one entry per line of the file, lower-cased and stripped.
# A blank line in the file therefore becomes the empty-string entry.
# os.path.join keeps the path valid on non-Windows systems as well.
with open(os.path.join("data", "stopwords-en.txt"), "r", encoding='utf-8') as f:
    stopwords = [line.lower().strip() for line in f]

len(stopwords)
print(stopwords[0:100])

922

['', "''", '``', '$', '?', '...', '●', '-', '–', '*', '.', ',', ':', '0', '1', '2', ')', '(', '/', '3', '4', '5', '6', ';', '7', '8', '9', '’', '“', "'d", "'ll", "'m", "'re", "'s", "'t", "'ve", 'zt', 'zz', 'a', 'our', 'yours', "a's", 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act', 'actually', 'added', 'adj', 'adopted', 'affected', 'affecting', 'affects', 'after', 'afterwards', 'again', 'against', 'ah', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'announce', 'another', 'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'apparently', 'appear', 'appreciate', 'appropriate', 'approximately', 'are', 'area', 'areas', 'aren']


In [71]:
# Keep references to the unfiltered token lists. These are aliases, not copies.
# NOTE(review): re-running this cell after train_articles / test_articles have been
# reassigned to filtered lists would overwrite the unfiltered references.
train_articles_backup = train_articles
test_articles_backup = test_articles

In [72]:
def remove_stopwords(articles, stop_words=None):
    """Drop stop words and tokens of length <= 1 from every article.

    Args:
        articles: iterable of token lists (one list per article).
        stop_words: optional iterable of tokens to remove. When omitted, the
            notebook-level ``stopwords`` list is used.

    Returns:
        A new list with one filtered token list per input article, in the
        same order. The input lists are not modified.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build the set once: membership is checked for every token of every
    # article, and scanning a list each time is needlessly slow.
    blocked = set(stop_words)
    return [
        [token for token in article if token not in blocked and len(token) > 1]
        for article in articles
    ]

# Filter from the *_backup references rather than from train_articles / test_articles,
# so the input of this cell is not the variable it reassigns.
train_articles = remove_stopwords(train_articles_backup)
test_articles = remove_stopwords(test_articles_backup)

In [73]:
# First review of each split after stop-word filtering.
print(train_articles[0])
print(test_articles[0])

['sunday', '11am', 'seated', 'menu', 'new-american', 'style', 'type', 'menu', 'chicken', 'waffles', 'bo', 'breakfast', 'burrito', 'meals', '25', 'minutes', 'boyfriend', 'commented', 'people', 'sitting', 'plate', 'front', 'sunday', 'employee', 'training', 'day', 'kids', 'wandering', 'restaurant', 'real', 'task', 'hand', 'coffee', 'coffee', 'finally', 'food', 'chicken', 'baked', 'chicken', 'breast', 'breaded', 'waffle', 'half', 'waffle', 'pieces', 'toasted', 'cajun', 'style', '-lrb-', 'brick', '-rrb-', 'seasoned', 'pepper', 'scrambled', 'eggs', 'mixed', 'syrup', 'boy', 'burrito', 'green', 'pita', 'wrap', 'tortilla', 'carne', 'mixed', 'eggs', 'avocado', 'carne', 'meat', 'lot', 'fat', 'breakfast', '30', 'wont']
['love', 'mojo', 'evil', 'treat', 'flavors', 'yummy', 'minor', 'downfall', 'mix-ins', 'hard', 'reach', 'girl', 'literally', 'head', 'bar', 'grossed', 'idea', 'aka', 'human', 'debris', 'hair', 'yogurt']


# 去除高频词和低频词得到所有词列表

In [75]:
# Flatten the per-review token lists of each split into one long token list
# (duplicates are kept, so the lengths are total token counts).
all_word_train_mix = [token for review in train_articles for token in review]
all_word_test_mix = [token for review in test_articles for token in review]

len(all_word_train_mix), len(all_word_test_mix)

(3461195, 475782)

In [125]:
# Concatenate the train and test token streams; word counts computed from this
# list therefore include the test split as well.
all_word_mix = all_word_train_mix + all_word_test_mix
len(all_word_mix)

3936977

In [143]:
# Build a frequency table from the Counter: one row per distinct token.
df = DF.from_dict(Counter(all_word_mix), orient='index').reset_index()
# Name the columns, then sort by count in descending order and renumber the rows.
df = df.rename(columns={'index':'words',
                              0:'count'}).sort_values(["count"],ascending=False).reset_index(drop=True) 
df

Unnamed: 0,words,count
0,-rrb-,57854
1,-lrb-,52751
2,food,50410
3,time,28683
4,service,23633
5,love,19774
6,nice,18475
7,menu,18262
8,pretty,16828
9,bar,16221


In [141]:
# Keep only the words whose total count lies in the closed range [low, high].
low = 500
high = 50000
df2 = df[df['count'].between(low, high)].reset_index(drop=True)
df2.shape
df2

(1334, 2)

Unnamed: 0,words,count
0,time,28683
1,service,23633
2,love,19774
3,nice,18475
4,menu,18262
5,pretty,16828
6,bar,16221
7,chicken,15716
8,restaurant,14619
9,cheese,14202


In [167]:
# Only the word column is carried forward; the counts are no longer needed.
df3 = df2.drop(columns=["count"])
df3.head(20)

Unnamed: 0,words
0,time
1,service
2,love
3,nice
4,menu
5,pretty
6,bar
7,chicken
8,restaurant
9,cheese


## 去除数字以及其他无用词汇

In [159]:
# df3.to_csv("temp.csv")

In [190]:
df3.shape
# Strip digits and the characters | . ? ! from every word.
# The pattern is a raw string because "\." and "\?" in a normal string literal
# are invalid escape sequences (recent Python versions warn about them).
# Inside [...] a "|" is a literal character, not alternation; it is kept in the
# class so the result is unchanged. "+" replaces "*" so the pattern no longer
# matches the empty string at every position; the substitution result is identical.
# NOTE(review): a stripped word (e.g. one that lost its digits) may no longer
# equal any token in the articles and may duplicate another entry — confirm
# whether such words should be dropped instead.
df4 = df3["words"].apply(lambda word: re.sub(r"[0-9|.?!]+", "", word))
# Words made up only of the stripped characters are now empty; discard them.
df4 = df4[df4 != ""]
df4.shape

(1334, 1)

(1316,)

In [189]:
# Final vocabulary as a NumPy array; it is passed to getTF / getIDF as the word list,
# so its order determines the column order of those results.
# NOTE(review): despite the name, nothing here de-duplicates the entries after the
# character stripping above — confirm they are actually unique.
all_word_unique = df4.values
all_word_unique

array(['time', 'service', 'love', ..., 'fondue', 'cards', 'miles'], dtype=object)

## 测试-1

检查训练集和测试集的词的不重复数

In [113]:
# Distinct tokens of each split.
all_word_train, all_word_test = set(all_word_train_mix), set(all_word_test_mix)

len(all_word_train), len(all_word_test), (len(all_word_train) - len(all_word_test))

# Tokens that occur in exactly one of the two splits.
diff1 = all_word_train - all_word_test
diff2 = all_word_test - all_word_train

print("number of words in train but not in test: ", len(diff1))
print("number of words in test but not in train: ", len(diff2))

# Distinct tokens over both splits together.
all_word = all_word_train | all_word_test
len(all_word)

(89453, 31612, 57841)

number of words in train but not in test:  64358
number of words in test but not in train:  6517


95970

## 测试-2
检查词频分布

In [129]:
def show_common_word_rate(lst, k):
    """Print how much of ``lst`` is covered by its ``k`` most common items.

    Reports the most and least frequent of the top-``k`` items with their
    counts, the number of elements of ``lst`` covered by those items, the
    total and remaining element counts, and the covered share in percent.

    Args:
        lst: sequence of hashable items (e.g. tokens).
        k: how many of the most common items to consider.
    """
    total = len(lst)
    top_k = Counter(lst).most_common(k)
    covered = sum(freq for _, freq in top_k)

    print("max frequent word and count: ", top_k[0])
    print("min frequent word and count: ", top_k[-1])
    print("frequent word count: ", covered)
    print("total word count: ", total)
    print("remain word count: ", total - covered)
    print("rate:%.5f%%" % (covered / total * 100))

# Test call on a toy list (disabled):
#show_common_word_rate([1,2,3,4,2,2], 2)
# Coverage of the 500 and of the 50000 most common tokens.
show_common_word_rate(all_word_mix, 500)
show_common_word_rate(all_word_mix, 50000)

max frequent word and count:  ('-rrb-', 57854)
min frequent word and count:  ('cocktails', 1404)
frequent word count:  1996109
total word count:  3936977
remain word count:  1940868
rate:50.70157%
max frequent word and count:  ('-rrb-', 57854)
min frequent word and count:  ('candeliar', 1)
frequent word count:  3891007
total word count:  3936977
remain word count:  45970
rate:98.83235%


# 得到TF-IDF矩阵

In [199]:
def getTF(dataSet, allWords):
    """Compute the term-frequency (TF) matrix of a tokenized data set.

    Entry ``[i, j]`` is the number of times ``allWords[j]`` occurs in
    ``dataSet[i]`` divided by the number of tokens in ``dataSet[i]``.
    A document with no tokens yields an all-zero row.

    Args:
        dataSet: sequence of token lists, one per document.
        allWords: sequence of vocabulary words; it fixes the column order.
            Repeated words are allowed and get identical columns.

    Returns:
        A float NumPy array of shape ``(len(dataSet), len(allWords))``.

    The matrix is preallocated and filled from each document's own token
    counts. This avoids building a nested Python list with one float object
    per (document, word) pair and avoids looping over the whole vocabulary for
    every document. Because the loop is now short, the progress bar (the
    deprecated ``tqdm.tnrange``) is no longer used.
    """
    # Map every vocabulary word to the column(s) it occupies.
    columns = {}
    for col, word in enumerate(allWords):
        columns.setdefault(word, []).append(col)

    TF = np.zeros((len(dataSet), len(allWords)), dtype=float)
    for row, doc in enumerate(dataSet):
        doc_len = len(doc)
        if doc_len == 0:
            # Empty document: leave the row at zero instead of dividing by zero.
            continue
        for word, count in Counter(doc).items():
            cols = columns.get(word)
            if cols is not None:
                TF[row, cols] = count / doc_len
    return TF

# TF matrices for both splits, built over the same word list so their columns line up.
train_TF = getTF(train_articles, all_word_unique)
test_TF = getTF(test_articles, all_word_unique)

train_TF.shape, test_TF.shape





((62522, 1316), (8671, 1316))

In [220]:
# Output directory for the TF artefacts. os.path.join builds the paths with the
# platform's separator; the previous "\\" concatenation was Windows-only.
dirPath = os.path.join("data preprocessed", "tf-idf")
# exist_ok=True makes the call a no-op when the directory is already there.
os.makedirs(dirPath, exist_ok=True)

# Plain numeric CSVs without index or header row.
DF(train_TF).to_csv(os.path.join(dirPath, 'train_tf.csv'), index=False, header=False)
DF(test_TF).to_csv(os.path.join(dirPath, 'test_tf.csv'), index=False, header=False)
DF(all_word_unique).to_csv(os.path.join(dirPath, 'all_word_unique.csv'), index=False, header=False)

这里需要注意，有一些样本去掉停用词后整个都没有数据了，这时候该样本在TF矩阵中对应的那一行就默认全部为0了。

In [200]:
# Example of a training sample that has no tokens left after stop-word filtering.
train_articles[5773]

[]

In [205]:
import math

def getIDF(dataSet, allWords):
    """Compute the inverse document frequency (IDF) of every vocabulary word.

    For each word the value is ``log2(N / (1 + n_docs_with_word))``, where
    ``N`` is ``len(dataSet)`` and ``n_docs_with_word`` is the number of
    documents that contain the word at least once.

    Args:
        dataSet: sequence of token lists, one per document.
        allWords: sequence of vocabulary words; it fixes the output order.

    Returns:
        A 1-D NumPy array with one IDF value per entry of ``allWords``.

    Raises:
        ValueError: if ``dataSet`` is empty while ``allWords`` is not,
            because log2(0) is undefined.

    Document frequencies are counted in one pass over the documents instead
    of scanning every document's token list once per vocabulary word. The
    values are identical; the progress bar (the deprecated ``tqdm.tnrange``)
    is no longer needed.
    """
    n_docs = len(dataSet)

    # set(doc) makes each document count at most once per word.
    doc_freq = Counter()
    for doc in dataSet:
        doc_freq.update(set(doc))

    # A Counter returns 0 for words that occur in no document.
    return np.array([math.log(n_docs / (1 + doc_freq[word]), 2) for word in allWords])

# One IDF vector per split, both over the same word list.
# NOTE(review): test_IDF is computed from the test documents themselves, so the
# train and test features end up weighted by different IDF values; reusing
# train_IDF for the test split is the usual practice — confirm which is intended.
train_IDF = getIDF(train_articles, all_word_unique)
test_IDF = getIDF(test_articles, all_word_unique)

train_IDF.shape, test_IDF.shape





((1316,), (1316,))

In [206]:
# TF-IDF = TF * IDF. The 1-D IDF vector is broadcast across the rows of the TF
# matrix, so every column is scaled by the IDF value of its word.
train_TFIDF = train_TF * train_IDF
test_TFIDF = test_TF * test_IDF

train_TFIDF.shape, test_TFIDF.shape

((62522, 1316), (8671, 1316))

## 划分数据集

In [208]:
# Fraction of the training rows held out for validation.
splitRate = 0.3
# Number of held-out rows (fractional part truncated).
splitNum = int(splitRate * len(train_TFIDF))
# Unshuffled split: the last splitNum rows become the validation set and
# everything before them stays in the training set.
# NOTE(review): if splitNum were 0, `[:-0]` would leave trainSet empty.
trainSet, validateSet = train_TFIDF[:-splitNum], train_TFIDF[-splitNum:]

trainSet.shape, validateSet.shape

((43766, 1316), (18756, 1316))

In [210]:
# Turn the label lists into NumPy arrays.
train_labels, test_labels = arr(train_labels), arr(test_labels)

# Same tail split as the feature matrix, so labels stay aligned with their rows.
trainSetLabel, validateSetLabel = train_labels[:-splitNum], train_labels[-splitNum:]

trainSetLabel.shape, validateSetLabel.shape

((43766,), (18756,))

In [218]:
# Preview of the label encoding on the first 20 training labels: LOW -> 0, MID -> 1, HIG -> 2.
DF(trainSetLabel[0:20]).replace("LOW",0).replace("MID",1).replace("HIG",2)

Unnamed: 0,0
0,0
1,2
2,0
3,2
4,0
5,0
6,1
7,1
8,0
9,0


## 保存结果

In [212]:
# Output directory for the TF-IDF artefacts. os.path.join builds the paths with
# the platform's separator; the previous "\\" concatenation was Windows-only.
dirPath = os.path.join("data preprocessed", "tf-idf")
# exist_ok=True makes the call a no-op when the directory is already there.
os.makedirs(dirPath, exist_ok=True)

# Plain numeric CSVs without index or header row.
DF(trainSet).to_csv(os.path.join(dirPath, 'train.csv'), index=False, header=False)
DF(validateSet).to_csv(os.path.join(dirPath, 'validate.csv'), index=False, header=False)
DF(test_TFIDF).to_csv(os.path.join(dirPath, 'test.csv'), index=False, header=False)

输出标签映射为数字。

In [219]:
# Encode the string labels as integers with a single mapping instead of a
# repeated chain of .replace() calls.
label_to_id = {"LOW": 0, "MID": 1, "HIG": 2}
trainSetLabel = DF(trainSetLabel).replace(label_to_id)
validateSetLabel = DF(validateSetLabel).replace(label_to_id)

# Both objects are already DataFrames, so they are written directly.
# os.path.join replaces the Windows-only "\\" concatenation.
trainSetLabel.to_csv(os.path.join(dirPath, 'train_label.csv'), index=False, header=False)
validateSetLabel.to_csv(os.path.join(dirPath, 'validate_label.csv'), index=False, header=False)