In [1]:
# Make IPython display the value of every top-level expression statement in a
# cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr
import pandas as pd
import os
# NOTE(review): tnrange / tqdm_notebook appear to be deprecated in newer tqdm
# releases in favour of tqdm.notebook -- confirm against the installed version.
from tqdm import tqdm, tnrange, tqdm_notebook

# Short aliases used by the cells below.
DF = pd.DataFrame
arr = np.array

import re
import string
import operator

# 读取数据集

In [2]:
def loadDataSet(filePath):
    """Read a tab-separated dataset file into token lists and labels.

    Every non-blank line is split on tabs. Field 0 is the label (upper-cased
    and stripped). Field 2 is the article text: it is lower-cased, every
    "<sssss>" marker is replaced by a space, the result is stripped and then
    split on single spaces. Because the split is on single spaces, runs of
    consecutive spaces produce empty-string tokens.

    Whitespace-only lines (for example a trailing blank line) are skipped
    instead of raising IndexError.

    Args:
        filePath: Path of the UTF-8 encoded input file.

    Returns:
        A tuple (articles, labels) of two parallel lists: articles[i] is the
        token list of the i-th kept line and labels[i] is its label string.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
    """
    articles, labels = [], []
    with open(filePath, 'r', encoding="utf-8") as f:
        # Iterate the file lazily rather than materialising it with readlines().
        for line in f:
            if not line.strip():
                continue
            part = line.split("\t")
            # Labels are normalised to upper case, article text to lower case.
            label = part[0].upper().strip()
            article = part[2].lower().replace("<sssss>", " ").strip().split(" ")
            articles.append(article)
            labels.append(label)
    return articles, labels
        

# Build the dataset paths with os.path.join so the separators are correct on
# every platform (hard-coded ".\\data\\..." only resolves on Windows).
train_articles, train_labels = loadDataSet(os.path.join("data", "MulLabelTrain.ss"))
test_articles, test_labels = loadDataSet(os.path.join("data", "MulLabelTest.ss"))

# Number of articles in the training and test splits.
len(train_articles), len(test_articles)

(62522, 8671)

In [3]:
# Show the token list of the first training article, then its label.
print(train_articles[0], "\n\n", train_labels[0])

['we', 'went', 'on', 'a', 'sunday', 'around', '11am', 'we', 'got', 'seated', 'right', 'away', '.', '', '', 'the', 'menu', 'is', 'trying', 'to', 'be', 'like', 'a', 'new-american', 'style', 'type', 'of', 'menu', '.', '', '', 'i', 'ordered', 'chicken', 'and', 'waffles', 'and', 'the', 'bo', 'ordered', 'a', 'breakfast', 'burrito', '.', '', '', 'our', 'meals', 'took', 'around', '25', 'minutes', 'to', 'come', 'out', 'and', 'my', 'boyfriend', 'even', 'commented', 'saying', 'how', 'all', 'the', 'people', 'sitting', 'next', 'to', 'us', 'no', 'one', 'had', 'a', 'plate', 'in', 'front', 'of', 'them', '.', '', '', 'sunday', 'must', 'have', 'been', 'a', 'new', 'employee', 'training', 'day', 'because', 'there', 'was', 'so', 'many', 'kids', 'wandering', 'around', 'and', 'around', 'the', 'restaurant', 'with', 'no', 'real', 'task', 'at', 'hand', '.', '', '', 'the', 'coffee', '...', 'the', 'coffee', 'was', 'good', '.', '', '', 'then', ',', 'finally', 'our', 'food', 'came', 'out', '.', '', '', 'my', 'chick

# 去除停用词

In [4]:
# Load the stop-word list: one entry per line, lower-cased and stripped.
# os.path.join keeps the path portable (".\\data\\..." only resolves on Windows).
with open(os.path.join("data", "stopwords-en.txt"), "r", encoding='utf-8') as f:
    stopwords = [line.lower().strip() for line in f]

# Show how many stop words were loaded, then the list itself.
len(stopwords)
print(stopwords)

170

["'d", "'ll", "'m", "'re", "'s", "'t", "'ve", "n't", "''", '``', 'etc', 'got', 'go', 'get', 'also', 'would', 'could', 'through', 'all', 'ours', 'being', 'if', 'is', 'did', 's', 'hasn', 'only', 'against', 'each', 'how', 'nor', 'needn', 'for', 'until', 'them', 'yourself', 'ourselves', 'your', 'once', 'my', 'they', 'll', 'couldn', 'won', 're', 'him', 'had', 'me', 'further', 'such', 'too', 'are', 'our', 'can', 'where', 'same', 'am', 'why', 'the', 'yours', 'does', 'after', 'on', 'mightn', 'their', 'his', 'over', 'were', 'shouldn', 'about', 'very', 'aren', 'it', 'not', 'its', 'was', 'few', 'haven', 'because', 'theirs', 'down', 'from', 'd', 'you', 'which', 'than', 'do', 'an', 'been', 'off', 'who', 'now', 'what', 'below', 'while', 'both', 'more', 'this', 'himself', 'when', 'wouldn', 'he', 'just', 'a', 'don', 'up', 'shan', 'during', 'we', 'didn', 'or', 'o', 'y', 've', 'yourselves', 'in', 'own', 'again', 'here', 'have', 'to', 'between', 'that', 'at', 'ain', 'into', 'and', 'doesn', 'weren', 'of',

In [5]:
# Keep the unfiltered article lists under separate names before train_articles /
# test_articles are rebound to their filtered versions.
# list.copy() is shallow: the outer list is new, the inner token lists are shared.
train_articles_backup = train_articles.copy()
test_articles_backup = test_articles.copy()

In [6]:
def remove_stopwords_and_useless_words(articles, stop_words=None):
    """Drop stop words and very short tokens from every article.

    A token is removed when it is a stop word, or when its length is at most 1
    (this includes the empty string) unless it is "!" or "?".

    Args:
        articles: Iterable of token lists.
        stop_words: Optional iterable of stop words. When omitted, the
            notebook-level ``stopwords`` list is used, which matches the
            original behaviour.

    Returns:
        A new list with one filtered token list per input article. The input
        lists are not modified.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build the lookup set once: testing membership in a list for every token
    # costs O(number of stop words) per token.
    stop_set = frozenset(stop_words)

    def is_useless(j):
        # "!" and "?" are the only tokens of length <= 1 that are kept.
        return len(j) <= 1 and j not in ("!", "?")

    return [
        [token for token in article if token not in stop_set and not is_useless(token)]
        for article in articles
    ]

# Filter from the backups (not from train_articles / test_articles themselves),
# so re-running this cell always starts from the unfiltered token lists.
train_articles = remove_stopwords_and_useless_words(train_articles_backup)
test_articles = remove_stopwords_and_useless_words(test_articles_backup)

In [7]:
# Inspect the first filtered article of each split.
print(train_articles[0])
print(test_articles[0])

['went', 'sunday', 'around', '11am', 'seated', 'right', 'away', 'menu', 'trying', 'like', 'new-american', 'style', 'type', 'menu', 'ordered', 'chicken', 'waffles', 'bo', 'ordered', 'breakfast', 'burrito', 'meals', 'took', 'around', '25', 'minutes', 'come', 'boyfriend', 'even', 'commented', 'saying', 'people', 'sitting', 'next', 'us', 'one', 'plate', 'front', 'sunday', 'must', 'new', 'employee', 'training', 'day', 'many', 'kids', 'wandering', 'around', 'around', 'restaurant', 'real', 'task', 'hand', 'coffee', '...', 'coffee', 'good', 'finally', 'food', 'came', 'chicken', 'good', 'baked', 'chicken', 'breast', 'breaded', 'waffle', 'half', 'waffle', 'two', 'pieces', 'came', 'toasted', 'cajun', 'style', '-lrb-', 'like', 'brick', '-rrb-', 'think', 'seasoned', 'pepper', 'something', '?', 'scrambled', 'eggs', 'came', 'already', 'mixed', 'syrup', 'boy', 'burrito', 'came', 'green', 'pita', 'wrap', 'tortilla', 'carne', 'mixed', 'eggs', 'avocado', 'side', 'carne', 'meat', 'lot', 'fat', 'breakfast'

# 去除高频词和低频词得到所有词列表

In [8]:
# Flatten the per-article token lists into one flat token list per split
# (duplicates are kept, so these are token occurrences, not vocabularies).
all_word_train_mix = [j for i in train_articles for j in i]
all_word_test_mix = [j for i in test_articles for j in i]

# Total number of token occurrences in each split.
len(all_word_train_mix), len(all_word_test_mix)

(4971274, 683802)

In [9]:
# Concatenate the training and test tokens; word counts below are therefore
# computed over both splits together.
all_word_mix = all_word_train_mix + all_word_test_mix
len(all_word_mix)

5655076

In [10]:
def getCountDF(in_list):
    """Count the items of ``in_list`` and return the counts as a DataFrame.

    Args:
        in_list: Iterable of hashable items (word tokens in this notebook).

    Returns:
        A DataFrame with columns "words" and "count", one row per distinct
        item, sorted by "count" in descending order with a fresh 0..n-1 index.
        Items with equal counts keep the order in which they were first
        encountered. Empty input yields an empty frame that still has both
        columns (sorting a frame without a "count" column would raise
        KeyError).
    """
    # Counter.most_common() already returns (item, count) pairs sorted by
    # descending count, with ties in first-encountered order.
    ranked = Counter(in_list).most_common()
    return DF(ranked, columns=["words", "count"])

# Word-frequency table over the combined training and test tokens.
all_word_mix_df = getCountDF(all_word_mix)
all_word_mix_df

Unnamed: 0,words,count
0,!,79097
1,good,60518
2,-rrb-,57854
3,place,55116
4,-lrb-,52751
5,...,50579
6,food,50410
7,like,45040
8,great,39185
9,one,33928


In [11]:
# Frequency threshold: only words whose total count is strictly greater than
# `low` are kept.
low = 500
df2 = all_word_mix_df[low < all_word_mix_df['count']].reset_index(drop=True) 
df2.shape
df2

(1665, 2)

Unnamed: 0,words,count
0,!,79097
1,good,60518
2,-rrb-,57854
3,place,55116
4,-lrb-,52751
5,...,50579
6,food,50410
7,like,45040
8,great,39185
9,one,33928


In [12]:
# Keep only the "words" column; the counts are not needed after filtering.
df3 = df2.drop(["count"], axis=1)
df3.head(20)

Unnamed: 0,words
0,!
1,good
2,-rrb-
3,place
4,-lrb-
5,...
6,food
7,like
8,great
9,one


## 去除数字以及其他无用词汇

In [13]:
df3["words"].shape
# Strip digits and the characters '|', '.', '?' and '!' from every word, then
# drop words that became empty.
# The pattern is a raw string with a plain character class. It matches exactly
# the same characters as the former "[0-9||\.||\?||!]*", where "||" inside the
# brackets was a literal '|' rather than alternation, and "\." / "\?" were
# invalid escape sequences in a non-raw string literal.
df4 = df3["words"].apply(lambda x: re.sub(r"[0-9|.?!]", "", x))
df4 = df4[df4 != ""]
df4.shape

(1665,)

(1644,)

In [14]:
# Vocabulary used as the column set of the TF / TF-IDF matrices.
# NOTE(review): stripping characters in the previous cell can map different
# words to the same string, so uniqueness is not guaranteed -- verify.
all_word_unique = df4.values
all_word_unique

array(['good', '-rrb-', 'place', ..., 'fault', 'shake', 'fondue'], dtype=object)

## 测试-1

检查训练集和测试集的词的不重复数

In [15]:
# all_word_train = set(all_word_train_mix)
# all_word_test = set(all_word_test_mix)

# len(all_word_train), len(all_word_test), (len(all_word_train) - len(all_word_test))

# diff1 = all_word_train.difference(all_word_test)
# diff2 = all_word_test.difference(all_word_train)

# print("number of words in train but not in test: ", len(diff1))
# print("number of words in test but not in train: ", len(diff2))

# all_word = all_word_train.union(all_word_test)
# len(all_word)

## 测试-2
检查词频分布

In [16]:
# def show_common_word_rate(lst, k):
#     word_counter = Counter(lst)
#     cnt = 0
#     tot_cnt = len(lst)
#     common_pair = word_counter.most_common(k)
#     for key, val in common_pair:
#         cnt += val
#     print("max frequent word and count: ", common_pair[0])
#     print("min frequent word and count: ", common_pair[-1])
#     print("frequent word count: ", cnt)
#     print("total word count: ", tot_cnt)
#     print("remain word count: ", tot_cnt-cnt)
#     print("rate:%.5f%%" % (cnt/tot_cnt*100))

# #测试
# #show_common_word_rate([1,2,3,4,2,2], 2)
# show_common_word_rate(all_word_mix, 500)
# show_common_word_rate(all_word_mix, 50000)

# 得到TF-IDF矩阵

In [17]:
def getTF(dataSet, allWords, show_progress=True):
    """Compute the term-frequency (TF) matrix of a tokenised dataset.

    Entry [i, j] is the number of occurrences of allWords[j] in dataSet[i]
    divided by the number of tokens in dataSet[i]. A document with no tokens
    gets an all-zero row.

    Args:
        dataSet: Sequence of token lists, one per document.
        allWords: Sequence of vocabulary words; its order defines the columns.
            A word that occurs more than once gets the same value in each of
            its columns.
        show_progress: When True (the default) the document loop is wrapped
            in ``tnrange`` to show a notebook progress bar; pass False to run
            without tqdm.

    Returns:
        A float NumPy array of shape (len(dataSet), len(allWords)).
    """
    n_docs = len(dataSet)

    # Map each vocabulary word to every column it occupies, so that only the
    # words actually present in a document need to be visited.
    columns_of = {}
    for col, word in enumerate(allWords):
        columns_of.setdefault(word, []).append(col)

    # Pre-allocate the result instead of building a docs x vocabulary list of
    # Python floats first; always float, even when every document is empty.
    TF = np.zeros((n_docs, len(allWords)), dtype=float)

    doc_indices = tnrange(n_docs) if show_progress else range(n_docs)
    for index in doc_indices:
        doc = dataSet[index]
        doc_len = len(doc)
        if doc_len == 0:
            # Empty document: leave the row at zero (avoids division by zero).
            continue
        for word, count in Counter(doc).items():
            for col in columns_of.get(word, ()):
                TF[index, col] = count / doc_len
    return TF

# Term-frequency matrices for both splits, using the same vocabulary so the
# columns line up.
train_TF = getTF(train_articles, all_word_unique)
test_TF = getTF(test_articles, all_word_unique)

train_TF.shape, test_TF.shape





((62522, 1644), (8671, 1644))

## 保存TF矩阵

In [18]:
# Output directory for all saved matrices. The historical absolute path stays
# the default; set the TFIDF_OUTPUT_DIR environment variable to write
# somewhere else without editing the notebook.
dirPath = os.environ.get(
    "TFIDF_OUTPUT_DIR",
    "E:\\Code\\_largeData\\Github--Open-Course-Learning--A04\\Project\\multiclass classification\\data preprocessed\\tf-idf",
)
# exist_ok=True replaces the separate "exists" check followed by makedirs.
os.makedirs(dirPath, exist_ok=True)

# Label the TF matrix columns with the vocabulary words.
train_tf_DF = DF(train_TF, columns=all_word_unique)
test_tf_DF = DF(test_TF, columns=all_word_unique)

# os.path.join places the files inside dirPath on every platform; gluing
# '\\name.csv' onto the directory only works where '\\' is the separator.
train_tf_DF.to_csv(os.path.join(dirPath, 'train_tf.csv'), index=False, header=True)
test_tf_DF.to_csv(os.path.join(dirPath, 'test_tf.csv'), index=False, header=True)

DF(all_word_unique).to_csv(os.path.join(dirPath, 'all_word_unique.csv'), index=False, header=True)

这里需要注意：有一些样本在去掉停用词后没有剩下任何词，这时该样本对应的 TF 那一行默认全为 0。

In [19]:
train_articles[5773]

['tried',
 'place',
 'twice',
 'different',
 'different',
 'place',
 'much',
 'else',
 'going']

## 计算TF-IDF矩阵

In [20]:
import math

def getIDF(dataSet, allWords, show_progress=True):
    """Compute the inverse document frequency (IDF) of each vocabulary word.

    For a word contained in ``df`` documents of ``dataSet`` the value is
    log2(len(dataSet) / (1 + df)).

    Args:
        dataSet: Sequence of token lists, one per document.
        allWords: Sequence of vocabulary words; the result follows its order.
        show_progress: When True (the default) the document loop is wrapped
            in ``tnrange`` to show a notebook progress bar; pass False to run
            without tqdm.

    Returns:
        A 1-D NumPy array with one IDF value per entry of ``allWords``.

    Raises:
        ValueError: From math.log when ``dataSet`` is empty and ``allWords``
            is not, because the logarithm argument is then 0.
    """
    n_docs = len(dataSet)

    # One pass over the documents counts how many documents contain each
    # token. Testing "word in doc" against a list for every vocabulary word
    # and every document rescans each document once per vocabulary word.
    doc_freq = Counter()
    doc_indices = tnrange(n_docs) if show_progress else range(n_docs)
    for index in doc_indices:
        # set() makes a token count once per document, however often it occurs.
        doc_freq.update(set(dataSet[index]))

    # Counter returns 0 for words that appear in no document.
    return arr([math.log(n_docs / (1 + doc_freq[word]), 2) for word in allWords])

# IDF vector for each split, computed from that split's own documents.
# NOTE(review): test_IDF is derived from the test articles, so the training and
# test features are weighted with different IDF values; reusing train_IDF for
# the test set is the usual convention -- confirm which is intended.
train_IDF = getIDF(train_articles, all_word_unique)
test_IDF = getIDF(test_articles, all_word_unique)

train_IDF.shape, test_IDF.shape





((1644,), (1644,))

In [21]:
import math

def getIDF(dataSet, allWords, show_progress=True):
    """Compute the inverse document frequency (IDF) of each vocabulary word.

    This cell repeats the getIDF definition of the preceding cell; being the
    later one, it is the definition in effect for the rest of the notebook.

    For a word contained in ``df`` documents of ``dataSet`` the value is
    log2(len(dataSet) / (1 + df)).

    Args:
        dataSet: Sequence of token lists, one per document.
        allWords: Sequence of vocabulary words; the result follows its order.
        show_progress: When True (the default) the document loop is wrapped
            in ``tnrange`` to show a notebook progress bar; pass False to run
            without tqdm.

    Returns:
        A 1-D NumPy array with one IDF value per entry of ``allWords``.

    Raises:
        ValueError: From math.log when ``dataSet`` is empty and ``allWords``
            is not, because the logarithm argument is then 0.
    """
    n_docs = len(dataSet)

    # One pass over the documents counts how many documents contain each
    # token. Testing "word in doc" against a list for every vocabulary word
    # and every document rescans each document once per vocabulary word.
    doc_freq = Counter()
    doc_indices = tnrange(n_docs) if show_progress else range(n_docs)
    for index in doc_indices:
        # set() makes a token count once per document, however often it occurs.
        doc_freq.update(set(dataSet[index]))

    # Counter returns 0 for words that appear in no document.
    return arr([math.log(n_docs / (1 + doc_freq[word]), 2) for word in allWords])

# IDF vector for each split, computed from that split's own documents.
# NOTE(review): test_IDF is derived from the test articles, so the training and
# test features are weighted with different IDF values; reusing train_IDF for
# the test set is the usual convention -- confirm which is intended.
train_IDF = getIDF(train_articles, all_word_unique)
test_IDF = getIDF(test_articles, all_word_unique)

train_IDF.shape, test_IDF.shape





((1644,), (1644,))

In [22]:
# TF-IDF = TF * IDF. The 1-D IDF vector (one value per word) is broadcast
# across the rows of the 2-D TF matrix (one row per article).
train_TFIDF = train_TF * train_IDF
test_TFIDF = test_TF * test_IDF

train_TFIDF.shape, test_TFIDF.shape

((62522, 1644), (8671, 1644))

### 保存TF-IDF矩阵

In [23]:
# Wrap the TF-IDF matrices in DataFrames whose columns are the vocabulary words.
train_tfidf = DF(train_TFIDF)
train_tfidf.columns = all_word_unique

test_tfidf = DF(test_TFIDF)
test_tfidf.columns = all_word_unique

# os.path.join places the files inside dirPath on every platform; gluing
# '\\name.csv' onto the directory only works where '\\' is the separator.
train_tfidf.to_csv(os.path.join(dirPath, 'train_tfidf.csv'), index=False, header=True)
test_tfidf.to_csv(os.path.join(dirPath, 'test_tfidf.csv'), index=False, header=True)

## 划分数据集

In [24]:
#划分比例
# Fraction of the training rows held out for validation.
splitRate = 0.3
# Number of held-out rows.
splitNum = int(train_TFIDF.shape[0]*splitRate) 
# The last splitNum rows form the validation set; the rows before them stay in
# the training set. No shuffling is done, so the split follows the file order.
# NOTE(review): if splitNum were 0, [:-0] would give an empty training set and
# [-0:] the whole matrix; this does not occur for the sizes shown here.
trainSet = train_TFIDF[:-splitNum]
validateSet = train_TFIDF[-splitNum:]

trainSet.shape, validateSet.shape

((43766, 1644), (18756, 1644))

In [25]:
# Convert the label lists to NumPy arrays.
train_labels = arr(train_labels)
test_labels = arr(test_labels)

# Slice the labels with the same splitNum as the feature matrix so that rows
# and labels stay aligned.
trainSetLabel = train_labels[:-splitNum]
validateSetLabel = train_labels[-splitNum:]

trainSetLabel.shape, validateSetLabel.shape

((43766,), (18756,))

In [26]:
# Preview the label-to-integer mapping (LOW -> 0, MID -> 1, HIG -> 2) on the
# first 20 training labels; nothing is assigned here.
DF(trainSetLabel[0:20]).replace("LOW",0).replace("MID",1).replace("HIG",2)

Unnamed: 0,0
0,0
1,2
2,0
3,2
4,0
5,0
6,1
7,1
8,0
9,0


## 保存结果

In [27]:
# Write the feature matrices without header or index (plain numeric CSV).
# os.path.join places the files inside dirPath on every platform; gluing
# '\\name.csv' onto the directory only works where '\\' is the separator.
DF(trainSet).to_csv(os.path.join(dirPath, 'train.csv'), index=False, header=False)
DF(validateSet).to_csv(os.path.join(dirPath, 'validate.csv'), index=False, header=False)
DF(test_TFIDF).to_csv(os.path.join(dirPath, 'test.csv'), index=False, header=False)

输出标签映射为数字。

In [28]:
# Map the string labels to integers in a single replace call; values that are
# not keys of the mapping are left unchanged, as with the chained replaces.
label_to_id = {"LOW": 0, "MID": 1, "HIG": 2}
trainSetLabel = DF(trainSetLabel).replace(label_to_id)
validateSetLabel = DF(validateSetLabel).replace(label_to_id)

# Both objects are already DataFrames, so they are written directly.
# os.path.join places the files inside dirPath on every platform; gluing
# '\\name.csv' onto the directory only works where '\\' is the separator.
trainSetLabel.to_csv(os.path.join(dirPath, 'train_label.csv'), index=False, header=False)
validateSetLabel.to_csv(os.path.join(dirPath, 'validate_label.csv'), index=False, header=False)