In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import spacy
from nltk.stem import PorterStemmer,LancasterStemmer
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,classification_report
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/train.tsv',sep='\t')
x_train,x_test,y_train,y_test = train_test_split(data['Phrase'],data['Sentiment'],test_size=0.2,random_state=42,shuffle=False)
df = pd.DataFrame(x_train,y_train)
df

Unnamed: 0_level_0,Phrase
Sentiment,Unnamed: 1_level_1
1,A series of escapades demonstrating the adage ...
2,A series
2,A series
2,A series
2,A series
...,...
3,A
3,A
4,series
4,series


In [13]:
# 文本清洗
class TextCleaner(object):
    d1 = "We're having a.b.c.d or i.e. u.s.a and 16.2 and U.K. and others."
    d2 = "We buy 12 apples, good apples, from Ms. ABC at 16.2 dollars/pound 24/7 from Monday-Friday. How's that?"
    d3 = "1. I won't eat 1/12 of the #1 top cakes. I got 1.2 dollars or 1.2usd and a long-term/short-term goal"
    d4 = "I lost 22 pounds at 9:30, 11:05 or 1:30pm or 2pm or 3pm or 1-2pm on 2016-01-02 or 01-02-2016."
    d5 = "  --He's a dedicated person. \t He dedicated his time to work! \n --are you sure?"
    d6 = "I am interested in these interesting things that interested me."
    sample_docs = [d1, d2, d3, d4, d5, d6]

    def __init__(self, option='none'):
        self.option = option
        self.nlp = spacy.load('en_core_web_sm')
        self.stemmer = PorterStemmer()

    def lemmatize(self, text):
        tuples = [(str(token), token.lemma_) for token in self.nlp(text)]
        text = ' '.join([lemma if lemma != '-PRON-' else token for token, lemma in tuples])
        text = re.sub(' ([_-]) ', '\\1', text)  # 'short - term' -> 'short-term'
        return text

    def text_cleanner(self, text):
        stopwords = set(['i', 'I', 'you', 'he', 'she', 'it', 'we', 'they', 'am', 'is', 'are'])
        text = text.lower()  # 1.lower strings
        text = text.replace("'s", 's')  # 2.handle contractions
        text = re.sub('([a-zA-Z])\\.([a-zA-Z])\\.*', '\\1\\2', text)  # 3. handle abbreviations
        text = re.sub('\\d+', '', text)  # 4. handle numbers
        text = re.sub('[^a-zA-Z0-9\\s_-]', ' ', text)  # 5. remove punctuations
        if 'lemma' in self.option:
            text = self.lemmatize(text)  # 6. perform lemmatization
        elif 'stem' in self.option:
            text = ' '.join(self.stemmer.stem(x) for x in text.split(' '))  # 6. perform stemming
        text = ' '.join(x for x in text.split(' ') if
                        x not in stopwords)  # 7. remove stopwords(stopwords类型是set而不是list，因为set的查找速度更快)
        text = ' '.join(x.strip('_-') for x in text.split())  # 8. remove spaces,'_' and '-'
        return text

    def transform(self, docs):  # transformer必须要有fit()和transform(）方法
        clean_docs = []
        self.fails = []
        for doc in tqdm(docs):
            try:
                clean_docs.append(self.text_cleanner(doc))
            except:
                self.fails.append(doc)
        if len(self.fails) > 0:
            print("Some documents failed to be converted. Check self.fails for failed documents")
        return clean_docs

    def fit(self, docs, y=None):
        return self

    def fit_transform(self, docs, y=None):
        return self.fit(docs,y).transform(docs)

In [14]:
cleanner = TextCleaner(option='lemma')
x_train = cleanner.transform(x_train)
# for raw,clean in zip(x_train, x_train):
#     # print('original doc:', raw)
#     print(clean)
#     print()
print('X_train shape:', len(x_train), '\t', 'Y_train shape:', len(y_train))

100%|██████████| 124848/124848 [06:29<00:00, 320.59it/s]

X_train shape: 124848 	 Y_train shape: 124848





In [15]:
# Tf-idf特征提取
vec = TfidfVectorizer(
    input='content',                     # default；{‘filename’，‘file’，‘content’}；input must be a list
    encoding='utf-8',                    # default; same options as in str.decode(encoding='utf-8')
    decode_error='strict',               # default; same options as in str.decode(errors='strict')
    # preprocessing arguments
    lowercase=True,                      # default; convert strings to lowercase
    strip_accents='unicode',             # default=None; remove accents; 'unicode' is slower but universal
    preprocessor=None,                   # default
    # tokenization arguments
    analyzer='word',                     # default;{'word','char','char_wb'}
    token_pattern=u'(?u)\\b\\w\\w+\\b',  # default; equivalent to using a nltk.RegexpTokenizer()
    tokenizer=None,                      # default; can use a nltk tokenizer
    # vocabulary arguments
    stop_words=None,                     # default=None; try stop_words='english' for example
    ngram_range=(1,2),                   # default=(1,1)
    min_df=1,                            # default=1  ; int or [0.0, 1.0]; ignore terms with a doc-freq < cutoff
    max_df=0.8,                          # default=1.0; [0.0, 1.0] or int; ignore terms with a doc-freq > cutoff; 整数是多少个，小数是百分比
    max_features=None,                   # default; keep only the top max_features ordered by term-freq; 保持满足上述条件的词汇量大小
    vocabulary=None,                     # default; if provided, max_df, min_df ,max_features are ignored
    # TF - IDF adjustment arguments
    binary=False,                        # default; if True, all non-zero term counts are set to 1
    sublinear_tf=True,                   # default; if True, use 1 + log(tf) for non-zeros; else use tf
    use_idf=True,                        # default; if True, enable IDF re-weighting
    smooth_idf=True,                     # default; if True, use 1 + log((N_docs+1)/(df+1)), else use 1 + log(N_docs/df)
    norm='l2'                            # default; if True, preform post TF-IDF normalization such that output row has unit norm
)

In [111]:
# understand TfidfVectorizer - before performing vec.fit_transform()
preprocessor = vec.build_preprocessor()       #lower strings & remove accents
print(preprocessor("Let's TRY this OUT: aigué béonté"))

tokenizer = vec.build_tokenizer()        #tokenize documents
print(tokenizer("let's try this out: aigue beonte"))

analyzer = vec.build_analyzer()      # preprocess > tokenize > remove > stopwords > get n-grams > prune vocabulary
print(analyzer("let's try this out: aigue beonte"))

let's try this out: aigue beonte
['let', 'try', 'this', 'out', 'aigue', 'beonte']
['let', 'try', 'this', 'out', 'aigue', 'beonte', 'let try', 'try this', 'this out', 'out aigue', 'aigue beonte']


In [112]:
# understand TfidfVectorizer - after performing vec.fit_transform()
document_term_matrix = vec.fit_transform(x_train)  # scipy sparse csr matrix
idx2term = vec.get_feature_names_out()
term2idx = vec.vocabulary_
vec.get_stop_words()

In [113]:
# term2idx

In [17]:
x_train = vec.fit_transform(x_train).toarray()
x_test = vec.transform(x_test).toarray()
print('x_train shape:',x_train.shape, '\t', 'y_train shape:', len(y_train))
print('x_test shape:',x_test.shape, '\t', 'y_test shape:',len(y_test))

MemoryError: Unable to allocate 67.9 GiB for an array with shape (124848, 72958) and data type float64

In [115]:
# 多分类 multi-class classification； 注意和 multi-label 的区别
logreg = LogisticRegression(
    penalty='l2',                # default; {'l1','l2','elasticnet','none'}
    l1_ratio=None,               # range (0,1) used if penalty='elasticnet'
    C=1.0,                       # default; inverse of alphs - smaller valves give stronger regularization
    multi_class='ovr',   # default='multinomial'; {'auto','ovr','multinomial'} handles multi-class only
    solver='liblinear',          # default; {'liblinear','lbfgs','newton-cg','sag','saga'}
    max_iter=100,                # default;
    tol=1e-4,                    # default; tolerance for stopping criteria
)
# 'ovr' fits multiple binary classifiers while 'multinomial' fits one multi-class classifier

In [116]:
logreg.fit(x_train, y_train[:1000])
y_prob = logreg.predict_proba(x_test)  # 先预测概率probability
print(y_prob.shape);y_prob   # 每行概率相加【
y_pred = logreg.predict(x_test)
print(y_pred.shape);y_pred
print(y_prob.argmax(1))
print(y_prob)
# print('accuracy:',accuracy_score(Y_test,y_pred).round(4))
# print('precision-macro:',precision_score(Y_test,y_pred,average='macro').round(4))
# print('precision-micro:',precision_score(Y_test,y_pred,average='micro').round(4))
# print('precision-weighted:',precision_score(Y_test,y_pred,average='weighted').round(4))
# print('recall-macro:',recall_score(Y_test,y_pred,average='macro').round(4))
# print('recall-micro:',recall_score(Y_test,y_pred,average='micro').round(4))
# print('recall-weighted:',recall_score(Y_test,y_pred,average='weighted').round(4))
# print('f1-macro:',f1_score(Y_test,y_pred,average='macro').round(4))
# print('f1-micro:',f1_score(Y_test,y_pred,average='micro').round(4))
# print('f1-weighted:',f1_score(Y_test,y_pred,average='weighted').round(4))
# # or
# print(classification_report(Y_test,y_pred,digits=4))
# pdat = pd.DataFrame(classification_report(Y_test,y_pred,digits=4,output_dict=True)).round(4).T
# print(pdat)
# print(pdat[:4].mean(0).round(4))

(1000, 5)
(1000,)
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 

In [121]:
from collections import Counter
Counter(y_prob.argmax(1))

Counter({2: 993, 3: 6, 1: 1})