In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings 
warnings.filterwarnings("ignore")
import seaborn as sns
import os
from datetime import date
cwd = os.getcwd()

sns.set(style="white", color_codes=True)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [None]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance; tweet_cleaner_updated calls tok.tokenize() to re-split the cleaned text.
tok = WordPunctTokenizer()

# Twitter @mentions: "@" followed by letters, digits or underscores.
pat1 = r'@[A-Za-z0-9_]+'
# http:// or https:// URLs, running up to the next space.
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
# Scheme-less URLs starting with "www.". The dot is escaped so that it only
# matches a literal "."; unescaped, the pattern also fired inside ordinary
# words (e.g. "awww yeah") and deleted the following word as well.
# NOTE(review): text cleaned with the earlier unescaped pattern (for example a
# cached CSV) can differ slightly from text cleaned with this one -- re-clean
# before retraining if the two need to match exactly.
www_pat = r'www\.[^ ]+'
# Contraction -> expanded form. Keys are lower-case, so apply to lower-cased text.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# Whole-word alternation of every contraction; re.escape keeps each key literal
# even if one containing regex metacharacters is added later.
neg_pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in negations_dic) + r')\b')

def tweet_cleaner_updated(text):
    """Clean one raw tweet into lower-case, space-separated alphabetic words.

    Pipeline: strip HTML markup with BeautifulSoup, remove @mentions and URLs
    (``combined_pat``, ``www_pat``), lower-case, expand the contractions listed
    in ``negations_dic``, replace every non-letter with a space, then
    re-tokenize and drop single-character tokens.

    Args:
        text: Raw tweet text (may contain HTML markup or entities).

    Returns:
        str: The cleaned tweet, words joined by single spaces.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    # The previous version called ``souped.decode("utf-8-sig")`` inside a bare
    # ``except:``. On Python 3 a str has no ``.decode``, so that call always
    # failed and the bare except hid it (along with any other error). Do the
    # intended clean-up explicitly instead: drop a leading byte-order mark and
    # turn the Unicode replacement character into "?".
    bom_removed = souped[1:] if souped.startswith(u"\ufeff") else souped
    bom_removed = bom_removed.replace(u"\ufffd", "?")
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda match: negations_dic[match.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # Replacing non-letters with spaces leaves runs of whitespace; tokenizing
    # and re-joining collapses them. Tokens of length 1 are discarded.
    words = [word for word in tok.tokenize(letters_only) if len(word) > 1]
    return " ".join(words).strip()

In [2]:
# Load the tweet dataset; the CSV's first column is used as the DataFrame index.
# NOTE(review): presumably this file is the output of the tweet cleaner -- confirm.
csv = 'clean_tweet.csv'
my_df = pd.read_csv(csv,index_col=0)
# Preview the first rows.
my_df.head()

Unnamed: 0,text,target
0,sad time of night is not it,0.0
1,rt please get things under control so do not h...,0.0
2,apology accepted,1.0
3,tammy received the fifth component of the grea...,1.0
4,what becomes of the broken hearted they buy sh...,1.0


In [3]:
# Discard rows with missing values and renumber the index from 0, then
# print a summary of the resulting frame.
my_df = my_df.dropna().reset_index(drop=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643029 entries, 0 to 643028
Data columns (total 2 columns):
text      643029 non-null object
target    643029 non-null float64
dtypes: float64(1), object(1)
memory usage: 9.8+ MB


In [4]:
# Model input (tweet text) and label (target) columns.
x = my_df["text"]
y = my_df["target"]

In [5]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
SEED = 2000
# Hold out 2% of the rows, then split that hold-out in half into validation
# and test sets. The same seed is used for both splits so they are repeatable.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)



In [6]:
def _print_class_balance(split_name, features, labels):
    """Print the size of one data split and its share of negative / positive labels.

    Args:
        split_name: Label used at the start of the printed line (e.g. "Train").
        features: The split's feature Series (indexed like ``labels``).
        labels: The split's target Series; 0 counts as negative, 1 as positive.
    """
    total = len(features)
    negative_pct = (len(features[labels == 0]) / (total * 1.)) * 100
    positive_pct = (len(features[labels == 1]) / (total * 1.)) * 100
    print("{0} set has total {1} entries with {2:.2f}% negative, {3:.2f}% positive".format(
        split_name, total, negative_pct, positive_pct))


# One report line per split (replaces three copy-pasted print statements).
for _split_name, _features, _labels in (("Train", x_train, y_train),
                                        ("Validation", x_validation, y_validation),
                                        ("Test", x_test, y_test)):
    _print_class_balance(_split_name, _features, _labels)

Train set has total 630168 entries with 49.14% negative, 50.86% positive
Validation set has total 6430 entries with 49.41% negative, 50.59% positive
Test set has total 6431 entries with 50.19% negative, 49.81% positive


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word uni-, bi- and tri-grams, capped at 100,000 features.
tvec = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))
# Learn the vocabulary and IDF weights from the training split only.
tvec.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
# Persist the fitted TF-IDF vectorizer to disk so it can be reloaded later.
with open('tvec.pickle', mode='wb') as tvec_file:
    pickle.dump(tvec, tvec_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
# Restore the fitted TF-IDF vectorizer from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you
# created yourself or otherwise trust.
with open('tvec.pickle', mode='rb') as tvec_file:
    tvec = pickle.load(tvec_file)

In [11]:
# Vectorize the training and test text with the already-fitted vectorizer
# (transform only -- nothing is refit on the test split).
x_train_tfidf = tvec.transform(x_train)
x_test_tfidf = tvec.transform(x_test)

In [18]:
# Small sample: the first ten test tweets (vectorized) and their true labels,
# used further down to inspect individual predictions.
x_test_10 = tvec.transform(x_test[:10])
y_test_10 = y_test[:10]
print(y_test_10)

71160     0.0
290198    0.0
283413    0.0
227678    1.0
505112    1.0
308433    0.0
545336    1.0
19397     0.0
532450    0.0
317038    1.0
Name: target, dtype: float64


In [12]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default hyper-parameters, trained on the TF-IDF features.
lr_with_tfidf = LogisticRegression()
lr_with_tfidf.fit(x_train_tfidf,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Mean accuracy of the logistic-regression model on the test split.
lr_with_tfidf.score(x_test_tfidf,y_test)

0.8698491680920541

In [24]:
# Class-probability estimates for every test tweet: one row per tweet, one
# column per class (column order follows lr_with_tfidf.classes_).
yhat_lr = lr_with_tfidf.predict_proba(x_test_tfidf)

In [25]:
# Show the probabilities for the first ten test tweets.
yhat_lr[:10]

array([[0.87563308, 0.12436692],
       [0.80898444, 0.19101556],
       [0.91286171, 0.08713829],
       [0.48043856, 0.51956144],
       [0.14530519, 0.85469481],
       [0.66837506, 0.33162494],
       [0.00490413, 0.99509587],
       [0.86463396, 0.13536604],
       [0.59664801, 0.40335199],
       [0.32338311, 0.67661689]])

In [22]:
# Probabilities for the ten-tweet sample only; these should match yhat_lr[:10],
# since x_test_10 was built from the same first ten test tweets.
y_10 = lr_with_tfidf.predict_proba(x_test_10)
print(y_10)

[[0.87563308 0.12436692]
 [0.80898444 0.19101556]
 [0.91286171 0.08713829]
 [0.48043856 0.51956144]
 [0.14530519 0.85469481]
 [0.66837506 0.33162494]
 [0.00490413 0.99509587]
 [0.86463396 0.13536604]
 [0.59664801 0.40335199]
 [0.32338311 0.67661689]]


In [24]:
# Keras tokenizer fitted on the training text only; num_words=100000 caps the
# vocabulary used when texts are converted to sequences.
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
# Each training tweet becomes a list of integer word indices.
sequences = tokenizer.texts_to_sequences(x_train)

In [None]:
# Persist the fitted Keras tokenizer to disk so it can be reloaded later.
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Restore the fitted Keras tokenizer from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you
# created yourself or otherwise trust.
with open('tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [20]:
# Pad/truncate every training sequence to a fixed length of 45 tokens.
# `sequences` comes from the tokenizer-fitting cell (texts_to_sequences on
# x_train), so that cell must have been run first. The validation and test
# cells use the same maxlen=45; keep the three in sync.
x_train_seq = pad_sequences(sequences, maxlen=45)
print('Shape of data tensor:', x_train_seq.shape)

Shape of data tensor: (630168, 45)


In [21]:
# Validation tweets -> integer sequences -> fixed length 45 (same maxlen as training).
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=45)

In [22]:
# Test tweets -> integer sequences -> fixed length 45 (same maxlen as training).
sequences_test = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences_test, maxlen=45)

In [23]:
# NOTE(review): `seed` is not referenced by any visible cell -- presumably a
# leftover from model training; confirm before removing.
seed = 7

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

In [24]:
from keras.models import load_model
# Load the saved CNN checkpoint and evaluate it on the padded validation sequences.
# NOTE(review): evaluate() returns the loss followed by the model's compiled
# metrics -- presumably [loss, accuracy] here; confirm against the training code.
loaded_CNN_model = load_model('CNN_best_weights.02-0.8820.hdf5')
loaded_CNN_model.evaluate(x=x_val_seq, y=y_validation)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


[0.280831264216644, 0.8819595644299758]

In [25]:
# Same evaluation on the padded test sequences
# (presumably [loss, accuracy] -- confirm against the model's compile settings).
loaded_CNN_model.evaluate(x=x_test_seq, y=y_test)



[0.2818646466327926, 0.8830663969998337]

Use this to produce the results. (用这个出结果)

In [28]:
# Raw model outputs for the first five test tweets (one value per tweet).
# NOTE(review): presumably the probability of the positive class -- confirm
# against the model's output layer.
loaded_CNN_model.predict(x_test_seq[:5])

array([[0.01306436],
       [0.06669962],
       [0.02319559],
       [0.5504947 ],
       [0.7241536 ]], dtype=float32)