In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('dataset/Tweets.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
data = data[['airline_sentiment','text']] #只需要评价类型与评价内容

In [5]:
data.head() #可以去掉中性评价

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [6]:
data.airline_sentiment.unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [7]:
data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [8]:
#提取思路 negative 与positive相同数量

In [9]:
data_p = data[data.airline_sentiment == 'positive']

In [10]:
data_n = data[data.airline_sentiment == 'negative']

In [11]:
data_n = data_n.iloc[:len(data_p)]

In [12]:
len(data_n),len(data_p)

(2363, 2363)

In [13]:
data = pd.concat([data_n, data_p])

In [14]:
data

Unnamed: 0,airline_sentiment,text
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
15,negative,@VirginAmerica SFO-PDX schedule is still MIA.
17,negative,@VirginAmerica I flew from NYC to SFO last we...
...,...,...
14623,positive,@AmericanAir Love the new planes for the JFK-L...
14625,positive,@AmericanAir Flight 236 was great. Fantastic c...
14628,positive,Thank you. “@AmericanAir: @jlhalldc Customer R...
14630,positive,@AmericanAir Thanks! He is.


In [15]:
data = data.sample(len(data)) #通过取值来乱序

In [16]:
data

Unnamed: 0,airline_sentiment,text
8420,positive,@JetBlue OH YEAH!!! great flight down to Mexic...
1199,negative,"@united JH, thanks so much for reaching out. I..."
7434,positive,@JetBlue oh. And thank you for responding
2328,negative,@united call wait times are over 20 minutes an...
833,positive,@united Made the upgrade list. Will fly 1st to...
...,...,...
2903,negative,@united I get that. But doing it by giving ina...
7294,positive,@JetBlue awesome
7731,positive,@JetBlue thank you!
741,negative,@united The agent that met us at the gate said...


In [17]:
data['review'] = (data.airline_sentiment == 'positive').astype('int')

In [18]:
del data['airline_sentiment']

tf.keras.layers.Embedding #把文本向量化

In [19]:
import re

In [20]:
token = re.compile('[A-Za-z]+ | [!?,.()]') #只要这些符号

In [21]:
def reg_text(text):
    new_text = token.findall(text) 
    new_text = [word.lower() for word in new_text] #全部变为小写
    return new_text

In [22]:
data['text'] = data.text.apply(reg_text)

In [23]:
data

Unnamed: 0,text,review
8420,"[jetblue , oh , great , flight , down , to , m...",1
1199,"[united , thanks , so , much , for , reaching ...",0
7434,"[jetblue , and , thank , you , for ]",1
2328,"[united , call , wait , times , are , over , m...",0
833,"[united , made , the , upgrade , will , fly , ...",1
...,...,...
2903,"[united , i , get , but , doing , it , by , gi...",0
7294,[jetblue ],1
7731,"[jetblue , thank ]",1
741,"[united , the , agent , that , met , us , at ,...",0


In [24]:
#将英文单词变成整数

In [25]:
word_set = set() #set会把重复值去掉
for text in data.text:
    for word in text:
        word_set.add(word)

In [45]:
max_word = len(word_set) + 1
max_word

5323

In [28]:
word_list = list(word_set)

In [33]:
word_index = dict((word, word_list.index(word)+1) for word in word_list)

In [34]:
word_index

{'choosing ': 1,
 'group ': 2,
 'custs ': 3,
 'alison ': 4,
 'resolution ': 5,
 'screensand ': 6,
 'giants ': 7,
 'problem ': 8,
 'departure ': 9,
 'please ': 10,
 'virginatlantic ': 11,
 'pre ': 12,
 'hostage ': 13,
 'costing ': 14,
 'conf ': 15,
 'copy ': 16,
 'transatlantic ': 17,
 'access ': 18,
 'members ': 19,
 'best ': 20,
 'daydreaming ': 21,
 'exception ': 22,
 'aboard ': 23,
 'faint ': 24,
 'formed ': 25,
 'anna ': 26,
 'thecandacesmith ': 27,
 'govt ': 28,
 'provided ': 29,
 'streamline ': 30,
 'lame ': 31,
 'thks ': 32,
 'move ': 33,
 'vs ': 34,
 'condo ': 35,
 'wht ': 36,
 'baseball ': 37,
 'junction ': 38,
 'columbus ': 39,
 'lowered ': 40,
 'tuned ': 41,
 'memory ': 42,
 'kiosk ': 43,
 'personally ': 44,
 'appropriate ': 45,
 'child ': 46,
 'following ': 47,
 'jmercadomma ': 48,
 'fare ': 49,
 'agree ': 50,
 'sent ': 51,
 'knew ': 52,
 'qc ': 53,
 'rosetta ': 54,
 'lovejetblue ': 55,
 'airport ': 56,
 'pray ': 57,
 'painless ': 58,
 'led ': 59,
 'evening ': 60,
 'build '

In [37]:
data_ok = data.text.apply(lambda x:[word_index.get(word, 0) for word in x])

In [47]:
data_ok

8420     [4004, 4114, 4078, 4790, 2309, 4583, 2131, 864...
1199     [4259, 4430, 158, 2130, 588, 4711, 3379, 651, ...
7434                          [4004, 5023, 3392, 756, 588]
2328     [4259, 1354, 81, 3122, 1366, 3373, 4612, 5023,...
833      [4259, 5025, 532, 3224, 2385, 4501, 1815, 850,...
                               ...                        
2903     [4259, 3379, 345, 761, 3534, 613, 652, 4057, 4...
7294                                                [4004]
7731                                          [4004, 3392]
741      [4259, 532, 3247, 303, 70, 963, 3873, 532, 203...
10835    [3753, 3379, 1443, 1458, 5023, 2823, 3235, 458...
Name: text, Length: 4726, dtype: object

In [42]:
len(data_ok.iloc[1])

15

In [46]:
maxlen = max(len(x) for x in data_ok)
maxlen

29

In [48]:
data_ok = keras.preprocessing.sequence.pad_sequences(data_ok.values, maxlen=maxlen) #将每一条评论填充到maxlen

In [49]:
data_ok.shape

(4726, 29)

In [50]:
data.review.values

array([1, 0, 1, ..., 1, 0, 1])

In [51]:
model = keras.Sequential() #顺序模型

In [52]:
#Embeding:把文本映射为一个密集向量，另一个思路：每一个字符变为独热编码

In [53]:
model.add(layers.Embedding(max_word, 50, input_length=maxlen))

In [54]:
model.add(layers.LSTM(64)) #64 包含多少个隐藏单元

In [55]:
model.add(layers.Dense(1, activation='sigmoid'))

In [56]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 29, 50)            266150    
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 295,655
Trainable params: 295,655
Non-trainable params: 0
_________________________________________________________________


In [57]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc']
)

In [58]:
model.fit(data_ok, data.review.values, epochs=10, batch_size=128, validation_split=0.2)#validation_split切分20%为测试数据

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f372b89308>