In [1]:
import pandas as pd
import time
import numpy as np
import json
import tensorflow as tf
import re
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
# Module-level tokenizer shared by the preprocessing cells below.
# reduce_len=True turns on NLTK's shortening of elongated character runs
# (see the TweetTokenizer documentation for the exact rule).
tweet_tokenizer = TweetTokenizer(reduce_len=True)

In [2]:
from keras.models import Model, Sequential
from keras.layers import Dense, Input, Embedding
from keras.layers import ReLU, Softmax, Dropout
from keras.layers import LSTM, Bidirectional
from keras.callbacks import Callback

In [4]:
import os
# Restrict CUDA-aware libraries in this process to GPU devices 0 and 1.
# NOTE(review): tensorflow is imported in the first cell, before this variable
# is set -- confirm the restriction is still honoured when assigned this late.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

## Data Preprocessing

In [5]:
json_file = list()
data = list()

# Read the raw dump: the file holds one JSON document per line.
# The encoding is stated explicitly so that non-ASCII tweet text (emoji etc.)
# is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('tweets_DM.json', 'r', encoding='utf-8') as file:
    for raw_line in file:
        # A blank line (e.g. a trailing newline at end of file) is not a JSON
        # document; skip it instead of letting json.loads raise.
        if not raw_line.strip():
            continue
        json_file.append(json.loads(raw_line))

# Keep only the fields used downstream, as [tweet_id, text, hashtags] rows.
for record in json_file:
    tweet = record['_source']['tweet']
    data.append([tweet['tweet_id'], tweet['text'], tweet['hashtags']])

In [6]:
# Turn the extracted [tweet_id, text, hashtags] rows into a DataFrame and
# preview the first five rows.
tweet_columns = ['tweet_id', 'text', 'hashtags']
data_df = pd.DataFrame(data=data, columns=tweet_columns)
data_df.iloc[:5]

Unnamed: 0,tweet_id,text,hashtags
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat]
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]"
2,0x28b412,"Confident of your obedience, I write to you, k...",[bibleverse]
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,[]
4,0x2de201,"""Trust is not the same as faith. A friend is s...",[]


In [7]:
# Load the two companion tables in one go:
#   emotion.csv             -> emo_df
#   data_identification.csv -> id_df
emo_df, id_df = (
    pd.read_csv(csv_name)
    for csv_name in ('emotion.csv', 'data_identification.csv')
)

In [8]:
# Split the tweets into the unlabeled test set and the labeled training set.
# Both joins are inner joins on the shared 'tweet_id' key.
is_test_row = id_df['identification'] == 'test'
test_df_1 = id_df[is_test_row]

# Tweets flagged as 'test' in the identification table.
public_test_df = data_df.merge(test_df_1, how='inner', on='tweet_id')
print(public_test_df)

# Tweets that have a row in the emotion table.
train_df = data_df.merge(emo_df, how='inner', on='tweet_id')
print(train_df)

        tweet_id                                               text  \
0       0x28b412  Confident of your obedience, I write to you, k...   
1       0x2de201  "Trust is not the same as faith. A friend is s...   
2       0x218443  When do you have enough ? When are you satisfi...   
3       0x2939d5  God woke you up, now chase the day #GodsPlan #...   
4       0x26289a  In these tough times, who do YOU turn to as yo...   
...          ...                                                ...   
411967  0x2913b4  "For this is the message that ye heard from th...   
411968  0x2a980e  "There is a lad here, which hath five barley l...   
411969  0x316b80  When you buy the last 2 tickets remaining for ...   
411970  0x29d0cb  I swear all this hard work gone pay off one da...   
411971  0x2a6a4f  @Parcel2Go no card left when I wasn't in so I ...   

                                 hashtags identification  
0                            [bibleverse]           test  
1                            

In [10]:
def df_preprocess(df, tokenizer=None):
    """Clean and tokenize the ``text`` column of ``df`` in place.

    Two columns are written on the frame that is passed in:

    * ``text``   -- the original text with every ``<LH>`` marker removed.
    * ``p_text`` -- the cleaned text split into tokens and re-joined with
      single spaces.

    No other column is touched; in particular no scratch column is created,
    so an existing column named ``tmp`` is left intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings. It is modified in place.
    tokenizer : object with a ``tokenize(str) -> list[str]`` method, optional
        Tokenizer to use. Defaults to the notebook-level ``tweet_tokenizer``.

    Returns
    -------
    None
        Callers rely on the in-place modification of ``df``.
    """
    # Resolve the default lazily so the global is only needed when no
    # tokenizer is supplied.
    active_tokenizer = tweet_tokenizer if tokenizer is None else tokenizer

    # remove <LH> text
    df['text'] = df['text'].apply(lambda s: s.replace('<LH>', ''))

    # Tokenize and re-join in a single pass; no intermediate column needed.
    df['p_text'] = df['text'].apply(
        lambda s: ' '.join(active_tokenizer.tokenize(s))
    )
    

In [11]:
# Adds the 'p_text' column to train_df in place, then previews the result.
# NOTE(review): the saved output below shows mentions rendered as 'useruser'
# and emoji spelled out as words, which df_preprocess as defined in this
# notebook does not do -- the output looks stale; re-run to refresh it.
df_preprocess(train_df)
train_df.head()

Unnamed: 0,tweet_id,text,hashtags,emotion,p_text
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat],anticipation,"People who post "" add me on #Snapchat "" must b..."
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]",sadness,"useruser As we see , Trump is dangerous to #fr..."
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂,[],fear,Now ISSA is stalking Tasha face with tears of ...
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,"[authentic, LaughOutLoud]",joy,useruser useruser Thx for the BEST TIME tonigh...
4,0x2c91a8,Still waiting on those supplies Liscus.,[],anticipation,Still waiting on those supplies Liscus .


In [12]:
# Apply the same in-place preprocessing to the unlabeled test tweets so they
# get a 'p_text' column as well, then preview the result.
df_preprocess(public_test_df)
public_test_df.head()

Unnamed: 0,tweet_id,text,hashtags,identification,p_text
0,0x28b412,"Confident of your obedience, I write to you, k...",[bibleverse],test,"Confident of your obedience , I write to you ,..."
1,0x2de201,"""Trust is not the same as faith. A friend is s...",[],test,""" Trust is not the same as faith . A friend is..."
2,0x218443,When do you have enough ? When are you satisfi...,"[materialism, money, possessions]",test,When do you have enough ? When are you satisfi...
3,0x2939d5,"God woke you up, now chase the day #GodsPlan #...","[GodsPlan, GodsWork]",test,"God woke you up , now chase the day #GodsPlan ..."
4,0x26289a,"In these tough times, who do YOU turn to as yo...",[],test,"In these tough times , who do YOU turn to as y..."


In [13]:
# Model input: the tokenized, space-joined tweet text.
x_list=train_df['p_text']
# Target: the emotion label of each tweet.
y_list=train_df['emotion']

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labeled tweets for local evaluation; the fixed
# random_state makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x_list,
    y_list,
    random_state=42,
    test_size=0.2,
)

# Sanity check: texts and labels of each split have matching lengths.
print(f'{len(train_x)} {len(train_y)}')
print(f'{len(test_x)} {len(test_y)}')

1164450 1164450
291113 291113


In [15]:
# Put the training split's texts and labels back into one frame; the original
# row index of each Series is preserved.
train_df1 = pd.DataFrame({"p_text": train_x, "emotion": train_y})
print(train_df1)

                                                    p_text  emotion
834097   For those of you who have followed me , thanks...      joy
355739   When you have to take a day off from work in o...  sadness
625638   Wherever you are ; be all there . That ’ s how...      joy
678647   useruser Would be nice for all the PSA players...      joy
441397   69 The moments in your life are only once #Lif...    trust
...                                                    ...      ...
259178   And above all these things put on charity , wh...      joy
1414414  useruser Still a devastating poll for Trump .....  sadness
131932                             useruser Ohhh , tears !      joy
671155   Sponsor shirts are completed and Spirit Gear h...      joy
121958   Their behavior is due to a lack of love . We n...      joy

[1164450 rows x 2 columns]


In [16]:
# Put the held-out split's texts and labels back into one frame; the original
# row index of each Series is preserved.
test_df1 = pd.DataFrame({"p_text": test_x, "emotion": test_y})
print(test_df1)

                                                    p_text   emotion
970345   Been a #week now #since I my #Mom . I #miss he...  surprise
1145883  Follow our Librarian , Ms . Bird bird for more...       joy
468264   Wonder if the guys who skate in Foxboro over t...       joy
949718   useruser Bloody puts it mildly , will be posti...       joy
982592   Beat the Dolphins next week and we are back to...       joy
...                                                    ...       ...
752756   Cars in the shop and apparently wings and beer...     trust
1096194  I have been awake since just before 5am . Babi...       joy
105639   I worked hard to save the situation but it fai...   sadness
842760   and must bear and kept spontaneously Always no...       joy
1425856                                Nobody texting back   disgust

[291113 rows x 2 columns]


## Method 1: Naive Bayes

### Train a Naive Bayes classifier as the baseline

In [17]:
# Upper bound on the vocabulary size passed to the bag-of-words vectorizer.
MAX_FEATURES = 50000
# Alias, not a copy: `df` and `train_df1` refer to the same DataFrame object.
df = train_df1

In [18]:
# Build bag-of-words count features from the training split and time the fit.
# NOTE(review): TfidfVectorizer and stopwords are imported here but not used
# in any cell visible in this section.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import nltk
from nltk.corpus import stopwords

start = time.time()

# Unigram + bigram counts, capped at MAX_FEATURES terms. The already
# space-joined 'p_text' is tokenized again here with nltk.word_tokenize.
BOW = CountVectorizer(max_features=MAX_FEATURES, 
                             tokenizer=nltk.word_tokenize, 
                             ngram_range=(1,2))

# Learn the vocabulary on the training texts and build their count matrix.
BOW_f = BOW.fit_transform(df['p_text'])

print(f'time : {time.time() - start} sec')
print(BOW_f.shape)

time : 292.5648500919342 sec
(1164450, 50000)


In [19]:
# 10-fold cross-validated macro-F1 of a multinomial Naive Bayes classifier
# on the bag-of-words features, with wall-clock timing.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

start = time.time()
X, y = BOW_f, df['emotion']

clf = MultinomialNB()
fold_scores = cross_val_score(clf, X, y, cv=10, scoring='f1_macro')
print(fold_scores.mean())
print(f'time : {time.time() - start} sec')

0.43761596620999355
time : 48.38650584220886 sec


In [20]:
# Fit the classifier on the full training split (cross_val_score above does
# not leave `clf` fitted).
clf.fit(X, y)
# Despite the name, this is the labeled hold-out split (test_df1), vectorized
# with the vocabulary learned on the training split.
X_private_test = BOW.transform(test_df1['p_text'])
y_pred = clf.predict(X_private_test)
# Ground-truth labels of the hold-out split, for the report below.
y_true = test_df1['emotion']

In [21]:
# Use Naive Bayes as the baseline: per-class precision / recall / F1 on the
# hold-out split.
# NOTE(review): f1_score is imported but not used in this cell.
from sklearn.metrics import f1_score, classification_report
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

       anger       0.32      0.33      0.33      7946
anticipation       0.59      0.54      0.56     49984
     disgust       0.35      0.55      0.43     27669
        fear       0.32      0.49      0.38     12846
         joy       0.69      0.52      0.59    102943
     sadness       0.40      0.50      0.45     38745
    surprise       0.32      0.29      0.30      9816
       trust       0.45      0.43      0.44     41164

    accuracy                           0.50    291113
   macro avg       0.43      0.46      0.44    291113
weighted avg       0.53      0.50      0.51    291113



In [22]:
# Peek at the first five predicted labels for the hold-out split.
y_pred[:5]

array(['joy', 'trust', 'sadness', 'fear', 'trust'], dtype='<U12')

### Predict on public_test_df

In [23]:
# Vectorize the unlabeled test tweets with the vocabulary fitted on the
# training split, then predict an emotion for each tweet.
X_private_test1 = BOW.transform(public_test_df['p_text'])
y_pred1 = clf.predict(X_private_test1)

In [24]:
# Attach the predictions to the test frame (row order matches the transform above).
public_test_df['predict'] = y_pred1

In [25]:
# Build the submission table: keep the tweet id and the predicted label and
# rename them to the 'id' / 'emotion' headers.
output_df = (
    public_test_df[['tweet_id', 'predict']]
    .rename(columns={'tweet_id': 'id', 'predict': 'emotion'})
)

In [26]:
# Write the submission file with a header row and without the DataFrame index.
# NOTE(review): the file name 'Naiyve.csv' looks like a typo for "Naive";
# left unchanged in case something else expects this exact name.
output_df.to_csv('Naiyve.csv', index=False, header=True)