In [17]:
#DATASET LINK
#https://github.com/megagonlabs/HappyDB

In [18]:
#LIBRARY 
import numpy as np 
import pandas as pd
import gensim
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [19]:
#RNN is good for sequence learning 

In [20]:
data = pd.read_csv("cleaned_hm.csv")
data.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


In [21]:
#Do categories wise value count 
data['predicted_category'].value_counts()

affection           34168
achievement         33993
enjoy_the_moment    11144
bonding             10727
leisure              7458
nature               1843
exercise             1202
Name: predicted_category, dtype: int64

In [22]:
# we have 7 difefrent class which designate cause of happiness 
#for a given happy moment we need to find the cause of happiness 



In [23]:
#remove the column with more number of sentence with less counts 
data['num_sentence'].value_counts()

1     83711
2      9542
3      3847
4      1624
5       821
6       336
7       183
8       107
10       68
9        61
11       35
13       26
12       21
16       17
18       17
14       14
17       14
19       12
21       10
25        7
15        7
23        7
24        5
26        5
22        4
29        3
31        3
30        3
20        3
27        2
32        2
37        2
40        2
56        1
46        1
53        1
51        1
48        1
69        1
35        1
45        1
44        1
42        1
58        1
34        1
28        1
60        1
Name: num_sentence, dtype: int64

In [24]:
# deleting happy moments with more than 10 sentences
mod_data = data.loc[data['num_sentence'] <= 10]
mod_data["predicted_category"].value_counts()

affection           34020
achievement         33966
enjoy_the_moment    11115
bonding             10700
leisure              7458
nature               1839
exercise             1202
Name: predicted_category, dtype: int64

In [25]:

## categorical to numerical
encode = {
    "affection" : 0,
    "achievement"  : 1,       
    "bonding" : 2,    
    "enjoy_the_moment" : 3,     
    "leisure"  : 4,    
    "nature" : 5,    
    "exercise" : 6
}
mod_data["predicted_category"] = mod_data["predicted_category"].apply(lambda x: encode[x])
mod_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,0
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,0
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,6
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,2
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,0


In [26]:
#text cleaning for NLP

import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [27]:

def clean_text(df):
    all_reviews = list()
    lines = df["cleaned_hm"].values.tolist()
    for text in lines:
        text = text.lower()
        text = re.sub('[^a-zA-Z]', ' ', text)
        words=nltk.word_tokenize(text)
        stop_words= set(stopwords.words('english'))
        stop_words.discard("not")
        words_without_stop_words=[word for word in words if word not in stop_words]
        #words=[lemmatizer.lemmatize(word) for word in words_without_stop_words ]
        words=[ps.stem(word) for word in words_without_stop_words ]
        words = ' '.join(words)
        all_reviews.append(words)
    return all_reviews



In [28]:
happy_lines = clean_text(mod_data)
happy_lines[0:5]  

['went success date someon felt sympathi connect',
 'happi son got mark examin',
 'went gym morn yoga',
 'seriou talk friend flaki late understood good even hang',
 'went grandchildren butterfli display crohn conservatori']

In [32]:
print("Now we have no of happy moments as features :",len(happy_lines))
print("Number of target label", len(mod_data['predicted_category']))

Now we have no of happy moments as features : 100300
Number of target label 100300


In [33]:
#here we can see every lines have different length 
#to process the same in neural network we need to do below 3 things 

#a.represent text in numerical format 
#b.make all setence of fixed length to process in neural network if not present than pad 0 pre or post 
#c.

In [34]:
#number of words in each review/ happymoments/sentence=55
validation_split = 0.2
max_length = 55
#tokenize te sentences and represent them in numbers other way is to todo one hot 
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(happy_lines)
sequences = tokenizer_obj.texts_to_sequences(happy_lines)
sequences

[[8, 210, 196, 205, 52, 6718, 762],
 [1, 31, 2, 597, 1006],
 [8, 307, 55, 878],
 [1464, 68, 4, 9874, 340, 2021, 12, 95, 383],
 [8, 1642, 2396, 2161, 9875, 9876],
 [1209, 10, 34],
 [3, 9, 569, 7802, 1116, 43, 4292],
 [2, 112, 1751, 96, 14, 88],
 [32, 107, 46, 33],
 [22, 1752, 1452, 175, 3086, 268],
 [43, 1329, 127, 94, 1753, 130, 42],
 [93, 431, 164, 170, 392, 29, 28, 1403],
 [8, 79, 4, 100],
 [540, 1049, 3, 1050],
 [318,
  227,
  1018,
  2128,
  13,
  1799,
  1,
  77,
  174,
  1851,
  1799,
  28,
  143,
  984,
  1,
  429,
  81,
  39,
  529,
  222,
  984,
  5074,
  308,
  12],
 [31, 332, 70, 299, 350],
 [69, 2129, 253, 71, 1754],
 [220, 11, 27, 2162, 42, 26],
 [437, 406, 176, 4, 10, 34, 3, 6],
 [9, 7803, 43, 248, 3495, 1276, 3390, 1181, 7804, 29, 491, 1],
 [2, 140],
 [83, 94, 754, 55, 39, 24, 1, 6719, 19, 236],
 [359, 3, 18, 43, 231, 434, 231, 70],
 [73,
  909,
  121,
  308,
  1283,
  1,
  1283,
  2050,
  3087,
  1491,
  2261,
  1,
  97,
  3001,
  1621,
  121,
  1533,
  2540],
 [581, 59

In [35]:
word_index = tokenizer_obj.word_index
print("unique tokens - "+str(len(word_index)))
vocab_size = len(tokenizer_obj.word_index) + 1
print('vocab_size - '+str(vocab_size))

unique tokens - 15862
vocab_size - 15863


In [54]:
word_index

{'happi': 1,
 'got': 2,
 'made': 3,
 'friend': 4,
 'work': 5,
 'day': 6,
 'time': 7,
 'went': 8,
 'new': 9,
 'last': 10,
 'get': 11,
 'good': 12,
 'go': 13,
 'realli': 14,
 'month': 15,
 'one': 16,
 'abl': 17,
 'today': 18,
 'famili': 19,
 'see': 20,
 'first': 21,
 'watch': 22,
 'week': 23,
 'year': 24,
 'home': 25,
 'play': 26,
 'final': 27,
 'feel': 28,
 'make': 29,
 'found': 30,
 'son': 31,
 'yesterday': 32,
 'enjoy': 33,
 'night': 34,
 'daughter': 35,
 'dinner': 36,
 'great': 37,
 'moment': 38,
 'long': 39,
 'not': 40,
 'job': 41,
 'game': 42,
 'came': 43,
 'old': 44,
 'nice': 45,
 'birthday': 46,
 'life': 47,
 'bought': 48,
 'favorit': 49,
 'love': 50,
 'like': 51,
 'felt': 52,
 'wife': 53,
 'husband': 54,
 'morn': 55,
 'receiv': 56,
 'dog': 57,
 'lot': 58,
 'start': 59,
 'took': 60,
 'event': 61,
 'back': 62,
 'school': 63,
 'hous': 64,
 'hour': 65,
 'visit': 66,
 'want': 67,
 'talk': 68,
 'help': 69,
 'well': 70,
 'car': 71,
 'finish': 72,
 'two': 73,
 'girlfriend': 74,
 'ago': 

In [37]:
#do padding 
lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
lines_pad

array([[  8, 210, 196, ...,   0,   0,   0],
       [  1,  31,   2, ...,   0,   0,   0],
       [  8, 307,  55, ...,   0,   0,   0],
       ...,
       [563,  74,  10, ...,   0,   0,   0],
       [ 37, 136,  32, ...,   0,   0,   0],
       [ 37, 442,  10, ...,   0,   0,   0]])

In [41]:
print("Now we have no of happy moments as features :",len(lines_pad))
print("Number of target label", len(mod_data['predicted_category']))

Now we have no of happy moments as features : 100300
Number of target label 100300


In [48]:
X=lines_pad
y=mod_data['predicted_category'].values
type(y)

numpy.ndarray

In [50]:
#train test Split :
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('Shape of X_train_pad:', X_train.shape)
print('Shape of y_train:', y_train.shape)

print('Shape of X_test_pad:', X_test.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_train_pad: (80240, 55)
Shape of y_train: (80240,)
Shape of X_test_pad: (20060, 55)
Shape of y_test: (20060,)


#NOW WE WILL DO WORD2 VEC USING TRAINED MODEL GoogleNews-vectors-negative300

In [51]:
word_index 

{'happi': 1,
 'got': 2,
 'made': 3,
 'friend': 4,
 'work': 5,
 'day': 6,
 'time': 7,
 'went': 8,
 'new': 9,
 'last': 10,
 'get': 11,
 'good': 12,
 'go': 13,
 'realli': 14,
 'month': 15,
 'one': 16,
 'abl': 17,
 'today': 18,
 'famili': 19,
 'see': 20,
 'first': 21,
 'watch': 22,
 'week': 23,
 'year': 24,
 'home': 25,
 'play': 26,
 'final': 27,
 'feel': 28,
 'make': 29,
 'found': 30,
 'son': 31,
 'yesterday': 32,
 'enjoy': 33,
 'night': 34,
 'daughter': 35,
 'dinner': 36,
 'great': 37,
 'moment': 38,
 'long': 39,
 'not': 40,
 'job': 41,
 'game': 42,
 'came': 43,
 'old': 44,
 'nice': 45,
 'birthday': 46,
 'life': 47,
 'bought': 48,
 'favorit': 49,
 'love': 50,
 'like': 51,
 'felt': 52,
 'wife': 53,
 'husband': 54,
 'morn': 55,
 'receiv': 56,
 'dog': 57,
 'lot': 58,
 'start': 59,
 'took': 60,
 'event': 61,
 'back': 62,
 'school': 63,
 'hous': 64,
 'hour': 65,
 'visit': 66,
 'want': 67,
 'talk': 68,
 'help': 69,
 'well': 70,
 'car': 71,
 'finish': 72,
 'two': 73,
 'girlfriend': 74,
 'ago': 

In [None]:
#we will use google word2vec pretrained model 
#here we dont need to train embedding layer as it is already trained we just need to fit that model to
#our requirement for better prediction 