In [21]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [2]:
data = pd.read_csv('hm_train.csv')
data.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


### Data Cleaning

In [3]:
# Remove stop words
stop = stopwords.words('english')
data['hm_without_stopwords'] = data['cleaned_hm'].apply(lambda x: ' '.join(
        [word for word in x.split() if word not in (stop)]))

data.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category,hm_without_stopwords
0,27673,24h,I went on a successful date with someone I fel...,1,affection,I went successful date someone I felt sympathy...
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection,I happy son got 90% marks examination
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise,I went gym morning yoga.
3,27676,24h,We had a serious talk with some friends of our...,2,bonding,We serious talk friends flaky lately. They und...
4,27677,24h,I went with grandchildren to butterfly display...,1,affection,I went grandchildren butterfly display Crohn C...


In [4]:
# Filter out numeric characters
data['hm_without_stopwords'] = data['hm_without_stopwords'].apply(lambda x: ' '.join(
        [word for word in x.split() if word.isalpha()]))

data.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category,hm_without_stopwords
0,27673,24h,I went on a successful date with someone I fel...,1,affection,I went successful date someone I felt sympathy...
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection,I happy son got marks examination
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise,I went gym morning
3,27676,24h,We had a serious talk with some friends of our...,2,bonding,We serious talk friends flaky They understood ...
4,27677,24h,I went with grandchildren to butterfly display...,1,affection,I went grandchildren butterfly display Crohn C...


In [5]:
# Stemming
porter = PorterStemmer()
data['hm_without_stopwords'] = data['hm_without_stopwords'].apply(lambda x: ' '.join(
        [porter.stem(word) for word in x.split()]))
data.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category,hm_without_stopwords
0,27673,24h,I went on a successful date with someone I fel...,1,affection,I went success date someon I felt sympathi con...
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection,I happi son got mark examin
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise,I went gym morn
3,27676,24h,We had a serious talk with some friends of our...,2,bonding,We seriou talk friend flaki They understood go...
4,27677,24h,I went with grandchildren to butterfly display...,1,affection,I went grandchildren butterfli display Crohn C...


### Count Vectorizer

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data.hm_without_stopwords)

In [7]:
X_train_counts.shape

(60321, 11937)

### TF-IDF

In [8]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(60321, 11937)

In [9]:
y = data['predicted_category']

### Training the classifier

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tf, y, test_size=0.20, random_state=18)

In [18]:
X_train.shape

(48256, 11937)

In [19]:
clf = MultinomialNB().fit(X_train, y_train)

In [20]:
predicted = clf.predict(X_test)

In [23]:
accuracy_score(y_test, predicted)

0.70907583920430994