## Processing data

Get data from csv files and process them


#### Find current path

In [1]:
import os
current_path = os.path.dirname(os.path.abspath(''))
current_path

'/Users/drchasekim/Documents/emotion-detection'

##### Get data directory path

In [2]:
data_path = os.path.join(current_path, "data/raw")
data_path

'/Users/drchasekim/Documents/emotion-detection/data/raw'

#### Read data from csv files

In [3]:
import pandas as pd

tweets_df = pd.read_csv(data_path+"/tweet_emotions.csv")
tweets_df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
text_emotion_df = pd.read_csv(data_path+"/Text_Emotion/train.csv")
text_emotion_df.head()

Unnamed: 0,text,emotion
0,carefully word blog posts amount criticism hea...,0
1,cannot remember little mermaid feeling carefre...,1
2,not feeling super well turns cold knocked next...,1
3,feel honored part group amazing talents,1
4,think helping also began feel pretty lonely lo...,0


##### Concat dataframes and drop na values

In [5]:
print(len(tweets_df) + len(text_emotion_df))

text_df = pd.concat([tweets_df.loc[:, "content"], text_emotion_df.loc[:, "text"]], ignore_index=True)

text_df.dropna(inplace=True)

text_df.head()



294539


0    @tiffanylue i know  i was listenin to bad habi...
1    Layin n bed with a headache  ughhhh...waitin o...
2                  Funeral ceremony...gloomy friday...
3                 wants to hang out with friends SOON!
4    @dannycastillo We want to trade with someone w...
dtype: object

#### Process text data

import nltk and download necessary data

In [6]:
import nltk
nltk.__version__

nltk.download("treebank")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/drchasekim/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

##### Lowercase and delete ppunctuation, url, hashtag or etc...

In [7]:
import re

text_df = text_df.apply(lambda x: x.lower())
text_df = text_df.apply(lambda x: re.sub(r"(?:\@|#|http?\://|https?\://|www)\S+", "", x))
text_df = text_df.apply(lambda x: re.sub(r'\W*\b\w{1}\b', "", x))
text_df = text_df.apply(lambda x: re.sub('([.,!?()])', r' \1 ', x))
text_df = text_df.apply(lambda x: re.sub(r'\s{2,}', ' ', x))

text_df.head()


0     know was listenin to bad habit earlier and st...
1    layin bed with headache ughhhh . . . waitin on...
2          funeral ceremony . . . gloomy friday . . . 
3               wants to hang out with friends soon ! 
4     we want to trade with someone who has houston...
dtype: object

##### Tokenize text

In [8]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text_df = text_df.apply(lambda x: tokenizer.tokenize(x))

text_df.head()

0    [know, was, listenin, to, bad, habit, earlier,...
1    [layin, bed, with, headache, ughhhh, ., ., ., ...
2    [funeral, ceremony, ., ., ., gloomy, friday, ....
3       [wants, to, hang, out, with, friends, soon, !]
4    [we, want, to, trade, with, someone, who, has,...
dtype: object

In [9]:
text_df.to_list()

[['know',
  'was',
  'listenin',
  'to',
  'bad',
  'habit',
  'earlier',
  'and',
  'started',
  'freakin',
  'at',
  'his',
  'part',
  '=',
  '['],
 ['layin',
  'bed',
  'with',
  'headache',
  'ughhhh',
  '.',
  '.',
  '.',
  'waitin',
  'on',
  'your',
  'call',
  '.',
  '.',
  '.'],
 ['funeral', 'ceremony', '.', '.', '.', 'gloomy', 'friday', '.', '.', '.'],
 ['wants', 'to', 'hang', 'out', 'with', 'friends', 'soon', '!'],
 ['we',
  'want',
  'to',
  'trade',
  'with',
  'someone',
  'who',
  'has',
  'houston',
  'tickets',
  ',',
  'but',
  'no',
  'one',
  'will',
  '.'],
 ['re-pinging',
  'why',
  'didn',
  'you',
  'go',
  'to',
  'prom',
  '?',
  'bc',
  'my',
  'bf',
  'didn',
  'like',
  'my',
  'friends'],
 ['should',
  'be',
  'sleep',
  ',',
  'but',
  'im',
  'not',
  '!',
  'thinking',
  'about',
  'an',
  'old',
  'friend',
  'who',
  'want',
  '.',
  'but',
  'he',
  'married',
  'now',
  '.',
  'damn',
  ',',
  '&',
  'amp',
  ';',
  'he',
  'wants',
  'me',
  '!',


#### Train fastText

In [10]:
from gensim.models.fasttext import FastText


model = FastText(sentences=text_df.to_list(), vector_size=128)

model.train(corpus_iterable=text_df.to_list(), epochs=model.epochs, total_examples=model.corpus_count, total_words=model.corpus_total_words)

(12336665, 14729215)

In [11]:
from pprint import pprint

ft = model.wv

pprint(ft.similarity("sad", "happy"))
pprint(ft.most_similar("happy"))

0.3841868
[('happy-day', 0.8413798213005066),
 ('sappy', 0.7685059905052185),
 ('happily', 0.737735390663147),
 ('happier', 0.6955354809761047),
 ('happiest', 0.6740206480026245),
 ('lappy', 0.6732267141342163),
 ('snappy', 0.6727899312973022),
 ('nappy', 0.671665608882904),
 ('unhappy', 0.6370724439620972),
 ('peppy', 0.5859425067901611)]


In [12]:
print(f"{'fires' in ft.key_to_index}")

pprint(ft.most_similar("fires"))

True
[('wires', 0.8612368702888489),
 ('fireflies', 0.8237053751945496),
 ('es', 0.822934627532959),
 ('ropes', 0.821685791015625),
 ('pores', 0.8034000396728516),
 ('claires', 0.7939077615737915),
 ('vampires', 0.788116991519928),
 ('fixes', 0.7749882936477661),
 ('ales', 0.7745190858840942),
 ('poles', 0.7737336158752441)]
