In [1]:
import os
import sys

app_path = os.getcwd().rsplit(os.sep, 1)[0]

"""Add app path to sys.path for importing parent directory modules"""
if app_path not in sys.path:
    sys.path.insert(0, app_path)

# Note: Do NOT delete this cell

In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")
from string import punctuation
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from utils.paths import data_path
train_path = data_path('train.csv')
test_path = data_path('test.csv')

In [7]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [8]:
train.head()

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1


In [9]:
# Importing HTMLParser
from html.parser import HTMLParser
html_parser = HTMLParser()

In [10]:
train["tweet"]= train["tweet"].astype(str)
train.head()

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1


In [11]:
# Created a new columns i.e. clean_tweet contains the same tweets but cleaned version
train['clean_tweet'] = train['tweet'].apply(lambda x: html_parser.unescape(x))
train.head(10)

  


Unnamed: 0,tweet_id,tweet,sentiment,clean_tweet
0,1701,#sxswnui #sxsw #apple defining language of tou...,1,#sxswnui #sxsw #apple defining language of tou...
1,1851,Learning ab Google doodles! All doodles should...,1,Learning ab Google doodles! All doodles should...
2,2689,one of the most in-your-face ex. of stealing t...,2,one of the most in-your-face ex. of stealing t...
3,4525,This iPhone #SXSW app would b pretty awesome i...,0,This iPhone #SXSW app would b pretty awesome i...
4,3604,Line outside the Apple store in Austin waiting...,1,Line outside the Apple store in Austin waiting...
5,966,#technews One lone dude awaits iPad 2 at Apple...,1,#technews One lone dude awaits iPad 2 at Apple...
6,1395,"SXSW Tips, Prince, NPR Videos, Toy Shopping Wi...",1,"SXSW Tips, Prince, NPR Videos, Toy Shopping Wi..."
7,8182,NU user RT @mention New #UberSocial for #iPhon...,1,NU user RT @mention New #UberSocial for #iPhon...
8,8835,Free #SXSW sampler on iTunes {link} #FreeMusic,2,Free #SXSW sampler on iTunes {link} #FreeMusic
9,883,I think I might go all weekend without seeing ...,2,I think I might go all weekend without seeing ...


In [12]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

In [13]:
# remove twitter handles (@user)
train['clean_tweet'] = np.vectorize(remove_pattern)(train['tweet'], "@[\w]*")
train.head(10)

Unnamed: 0,tweet_id,tweet,sentiment,clean_tweet
0,1701,#sxswnui #sxsw #apple defining language of tou...,1,#sxswnui #sxsw #apple defining language of tou...
1,1851,Learning ab Google doodles! All doodles should...,1,Learning ab Google doodles! All doodles should...
2,2689,one of the most in-your-face ex. of stealing t...,2,one of the most in-your-face ex. of stealing t...
3,4525,This iPhone #SXSW app would b pretty awesome i...,0,This iPhone #SXSW app would b pretty awesome i...
4,3604,Line outside the Apple store in Austin waiting...,1,Line outside the Apple store in Austin waiting...
5,966,#technews One lone dude awaits iPad 2 at Apple...,1,#technews One lone dude awaits iPad 2 at Apple...
6,1395,"SXSW Tips, Prince, NPR Videos, Toy Shopping Wi...",1,"SXSW Tips, Prince, NPR Videos, Toy Shopping Wi..."
7,8182,NU user RT @mention New #UberSocial for #iPhon...,1,NU user RT New #UberSocial for #iPhone now in...
8,8835,Free #SXSW sampler on iTunes {link} #FreeMusic,2,Free #SXSW sampler on iTunes {link} #FreeMusic
9,883,I think I might go all weekend without seeing ...,2,I think I might go all weekend without seeing ...


In [14]:
train['clean_tweet'] = train['clean_tweet'].apply(lambda x: x.lower())
train.head(10)

Unnamed: 0,tweet_id,tweet,sentiment,clean_tweet
0,1701,#sxswnui #sxsw #apple defining language of tou...,1,#sxswnui #sxsw #apple defining language of tou...
1,1851,Learning ab Google doodles! All doodles should...,1,learning ab google doodles! all doodles should...
2,2689,one of the most in-your-face ex. of stealing t...,2,one of the most in-your-face ex. of stealing t...
3,4525,This iPhone #SXSW app would b pretty awesome i...,0,this iphone #sxsw app would b pretty awesome i...
4,3604,Line outside the Apple store in Austin waiting...,1,line outside the apple store in austin waiting...
5,966,#technews One lone dude awaits iPad 2 at Apple...,1,#technews one lone dude awaits ipad 2 at apple...
6,1395,"SXSW Tips, Prince, NPR Videos, Toy Shopping Wi...",1,"sxsw tips, prince, npr videos, toy shopping wi..."
7,8182,NU user RT @mention New #UberSocial for #iPhon...,1,nu user rt new #ubersocial for #iphone now in...
8,8835,Free #SXSW sampler on iTunes {link} #FreeMusic,2,free #sxsw sampler on itunes {link} #freemusic
9,883,I think I might go all weekend without seeing ...,2,i think i might go all weekend without seeing ...


In [15]:
stop_words = list(set(stopwords.words('english')))+list(punctuation)+['``', "'s", "...", "n't"]

# tokenize
train['tokenized_tweet'] = [nltk.word_tokenize(x) for x in train['clean_tweet']]

# stopword removal
train['tokenized_tweet'] = train['tokenized_tweet'].apply(lambda row: [word for word in row if word not in stop_words])

# stemming words
stemmer = PorterStemmer()
train['tokenized_tweet'] = train['tokenized_tweet'].apply(lambda x: [stemmer.stem(i) for i in x])
train['tokenized_tweet'] = train['tokenized_tweet'].apply(lambda x: ' '.join(x))
train.head()

Unnamed: 0,tweet_id,tweet,sentiment,clean_tweet,tokenized_tweet
0,1701,#sxswnui #sxsw #apple defining language of tou...,1,#sxswnui #sxsw #apple defining language of tou...,sxswnui sxsw appl defin languag touch differ d...
1,1851,Learning ab Google doodles! All doodles should...,1,learning ab google doodles! all doodles should...,learn ab googl doodl doodl light funni amp inn...
2,2689,one of the most in-your-face ex. of stealing t...,2,one of the most in-your-face ex. of stealing t...,one in-your-fac ex steal show yr rt quot sxsw ...
3,4525,This iPhone #SXSW app would b pretty awesome i...,0,this iphone #sxsw app would b pretty awesome i...,iphon sxsw app would b pretti awesom crash eve...
4,3604,Line outside the Apple store in Austin waiting...,1,line outside the apple store in austin waiting...,line outsid appl store austin wait new ipad sx...
