In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import WordPunctTokenizer

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from replacers import RegexReplacer

In [37]:
# remove HTML tag pattern
rc = re.compile(r"\<.*?\>")  
# Replacer class
replacer = RegexReplacer()
# split sentence into word
pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
tokenizer = RegexpTokenizer(pattern)
# Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [40]:
LabelTrainDataPath = "data/2/labeledTrainData.tsv"
unLabelTrainDataPath = "data/2/unlabeledTrainData.tsv"
testDataPath = "data/2/testData.tsv"
LabelTrainDataFrame = pd.read_csv(LabelTrainDataPath, sep='\t',quoting=3)

In [42]:
LabelTrainDataFrame.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [43]:
def SplitPhase(row):
    """ split paragraph to sentence """
    return PunktTokenizer.tokenize(row['review'])

In [44]:
def RemoveHTML(row):
    """ remove HTML tags """
    return [rc.sub('',sentence) for sentence in row['review']]

In [45]:
def ReplaceAbbre(row):
    """ Replace abbreviation """
    return [replacer.replace(sentence) for sentence in row['review']]

In [46]:
def SplitSent(row):
    """ split sentence to words """
    return [tokenizer.tokenize(sentence) for sentence in row['review']]

In [47]:
def lemma(tags):
    WORD = []
    for word, tag in tags:
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v','n','s'] else None
        if not wntag:
            lemma = word
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, wntag)
        WORD.append(lemma)
    return WORD

In [48]:
def Lemmatizer(row):
    """ Lemmatizer words use WordNet """
    return [ lemma(nltk.pos_tag(sentence)) for sentence in row['review']]

In [49]:
def CleanWords(row):
    res = []
    stops = set(stopwords.words("english"))
    for sentence in row['review']:
        res.append([])
        for word in sentence:
            if len(word) >= 3 and word.isalpha() and word not in stops:
                res[-1].append(word.lower())
    return res

In [50]:
def ToStr(row):
    str=""
    for sentence in row['review']:
        for word in sentence:
            str += (word + " ")
    return str[:-1]

In [51]:
PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(SplitPhase, axis=1)

In [52]:
LabelTrainDataFrame.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"[""With all this stuff going down at the moment..."
1,"""2381_9""",1,"[""\""The Classic War of the Worlds\"" by Timothy..."
2,"""7759_3""",0,"[""The film starts with a manager (Nicholas Bel..."
3,"""3630_4""",0,"[""It must be assumed that those who praised th..."
4,"""9495_8""",1,"[""Superbly trashy and wondrously unpretentious..."


# Remove HTMLtags

In [53]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(RemoveHTML, axis=1)

In [55]:
LabelTrainDataFrame.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"[""With all this stuff going down at the moment..."
1,"""2381_9""",1,"[""\""The Classic War of the Worlds\"" by Timothy..."
2,"""7759_3""",0,"[""The film starts with a manager (Nicholas Bel..."
3,"""3630_4""",0,"[""It must be assumed that those who praised th..."
4,"""9495_8""",1,"[""Superbly trashy and wondrously unpretentious..."


# replace abbreviation

In [56]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(ReplaceAbbre, axis=1)

In [57]:
LabelTrainDataFrame.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"[""With all this stuff going down at the moment..."
1,"""2381_9""",1,"[""\""The Classic War of the Worlds\"" by Timothy..."
2,"""7759_3""",0,"[""The film starts with a manager (Nicholas Bel..."
3,"""3630_4""",0,"[""It must be assumed that those who praised th..."
4,"""9495_8""",1,"[""Superbly trashy and wondrously unpretentious..."


# split into words

In [58]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(SplitSent, axis=1)

In [59]:
LabelTrainDataFrame.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"[["", With, all, this, stuff, going, down, at, ..."
1,"""2381_9""",1,"[["", \, "", The, Classic, War, of, the, Worlds,..."
2,"""7759_3""",0,"[["", The, film, starts, with, a, manager, (, N..."
3,"""3630_4""",0,"[["", It, must, be, assumed, that, those, who, ..."
4,"""9495_8""",1,"[["", Superbly, trashy, and, wondrously, unpret..."


# Lemmatizer

In [None]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(Lemmatizer, axis=1)

In [None]:
LabelTrainDataFrame.head()

# Clean words,  remove stop words

In [28]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(CleanWords, axis=1)

In [29]:
LabelTrainDataFrame.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"[[stuff, moment, start, listen, music, watch, ..."
1,"""2381_9""",1,"[[classic, war, worlds, timothy, hines, entert..."
2,"""7759_3""",0,"[[film, start, manager, nicholas, bell, give, ..."
3,"""3630_4""",0,"[[must, assume, praise, film, greatest, filmed..."
4,"""9495_8""",1,"[[superbly, trashy, wondrously, unpretentious,..."


# convert to str

In [32]:
LabelTrainDataFrame['review']=LabelTrainDataFrame.apply(ToStr, axis=1)

In [33]:
LabelTrainDataFrame.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,stuff moment start listen music watch odd docu...
1,"""2381_9""",1,classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,film start manager nicholas bell give welcome ...
3,"""3630_4""",0,must assume praise film greatest filmed opera ...
4,"""9495_8""",1,superbly trashy wondrously unpretentious explo...


In [34]:
LabelTrainDataFrame.to_csv("data/2/labeled_train_data.csv", index=False)

In [35]:
LabelTrainDataFrame

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,stuff moment start listen music watch odd docu...
1,"""2381_9""",1,classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,film start manager nicholas bell give welcome ...
3,"""3630_4""",0,must assume praise film greatest filmed opera ...
4,"""9495_8""",1,superbly trashy wondrously unpretentious explo...
5,"""8196_8""",1,know people think bad movie get pretty good pl...
6,"""7166_2""",0,movie could good come way short cheesy special...
7,"""10633_1""",0,watch video friend house glad waste money buy ...
8,"""319_1""",0,friend mine bought film even grossly overprice...
9,"""8713_10""",1,movie full reference like mad max wild one man...
