In [13]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import re
# Fetch the NLTK resources used by this notebook: the stop-word corpus read via
# stopwords.words(...) and the 'punkt' tokenizer data.
# NOTE(review): word_tokenize is imported but not called in the cells shown here;
# presumably 'punkt' is only needed for it — confirm it is still required.
nltk.download('stopwords')
nltk.download('punkt')

In [2]:
# Read the tab-separated train and test splits from the dataset/ folder.
train, test = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("dataset/train.tsv", "dataset/test.tsv")
)

In [3]:
# Preview the first five rows of the raw training data.
train.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
# Column dtypes, non-null counts and memory footprint of the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


# transform
word_count = number of words in the original phrase
new_phrase 
    - lower-case the phrase
    - remove special characters (e.g. `,` `.`)
    - remove stopwords (e.g. i, we, a, and, the)
new_word_count = number of words left after removing stopwords
tokenize_phrase = list of words from new_phrase (not produced by the `transform` function below)

In [5]:
# English stop words from the NLTK corpus, held in a set so the membership
# test inside transform() is a constant-time lookup.
stopword = set(stopwords.words('english'))

def transform(df, stop_words=None):
    """Return a copy of ``df`` with cleaned-phrase and word-count columns added.

    The input frame is left untouched; all changes are made on a copy.

    Columns written on the returned frame:
      * ``word_count``     - whitespace-separated token count of the original phrase.
      * ``Phrase``         - the original phrase, lower-cased (missing values become "").
      * ``new_phrase``     - lower-cased phrase with every character outside
                             ``[A-Za-z0-9]`` replaced by a space and stop words
                             removed; remaining words are joined by single spaces.
      * ``new_word_count`` - number of words in ``new_phrase``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Phrase`` column.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the module-level ``stopword`` set.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.

    Raises
    ------
    KeyError
        If ``df`` has no ``Phrase`` column.
    """
    if stop_words is None:
        # Resolved at call time so the notebook-level set is picked up by default.
        stop_words = stopword

    non_alnum = re.compile(r'[^A-Za-z0-9]')

    def clean(text):
        # Punctuation becomes a space first so that tokens glued to it
        # (e.g. "goose,") are still matched against the stop-word set.
        words = non_alnum.sub(' ', text).split()
        return ' '.join(word for word in words if word not in stop_words)

    def count_words(text):
        return len(text.split())

    # Work on a copy: the caller's frame (and its original-case Phrase) stays intact.
    out = df.copy()
    # Missing phrases would otherwise raise AttributeError on .split()/.lower().
    phrases = out["Phrase"].fillna("").astype(str)

    out["word_count"] = phrases.apply(count_words)
    out["Phrase"] = phrases.str.lower()
    out["new_phrase"] = out["Phrase"].apply(clean)
    out["new_word_count"] = out["new_phrase"].apply(count_words)
    return out

In [6]:
# Add the word-count and stop-word-free phrase columns to both splits.
train, test = (transform(split) for split in (train, test))

In [7]:
# Preview the first ten rows, now including word_count, new_phrase and new_word_count.
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,word_count,new_phrase,new_word_count
0,1,1,a series of escapades demonstrating the adage ...,1,37,series escapades demonstrating adage good goos...,15
1,2,1,a series of escapades demonstrating the adage ...,2,14,series escapades demonstrating adage good goose,6
2,3,1,a series,2,2,series,1
3,4,1,a,2,1,,0
4,5,1,series,2,1,series,1
5,6,1,of escapades demonstrating the adage that what...,2,12,escapades demonstrating adage good goose,5
6,7,1,of,2,1,,0
7,8,1,escapades demonstrating the adage that what is...,2,11,escapades demonstrating adage good goose,5
8,9,1,escapades,2,1,escapades,1
9,10,1,demonstrating the adage that what is good for ...,2,10,demonstrating adage good goose,4


### Filter out the stopword-only phrases
It can be seen that some phrases contain only stopwords,
so when the stopwords are removed there is nothing left (e.g. PhraseId 4 and 7; about 2,000 rows).
I filter those rows out to create train_postproc & test_postproc.

In [8]:
# Keep only the rows that still have at least one word after stop-word removal.
train_postproc = train.loc[train['new_word_count'].ne(0)]
test_postproc = test.loc[test['new_word_count'].ne(0)]

In [10]:
# First ten rows after dropping the phrases whose new_word_count is 0.
train_postproc.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,word_count,new_phrase,new_word_count
0,1,1,a series of escapades demonstrating the adage ...,1,37,series escapades demonstrating adage good goos...,15
1,2,1,a series of escapades demonstrating the adage ...,2,14,series escapades demonstrating adage good goose,6
2,3,1,a series,2,2,series,1
4,5,1,series,2,1,series,1
5,6,1,of escapades demonstrating the adage that what...,2,12,escapades demonstrating adage good goose,5
7,8,1,escapades demonstrating the adage that what is...,2,11,escapades demonstrating adage good goose,5
8,9,1,escapades,2,1,escapades,1
9,10,1,demonstrating the adage that what is good for ...,2,10,demonstrating adage good goose,4
10,11,1,demonstrating the adage,2,3,demonstrating adage,2
11,12,1,demonstrating,2,1,demonstrating,1


### new phrase without stopword

In [11]:
# Narrow both splits to the identifiers and the cleaned phrase
# (plus the Sentiment label for the training split).
new_phrase_train = train_postproc.loc[:, ['PhraseId', 'SentenceId', 'new_phrase', 'Sentiment']]
new_phrase_test = test_postproc.loc[:, ['PhraseId', 'SentenceId', 'new_phrase']]

In [12]:
# First ten rows of the slimmed-down training frame that will be exported.
new_phrase_train.head(10)

Unnamed: 0,PhraseId,SentenceId,new_phrase,Sentiment
0,1,1,series escapades demonstrating adage good goos...,1
1,2,1,series escapades demonstrating adage good goose,2
2,3,1,series,2
4,5,1,series,2
5,6,1,escapades demonstrating adage good goose,2
7,8,1,escapades demonstrating adage good goose,2
8,9,1,escapades,2
9,10,1,demonstrating adage good goose,2
10,11,1,demonstrating adage,2
11,12,1,demonstrating,2


In [11]:
# Persist the cleaned splits to the working directory, omitting the index column.
# NOTE(review): the files carry a .tsv extension but to_csv's default separator
# is a comma — confirm downstream readers expect comma-separated content.
for frame, out_path in ((new_phrase_train, 'new_train.tsv'), (new_phrase_test, 'new_test.tsv')):
    frame.to_csv(out_path, encoding='utf-8', index=False)

## Drop the duplicated cases

In [14]:
# Collapse repeated cleaned phrases within a sentence, keeping the last occurrence.
# Build from train_postproc rather than train: starting from train would bring
# back the phrases left empty by stop-word removal (new_word_count == 0), which
# the filtering step above deliberately removes.
drop_dup = (
    train_postproc
    .drop_duplicates(subset=['SentenceId', 'new_phrase'], keep='last')
    .loc[:, ['PhraseId', 'SentenceId', 'new_phrase', 'Sentiment']]
)

In [15]:
# Write the de-duplicated training rows next to the raw data, omitting the index.
# NOTE(review): .tsv extension with to_csv's default comma separator — confirm
# this matches how the file is read back.
drop_dup.to_csv(
    'dataset/drop_dup.tsv',
    index=False,
    encoding='utf-8',
)