In [33]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
import re

## Test Data Processing

In [34]:
# Load the raw test set: '|'-delimited text with no header row.
test_data = pd.read_table('twitter_test.txt',header = None,sep='|')

In [35]:
# Name the six parsed columns. 'None1' and 'None2' look like empty padding
# columns (presumably from leading/trailing '|' delimiters) — TODO confirm
# against the raw file.
test_data.columns=['None1','ID','Sentence','Entity','Label','None2']

#### Delete punctuation and stopwords

In [36]:
def text_process(sentence):
    """
    Tokenize a sentence after removing punctuation and English stopwords.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    list of str
        The whitespace-separated tokens of `sentence` once every character
        found in ``string.punctuation`` has been removed and every token whose
        lowercase form is an NLTK English stopword has been dropped. The
        original casing of the surviving tokens is preserved.
    """
    # Drop punctuation characters, then rebuild the string.
    nopunc = ''.join(char for char in sentence if char not in string.punctuation)

    # Fetch the stopword list once per call and keep it in a set; evaluating
    # stopwords.words('english') inside the comprehension would reload and
    # linearly scan the list for every single word.
    english_stopwords = set(stopwords.words('english'))

    return [word for word in nopunc.split() if word.lower() not in english_stopwords]

In [37]:
# Fit the bag-of-words vocabulary on the raw sentences, using text_process as
# the analyzer. No separate .apply(text_process) pass is needed beforehand:
# its result would be discarded and the fit tokenises every sentence anyway.
bow_transformer = CountVectorizer(analyzer = text_process).fit(test_data['Sentence'])

#### Add a feature: whether the entity's first letter is capitalized

In [38]:
def whetherIsCapital(entity):
    """
    Flag whether an entity begins with an uppercase letter.

    Parameters
    ----------
    entity : str
        Entity surface form.

    Returns
    -------
    str
        '1' if the first character of `entity` is an uppercase letter,
        otherwise '0' (including for an empty string or a leading digit/symbol).

    Notes
    -----
    Only the first character is inspected. Testing ``entity.islower()`` on the
    whole string would also flag mixed-case names such as 'iPhone' and strings
    with no letters at all such as '2016'.
    NOTE(review): if another dataset is preprocessed with a different
    definition of this feature, align the two — TODO confirm.
    """
    # Slicing (rather than indexing) keeps the empty string from raising.
    return '1' if entity[:1].isupper() else '0'
# Store the '0'/'1' string returned by whetherIsCapital for every entity.
test_data['WhetherIsCapital'] = test_data['Entity'].map(whetherIsCapital)

#### Delete Emoji

In [39]:
# Matches any single code point in the range U+10000..U+10FFFF.
emoji_pattern = re.compile(u'[\U00010000-\U0010ffff]')

def remove_emoji(text):
    """Return `text` with every character matched by `emoji_pattern` deleted."""
    stripped = emoji_pattern.sub('', text)
    return stripped

# Leave the raw 'Sentence' column untouched; the cleaned text goes into a new column.
test_data['NewSentence']=test_data['Sentence'].map(remove_emoji)

In [40]:
# Spot-check one row after the first emoji pass.
test_data.iloc[77]['NewSentence']

"✨Getting so close, #TheForceAwakens. #StarWars marathon this weekend, yip I'm just that wild and crazy!  ✨ https://t.co/GKaSLCuAim"

In [41]:
# Character class for the code points U+1F601..U+1F64F. The square brackets
# are what make this a range: without them the pattern would only match the
# literal three-character sequence U+1F601, '-', U+1F64F.
# Symbols outside this range, e.g. U+2728, are not matched.
emoji_pattern2 = re.compile(u'[\U0001f601-\U0001f64f]')

def remove_emoji2(text):
    """Return `text` with every character in U+1F601..U+1F64F removed."""
    return emoji_pattern2.sub(r'', text)

#https://apps.timwhitlock.info/emoji/tables/unicode#

In [42]:
# Second emoji pass, applied on top of the already-cleaned column.
test_data['NewSentence']=test_data['NewSentence'].map(remove_emoji2)

In [43]:
# Spot-check another row after the second emoji pass.
test_data.iloc[160]['NewSentence']

'Here you go @john_boyega. These pics always look better with the lightsaber glowing  #StarWars #TheForceAwakens #… https://t.co/KWS8YJ6dvR'

#### Delete URL

In [44]:
def delete_url(list1):
    """
    Remove URL-like tokens from a sequence of words.

    Parameters
    ----------
    list1 : iterable of str
        Tokens of one sentence.

    Returns
    -------
    list of str
        A new list holding, in their original order, the words that do not
        start with 'htt' (the match is case-sensitive).
    """
    # 'htt' is a prefix of both 'http' and 'https', so a single check covers
    # every case; a separate test for 'https' can never reject anything more.
    return [word for word in list1 if not word.startswith('htt')]

def sentence_delete_url(sentence):
    """
    Return `sentence` without the tokens that delete_url rejects.

    The sentence is split on whitespace and the surviving tokens are re-joined
    with single spaces, so runs of whitespace collapse as a side effect.
    """
    kept_words = delete_url(sentence.split())
    return ' '.join(kept_words)

# Strip URL tokens from the cleaned sentences (overwrites NewSentence).
test_data['NewSentence']=test_data['NewSentence'].map(sentence_delete_url)

In [45]:
# Preview only the first rows instead of rendering the whole frame.
test_data.head(10)

Unnamed: 0,None1,ID,Sentence,Entity,Label,None2,WhetherIsCapital,NewSentence
0,,674869443671941120,RT @EntheosShines: Just As Some Parents Have A...,Egyptian,Thing,,1,RT @EntheosShines: Just As Some Parents Have A...
1,,674869443671941120,RT @EntheosShines: Just As Some Parents Have A...,Obama,Person,,1,RT @EntheosShines: Just As Some Parents Have A...
2,,674869443671941120,RT @EntheosShines: Just As Some Parents Have A...,chirofrenzy,Person,,0,RT @EntheosShines: Just As Some Parents Have A...
3,,674869443671941120,RT @EntheosShines: Just As Some Parents Have A...,EntheosShines,Person,,1,RT @EntheosShines: Just As Some Parents Have A...
4,,674869443671941120,RT @EntheosShines: Just As Some Parents Have A...,PatVPeters,Person,,1,RT @EntheosShines: Just As Some Parents Have A...
5,,674962184615682048,RT @Rick_OntheRocks: The Ultimate List Of Wher...,Star Wars,Product,,1,RT @Rick_OntheRocks: The Ultimate List Of Wher...
6,,674962184615682048,RT @Rick_OntheRocks: The Ultimate List Of Wher...,StarWars,Product,,1,RT @Rick_OntheRocks: The Ultimate List Of Wher...
7,,674962184615682048,RT @Rick_OntheRocks: The Ultimate List Of Wher...,TheForceAwakens,Product,,1,RT @Rick_OntheRocks: The Ultimate List Of Wher...
8,,674962184615682048,RT @Rick_OntheRocks: The Ultimate List Of Wher...,Rick_OntheRocks,Person,,1,RT @Rick_OntheRocks: The Ultimate List Of Wher...
9,,674962728692400128,RT @regentsuni: Is the #starwars force strong ...,starwars,Product,,0,RT @regentsuni: Is the #starwars force strong ...


#### Add a feature: length of NewSentence

In [46]:
def calculateLength(Sentence):
    """Return the length of `Sentence` as reported by ``len()``."""
    n_items = len(Sentence)
    return n_items

In [47]:
# Length of each cleaned sentence (len() of the NewSentence value), stored as a feature.
test_data['SentenceLength'] = test_data['NewSentence'].map(calculateLength)

#### Delete unneeded columns, keeping only NewSentence, Entity, WhetherIsCapital and SentenceLength

In [48]:
# Drop the two leading columns by label rather than by position: a positional
# drop is not idempotent, so re-running the cell would silently remove the
# next two columns, whereas a label-based drop raises KeyError instead.
test_data.drop(['None1', 'ID'], axis=1, inplace=True)

In [49]:
# Remove the raw sentence, label and padding columns; the labels are passed
# directly instead of via a throw-away sub-DataFrame.
test_data.drop(['Sentence', 'None2', 'Label'], axis=1, inplace=True)

In [50]:
# Check which columns remain and their non-null counts.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1022 entries, 0 to 1021
Data columns (total 4 columns):
Entity              1022 non-null object
WhetherIsCapital    1022 non-null object
NewSentence         1022 non-null object
SentenceLength      1022 non-null int64
dtypes: int64(1), object(3)
memory usage: 32.0+ KB


#### Extract NewSentence and Entity as test_sentence.txt and test_entity.txt

In [53]:
# Work on a copy so that trimming columns below does not touch test_data.
test_sentence = test_data.copy()

In [60]:
# Remove the entity-related columns from the sentence frame.
test_sentence.drop(['Entity', 'WhetherIsCapital'], axis=1, inplace=True)

In [63]:
# Remove the length feature from the sentence frame.
test_sentence.drop(['SentenceLength'], axis=1, inplace=True)

In [64]:
# Check which columns are left before exporting.
test_sentence.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1022 entries, 0 to 1021
Data columns (total 1 columns):
NewSentence    1022 non-null object
dtypes: object(1)
memory usage: 8.1+ KB


In [67]:
# Remove the sentence text from the main frame.
test_data.drop(['NewSentence'], axis=1, inplace=True)

In [68]:
# Write the sentence frame to disk without the index column.
test_sentence.to_csv('test_sentence.txt',index = False)

In [69]:
# Copy the main frame again so the entity export can be trimmed independently.
test_entity = test_data.copy()

In [71]:
# Remove the two feature columns from the entity frame.
test_entity.drop(['WhetherIsCapital', 'SentenceLength'], axis=1, inplace=True)

In [73]:
# Write the entity frame to disk without the index column.
test_entity.to_csv('test_entity.txt',index = False)

In [75]:
# Remove the entity column from the main frame.
test_data.drop(['Entity'], axis=1, inplace=True)

In [77]:
# Write whatever columns remain in test_data, space-separated, without the index.
test_data.to_csv('test_Capital_and_Length.txt',index = False,sep=' ')

### Get the start and end positions of each entity

In [83]:
# Load the tab-separated NEEL file; it is read with no header row.
test_data_neel =  pd.read_table('NEEL2016-test_neel.gs',header = None,sep='\t')

In [95]:
# Drops whichever column currently sits at position 2.
# NOTE(review): this is positional and in place, so every re-run of the cell
# removes one more column; confirm which column is meant and that the cell is
# executed exactly once.
test_data_neel.drop(test_data_neel.columns[2],axis=1,inplace=True)

In [98]:
# Export the remaining columns space-separated, without the index.
test_data_neel.to_csv('test_start_and_end_position_of_label.txt',index= False,sep=' ')