# I had to create a Python 3.5 environment in my Anaconda2 installation in order to install and run ekphrasis.
# The Python 3 environment is named py35.
# I installed the dependencies with: pip install -r requirements.txt
# I then ran: pip3 install local_installation.sh
#-------------------------------------------------------------------------------
# To run this program on my school computer, first open the command line
# and type 'source activate py35'.
# Once you see py35[place you are at]$, you know you are in the py35 env.
# Now type 'jupyter notebook' to open Jupyter in the Python 3.5 shell,
# open this file, and run the cells.
# Type 'source deactivate py35' to get out of this shell.

In [1]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Shared ekphrasis pre-processing pipeline used by every cell below.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # (each term is listed once; the original list repeated 'url')
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

# Example tweets exercising hashtags, all-caps, elongation, emoticons,
# money amounts, dates, user mentions and URLs.
sentences = [
    "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/."
]

# Print each pre-processed sentence as one space-separated line of tokens.
for sentence in sentences:
    print(*text_processor.pre_process_doc(sentence))

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
<allcaps> cant wait </allcaps> for the new season of <hashtag> twin peaks </hashtag> ＼(^o^)／ ! <repeated> <hashtag> david lynch </hashtag> <hashtag> tv series </hashtag> <happy>
i saw the new <hashtag> john doe </hashtag> movie and it sucks <elongated> ! <repeated> <allcaps> waisted </allcaps> <money> . <repeated> <hashtag> bad movies </hashtag> <annoyed>
<user> : can not wait for the <date> <hashtag> sentiment </hashtag> talks ! <allcaps> yay <elongated> </allcaps> ! <repeated> <laugh> <url>


In [24]:
# Elongated word: the recorded output shows the shortened word plus an <elongated> tag.
print(*text_processor.pre_process_doc('sooooooo'))

so <elongated>


In [37]:
# Money amount: the recorded output shows it replaced by <money>.
print(*text_processor.pre_process_doc('$300 to me'))

<money> to me


In [38]:
# Numeric date: the recorded output shows it replaced by <date>.
print(*text_processor.pre_process_doc('3/4/15'))

<date>


In [39]:
# Month-name date: the recorded output shows it replaced by <date>.
print(*text_processor.pre_process_doc('March 3'))

<date>


In [40]:
# User mention: the recorded output shows it replaced by <user>.
print(*text_processor.pre_process_doc('@hithisisdhara'))

<user>


In [41]:
# URL: the recorded output shows it replaced by <url>.
print(*text_processor.pre_process_doc('https://mail.google.com/mail/u/0/#inbox'))

<url>


In [42]:
# Hashtag: the recorded output shows it lowercased and wrapped in <hashtag> ... </hashtag>.
print(*text_processor.pre_process_doc('#GSU'))

<hashtag> gsu </hashtag>


In [43]:
# A doubled final letter: the recorded output shows the word passed through unchanged.
print(*text_processor.pre_process_doc('loll'))

loll


In [44]:
# Emoticon: the recorded output shows it replaced by <happy>.
print(*text_processor.pre_process_doc(':)'))

<happy>


In [45]:
# Standalone number: the recorded output shows it replaced by <number>.
print(*text_processor.pre_process_doc('top 20'))

top <number>


In [46]:
# Number glued to a word: the recorded output shows the token left unchanged.
print(*text_processor.pre_process_doc('top20'))

top20


In [47]:
# Full-width kaomoji: the recorded output shows it passed through unchanged.
print(*text_processor.pre_process_doc('＼(^o^)／'))

＼(^o^)／


In [5]:
import html
def clean_text(text):
    """Undo quote doubling and HTML escaping in a raw text field.

    Steps, in order:
      1. Strip trailing whitespace.
      2. If the text contains a doubled quote (`""`): drop one pair of
         enclosing double quotes when the text both starts and ends with
         `"`, then collapse `\\""` and `""` to a single `"`.
      3. Collapse any remaining `\\""` to `"`.
      4. Decode HTML entities.
      5. Collapse every run of whitespace to a single space.

    Returns the cleaned string.
    """
    doubled_quote = '""'
    escaped_doubled_quote = '\\""'

    cleaned = text.rstrip()

    if doubled_quote in cleaned:
        is_wrapped = cleaned[0] == '"' and cleaned[-1] == '"'
        if is_wrapped:
            cleaned = cleaned[1:-1]
        # The order matters: escaped pairs first, then plain doubled quotes.
        cleaned = cleaned.replace(escaped_doubled_quote, '"').replace(doubled_quote, '"')

    # The replacements above can themselves leave a `\""` behind, so run once more.
    cleaned = cleaned.replace(escaped_doubled_quote, '"')

    decoded = html.unescape(cleaned)
    return ' '.join(decoded.split())

In [28]:
# Read the tab-separated input file, clean and pre-process the last column
# (the tweet text) of every row, and keep the leading columns unchanged.
input_path = '/home/dharashah/Documents/Spring_18/DeepLearning/project/datastories/datastories-semeval2017-task4/dataset/Subtask_BD/4B-English/SemEval2017-task4-dev.subtask-BD.english.INPUT.txt'
cleaned_data = []
# `with` closes the file even if pre-processing raises part-way through;
# the encoding is explicit so the result does not depend on the machine's locale.
with open(input_path, encoding='utf-8') as f:
    for line in f:
        tokens = line.rstrip().split('\t')
        tweet = clean_text(tokens[-1])
        processed_tweet = " ".join(text_processor.pre_process_doc(tweet))
        cleaned_data.append(tokens[:-1] + [processed_tweet])

In [29]:
# Number of rows collected in cleaned_data (one per input line).
len(cleaned_data)

10552

In [30]:
# Write the processed rows back out, one row per line with tab-separated columns.
filepath = '/home/dharashah/Documents/Spring_18/DeepLearning/project'
# `with` guarantees the file is flushed and closed even if a write fails;
# the encoding is explicit so the output does not depend on the machine's locale.
with open(filepath+'/'+'cleaned_data_tab_separatex.txt', 'w', encoding='utf-8') as f:
    for c in cleaned_data:
        f.write('\t'.join(c) + '\n')

In [34]:
# Sanity check: re-read the written file, print every 1000th line,
# then print the total number of lines.
counter = 0  # stays 0 if the file is empty
# `with` closes the file when done (the handle was previously left open).
with open(filepath+'/'+'cleaned_data_tab_separatex.txt', encoding='utf-8') as f:
    for counter, line in enumerate(f, 1):
        if counter % 1000 == 0:
            # `line` keeps its trailing newline, so a blank line follows each sample.
            print(line)
print(counter)

635291585110929408	caitlyn jenner	negative	there ' s a <number> / <number> % chance that caitlyn jenner may be charged with manslaughter i should think so as well after what she did to poor bruce

628569856246308864	david price	positive	" no matter what drake tells you , monday in <hashtag> toronto </hashtag> belonged to david price . <hashtag> jays </hashtag> - <url> <url>

640162813625942016	foo fighters	positive	seeing foo fighters tomorrow argh <elongated> i am excited and terrified at the same time

665090003257683968	ira	negative	" just me that thinks it ' s fucking daft arresting oaps for killings on bloody sunday , too long ago , fuck the <allcaps> ira </allcaps> "

681696455736733696	justin bieber	negative	me : " justin bieber is gonna be in that 2 nd zoolander movie but only for a quick scene cause he dies " dad : " good " <user>

675890017487724544	lady gaga	positive	we want gaga in 1 st <hashtag> video mtv 2015 </hashtag> fifth harmony <hashtag> mtv stars </hashtag> lady ga