In [34]:
import pandas as pd
from html import unescape
import re
import numpy as np

from nltk import tokenize as tknz
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

import pickle

In [2]:
train_df = pd.read_csv('train_tweets.csv')
test_df = pd.read_csv('test_tweets.csv')
data = train_df.append(test_df, ignore_index = True, sort = False)

**1. Заменим html-сущности (к примеру: < > &). "<" заменим на “<” и "&" заменим на “&”)""". Сделаем это с помощью `HTMLParser.unescape()`. Всю предобработку делаем в новом столбце 'clean_tweet'**

In [3]:
data['clean_tweet']=unescape(data['tweet'])

**2.	Удалим @user из всех твитов с помощью паттерна `"@[\w]*"`. Для этого создадим функцию:
●	для того, чтобы найти все вхождения паттерна в тексте, необходимо использовать `re.findall(pattern, input_txt)`
●	для для замены @user на пробел, необходимо использовать `re.sub()` при применении функции необходимо использовать `np.vectorize(function)`.**

In [4]:
def user_del(string):
    return re.sub(r'@[\w]*','', string)

In [5]:
data['tweet'][0]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [6]:
vfunc = np.vectorize(user_del)
data['clean_tweet'] = vfunc(data['clean_tweet'])

**3.	Изменим регистр твитов на нижний с помощью `.lower().`**

In [7]:
data['clean_tweet']=data['clean_tweet'].apply(str.lower)

**4. Заменим сокращения с апострофами (пример: ain't, can't) на пробел, используя `apostrophe_dict`. Для этого необходимо сделать функцию: для каждого слова в тексте проверить (`for word in text.split()`), если слово есть в словаре `apostrophe_dict` в качестве ключа (сокращенного слова), то заменить ключ на значение (полную версию слова).**

In [8]:
apostrophe_dict = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"man's": "mans",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [9]:
def dict_del(string, dict_del):
    for word in string.split():
        if word in dict_del:
            string=string.replace(word, dict_del[word])
    return string

In [10]:
dict_del(data['tweet'][29], apostrophe_dict)

'50 people went to nightclub to have a good night and 1 mans actions means those people are lost to their families forever #rip#orlando'

In [11]:
data['tweet'][29]

"50 people went to nightclub to have a good night and 1 man's actions means those people are lost to their families forever #rip#orlando"

In [12]:
vfunc = np.vectorize(dict_del)
data['clean_tweet'] = vfunc(data['clean_tweet'], apostrophe_dict)

**5.	Заменим сокращения на их полные формы, используя `short_word_dict`. Для этого воспользуемся функцией, используемой в предыдущем пункте.**

In [13]:
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"tysvm" : "thank you so very much",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

In [14]:
vfunc = np.vectorize(dict_del)
data['clean_tweet'] = vfunc(data['clean_tweet'], short_word_dict)

**6.	Заменим эмотиконы (пример: `":)" = "happy"`) на пробелы, используя `emoticon_dict`. Для этого воспользуемся функцией, используемой в предыдущем пункте.**

In [15]:
emoticon_dict = {
":)": "happy",
":‑)": "happy",
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-< ": "sad",
":-[": "sad",
":-||": "sad"
}

In [16]:
vfunc = np.vectorize(dict_del)
data['clean_tweet'] = vfunc(data['clean_tweet'], emoticon_dict)

**7. Заменим пунктуацию на пробелы, используя `re.sub()` и паттерн `r'[^\w\s]'`.**

In [17]:
def space_del(string):
    return re.sub(r'[^\w\s]','', string)

In [18]:
vfunc = np.vectorize(space_del)
data['clean_tweet'] = vfunc(data['clean_tweet'])

**8.	Заменим спец. символы на пробелы, используя `re.sub()` и паттерн `r'[^a-zA-Z0-9]'`.**

In [19]:
def symbol_del(string):
    return re.sub(r'[^a-zA-Z0-9]',' ', string)

In [20]:
vfunc = np.vectorize(symbol_del)
data['clean_tweet'] = vfunc(data['clean_tweet'])

**9. Заменим числа на пробелы, используя re.sub() и паттерн r'[^a-zA-Z]'.**

In [21]:
def digit_del(string):
    return re.sub(r'[^a-zA-Z]',' ', string)

In [22]:
vfunc = np.vectorize(digit_del)
data['clean_tweet'] = vfunc(data['clean_tweet'])

**10.	Удалим из текста слова длиной в 1 символ, используя `' '.join([w for w in x.split() if len(w)>1])`.**

In [23]:
def del_1symb_word(string):
    string=' '.join([w for w in string.split() if len(w)>1])
    return string

In [24]:
vfunc = np.vectorize(del_1symb_word)
data['clean_tweet'] = vfunc(data['clean_tweet'])

**11.	Поделим твиты на токены с помощью `nltk.tokenize.word_tokenize`, создав новый столбец `'tweet_token'`.**

In [25]:
data['tweet_token'] = data['clean_tweet'].apply(tknz.word_tokenize)

**12.	Удалим стоп-слова из токенов, используя `nltk.corpus.stopwords`. Создадим столбец `'tweet_token_filtered'` без стоп-слов.**

In [26]:
mystopwords = stopwords.words('english')

def remove_stopwords(string, stop_words = mystopwords):
    return [w for w in string if w not in stop_words]

In [27]:
remove_stopwords(data['tweet_token'][0])

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

In [28]:
data['tweet_token_filtered'] = data['tweet_token'].apply(remove_stopwords)

**13.	Применим стемминг к токенам с помощью `nltk.stem.PorterStemmer`. Создадим столбец `'tweet_stemmed'` после применения стемминга.**

In [29]:
def stemer(string):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in string]

In [30]:
data['tweet_stemmed'] = data['tweet_token_filtered'].apply(stemer)

**14.	Применим лемматизацию к токенам с помощью `nltk.stem.wordnet.WordNetLemmatizer`. Создадим столбец `'tweet_lemmatized'` после применения лемматизации.**

In [31]:
def lemmer(string, pos=wordnet.VERB):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w,pos) for w in string]

In [32]:
data['tweet_lemmatized'] = data['tweet_token_filtered'].apply(lemmer)

In [33]:
data

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love yoyou take with yoyou all the time ...,"[model, love, yoyou, take, with, yoyou, all, t...","[model, love, yoyou, take, yoyou, time, yoyour]","[model, love, yoyou, take, yoyou, time, yoyour]","[model, love, yoyou, take, yoyou, time, yoyour]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"
...,...,...,...,...,...,...,...,...
49154,49155,,thought factory: left-right polarisation! #tru...,thought factory leftright polarisation trump u...,"[thought, factory, leftright, polarisation, tr...","[thought, factory, leftright, polarisation, tr...","[thought, factori, leftright, polaris, trump, ...","[think, factory, leftright, polarisation, trum..."
49155,49156,,feeling like a mermaid ð #hairflip #neverre...,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,...","[feeling, like, mermaid, hairflip, neverready,...","[feel, like, mermaid, hairflip, neverreadi, fo...","[feel, like, mermaid, hairflip, neverready, fo..."
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...,hillary campaigned today in ohioomg amp used w...,"[hillary, campaigned, today, in, ohioomg, amp,...","[hillary, campaigned, today, ohioomg, amp, use...","[hillari, campaign, today, ohioomg, amp, use, ...","[hillary, campaign, today, ohioomg, amp, use, ..."
49157,49158,,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead...","[happi, work, confer, right, mindset, lead, cu...","[happy, work, conference, right, mindset, lead..."


In [35]:
with open('data.pickle', 'wb') as f:
    pickle.dump(data, f)