In [2]:
import pathlib
import json
import pickle
import re
import nltk
import numpy as np
import pandas as pd

from html import unescape
from functools import partial
from string import punctuation

DATA = pathlib.Path('data')

## init

__load data__

In [4]:
# load data
apostrophe_dict = json.load(open(DATA / 'apostrophe.json', 'r', encoding='utf-8'))
short_word_dict = json.load(open(DATA / 'short_words.json', 'r', encoding='utf-8'))
emoticon_dict = json.load(open(DATA / 'emotions.json', 'r', encoding='utf-8'))

train = pd.read_csv(DATA / 'train_tweets.csv')
valid = pd.read_csv(DATA / 'test_tweets.csv')
data = pd.concat([train, valid]).reset_index(drop=True)

# prepare columns for cleaning
data['clean'] = data['tweet'].copy()
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


__Common functions__

In [5]:
def remap_func(text, dictionary):
    return ' '.join([dictionary.get(w, w) for w in text.split()])

## hw

__Задание 1.__
Заменим html-сущности (к примеру: < > &).

In [6]:
# можно вместо apply() использовать map()
data['clean'] = data['clean'].apply(unescape)
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


__Задание 2.__ Удалим @user из всех твитов.

In [7]:
data['clean'] = data['clean'].str.replace(r'@\w*', '', regex=True)
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


__Задание 3.__ Изменим регистр твитов на нижний.

In [8]:
data['clean'] = data['clean'].str.lower()
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


__Задание 4.__ Заменим сокращения с апострофами (пример: ain't, can't) на пробел, используя apostrophe_dict. Для этого необходимо сделать функцию: для каждого слова в тексте проверить (for word in text.split()), если слово есть в словаре apostrophe_dict в качестве ключа (сокращенного слова), то заменить ключ на значение (полную версию слова)

In [9]:
data['clean'] = data['clean'].apply(partial(remap_func, dictionary=apostrophe_dict))
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i cannot use cause the...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


__Задание 5.__ Заменим сокращения на их полные формы.

In [10]:
data['clean'] = data['clean'].apply(partial(remap_func, dictionary=short_word_dict))
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i cannot use cause the...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love you take with you all the time i...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


__Задание 6.__ Заменим эмотиконы.

In [11]:
data['clean'] = data['clean'].apply(partial(remap_func, dictionary=emoticon_dict))
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i cannot use cause the...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love you take with you all the time i...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


__Задание 7, 8, 9.__ Заменим пунктуацию на пробелы. Заменим спец. символы на пробелы. Заменим числа на пробелы.

In [12]:
# data['clean'] = data['clean'].str.replace(r'[^\w\s]', ' ', regex=True)
data['clean'] = data['clean'].str.replace(f'[{punctuation}0-9]', ' ', regex=True)
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cannot use cause the...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,model i love you take with you all the time i...
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation


__Задание 10.__ Удалим из текста слова длиной в 1 символ.

In [13]:
# data['clean'] = data['clean'].apply(lambda text: ' '.join([w for w in text.split() if len(w) > 1]))
data['clean'] = data['clean'].str.replace(r'\b\S\b', '', regex=True)     # такой вариант, кмк проще для понимания
data.head()

Unnamed: 0,id,label,tweet,clean
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfis...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in...
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation


__Задание 11.__ Поделим твиты на токены.

In [14]:
data['token'] = data['clean'].apply(nltk.tokenize.word_tokenize)
data.head()

Unnamed: 0,id,label,tweet,clean,token
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfis...,"[when, father, is, dysfunctional, and, is, so,..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they...,"[thanks, for, lyft, credit, can, not, use, cau..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in...,"[model, love, you, take, with, you, all, the, ..."
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]"


__Задание 12.__ Удалим стоп-слова из токенов.

In [15]:
stopwords = nltk.corpus.stopwords.words('english')
data['filtered'] = data['token'].apply(lambda tokens: [w for w in tokens if w not in stopwords])
data.head()

Unnamed: 0,id,label,tweet,clean,token,filtered
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfis...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in...,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, urð±, ,..."
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]"


__Задание 13.__ Применим стемминг к токенам.

In [16]:
stemmer = nltk.stem.PorterStemmer()
data['stemmed'] = data['filtered'].apply(lambda tokens: [stemmer.stem(w) for w in tokens])
data.head()

Unnamed: 0,id,label,tweet,clean,token,filtered,stemmed
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfis...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in...,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, urð±, ,...","[model, love, take, time, urð±, ,..."
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]"


__Задание 14.__ Применим лемматизацию к токенам.

In [17]:
lemm = nltk.stem.wordnet.WordNetLemmatizer()
data['lemma'] = data['stemmed'].apply(lambda tokens: [lemm.lemmatize(w) for w in tokens])
data.head()

Unnamed: 0,id,label,tweet,clean,token,filtered,stemmed,lemma
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfis...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, caus, offer, wheelc..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesti]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in...,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, urð±, ,...","[model, love, take, time, urð±, ,...","[model, love, take, time, urð±, ,..."
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguid, societi, motiv]"


__Задание 15.__ Сохраним результат предобработки в pickle-файл.

In [18]:
# pickle может некорректно читаться на других версиях этой библиотеки или интерпретатора
pickle.dump(data['lemma'].to_dict(), open('lemma.pkl', 'wb'))
json.dump(data['lemma'].to_dict(), open('lemma.json', 'w', encoding='utf-8'))

Заметил, что проскакивают прелести кодировки, например в твите c id=4. Дополнительно никак не обрабатывал.

In [19]:
# P.S. I know, однострочники - зло