## Установка пакетов

In [407]:
# Install spaCy (run in terminal/prompt)
import sys
!{sys.executable} -m pip install spacy
# Download spaCy's  'en' Model
!{sys.executable} -m spacy download en

[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2021-04-15 01:15:18.033308: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-04-15 01:15:18.033329: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [408]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kirill_Sergeev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kirill_Sergeev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Импорт зависимостей, константы

In [417]:
import re
import spacy
import string

import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange

from nltk.stem import *
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer, EnglishStemmer

In [418]:
LANGUAGE = 'english'

## Подготовка данных, лемматизация, стеминг

In [419]:
stemmer = EnglishStemmer()
lemmatizer = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [420]:
custom_stop_words = ['assume', 'never', 'nt', 'always', 'often', 'want', 'wonder', 'yeah', 'yes', 'know', 'like', 'nice', 'good', 'think', 'would', 'also', 'get', 'make', 'one', 'probably', 'maybe']

lang_stopwords = stopwords.words(LANGUAGE)
lang_stopwords.extend(['©', '…', '«', '»', '...', '- -', '--', '-', '_'])
lang_stopwords.extend(custom_stop_words)

In [421]:
def remove_url(text):
    url_check = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    clean = re.sub(url_check, "", text)
    return clean

def replace_copywrite(text):
  check = "©\s?(?:\d{4}|\D{2})[^.]{1,}."
  clean = re.sub(check, "", text)

  return clean

def remove_numbers(text):
    num_check = "([0-9,]*)"
    clean = re.sub(num_check, "", text).strip()
    return clean

def remove_punctuation(text):
    return "".join([ch if ch not in string.punctuation else ' ' for ch in text])

def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text, flags=re.I)

def remove_stop_words(text):
    tokens = word_tokenize(text) 
    tokens = [token for token in tokens if token not in lang_stopwords and token != ' ']
    return " ".join(tokens)

In [422]:
def lemmatize_text(text):
    text_lem = lemmatizer(text)
    tokens = [token.lemma_ for token in text_lem if token.lemma_ != ' ' and token.lemma_ not in lang_stopwords]
    return " ".join(tokens)

def stemming_text(text):
    tokens = word_tokenize(text)    
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

In [423]:
def clean_df_series(df_series):

  clean_series = df_series.map(lambda x: x.lower())
  clean_series = clean_series.map(remove_url)
  clean_series = clean_series.map(remove_punctuation)
  clean_series = clean_series.map(remove_numbers)
  clean_series = clean_series.map(remove_stop_words)
  clean_series = clean_series.apply(lambda x: " ".join([w for w in x.split() if (len(w)>2)]))
  clean_series = clean_series.apply(lambda x: " ".join([w for w in x.split() if (len(set(w))>2)]))
  clean_series = clean_series.map(remove_multiple_spaces)
  
  return clean_series

## Работа с датасетом

In [433]:
data = pd.read_csv('DATA2.csv')
# data.dropna(subset=['abstract'], inplace=True)
data.fillna("", inplace=True)

# data['message+context'] = data['title'] + " " + data['abstract']

data

Unnamed: 0,utterance_id,conversation_id,agent,message,topic,context
0,0,0,agent_1,do you listen to albums?,Music,
1,1,0,agent_2,Yes I do. Have you listened to Hybrid Theory?,Music,do you listen to albums?
2,2,0,agent_1,Yes it is the best selling album of the centur...,Music,do you listen to albums? \n Yes I do. Have you...
3,3,0,agent_2,True. Do you know who Reel Big FIsh are?,Music,Yes I do. Have you listened to Hybrid Theory? ...
4,4,0,agent_1,Yes the band that lost the rights to their own...,Music,Yes it is the best selling album of the centur...
...,...,...,...,...,...,...
1146,1157,54,agent_1,I’d like to visit you this summer perhaps in...,Weather_Time,Of course it rains sometimes but I still li...
1147,1158,54,agent_2,I think by summer I’ll pass all my exams so I...,Weather_Time,It’s fine and I feel comfortable. \n I’d lik...
1148,1159,54,agent_1,I’m also well thank you! I like it here tho...,Weather_Time,I’d like to visit you this summer perhaps in...
1149,1160,54,agent_2,You know I’m a Spanish guy.,Weather_Time,I think by summer I’ll pass all my exams so I...


In [434]:
data['text_with_context'] = data['message'] + data['context']
data['text_without_context'] = data['message']
    
data['text_with_context_clean'] = clean_df_series(data['text_with_context'])
data['text_with_context_lemm'] = data['text_with_context_clean'].map(lemmatize_text)
data['text_with_context_lemm'] = data['text_with_context_lemm'].map(remove_stop_words)

data['text_without_context_clean'] = clean_df_series(data['text_without_context'])
data['text_without_context_lemm'] = data['text_without_context_clean'].map(lemmatize_text)
data['text_without_context_lemm'] = data['text_without_context_lemm'].map(remove_stop_words)

In [435]:
data

Unnamed: 0,utterance_id,conversation_id,agent,message,topic,context,text_with_context,text_without_context,text_with_context_clean,text_with_context_lemm,text_without_context_clean,text_without_context_lemm
0,0,0,agent_1,do you listen to albums?,Music,,do you listen to albums?,do you listen to albums?,listen albums,listen album,listen albums,listen album
1,1,0,agent_2,Yes I do. Have you listened to Hybrid Theory?,Music,do you listen to albums?,Yes I do. Have you listened to Hybrid Theory?d...,Yes I do. Have you listened to Hybrid Theory?,listened hybrid theory listen albums,listen hybrid theory listen album,listened hybrid theory,listen hybrid theory
2,2,0,agent_1,Yes it is the best selling album of the centur...,Music,do you listen to albums? \n Yes I do. Have you...,Yes it is the best selling album of the centur...,Yes it is the best selling album of the centur...,best selling album century alldo listen albums...,selling album century alldo listen album liste...,best selling album century,selling album century
3,3,0,agent_2,True. Do you know who Reel Big FIsh are?,Music,Yes I do. Have you listened to Hybrid Theory? ...,True. Do you know who Reel Big FIsh are?Yes I ...,True. Do you know who Reel Big FIsh are?,true reel big fish listened hybrid theory best...,true reel big fish listen hybrid theory sellin...,true reel big fish,true reel big fish
4,4,0,agent_1,Yes the band that lost the rights to their own...,Music,Yes it is the best selling album of the centur...,Yes the band that lost the rights to their own...,Yes the band that lost the rights to their own...,band lost rights music sadyes best selling alb...,band lose right music sadye selling album cent...,band lost rights music sad,band lose right music sad
...,...,...,...,...,...,...,...,...,...,...,...,...
1146,1157,54,agent_1,I’d like to visit you this summer perhaps in...,Weather_Time,Of course it rains sometimes but I still li...,I’d like to visit you this summer perhaps in...,I’d like to visit you this summer perhaps in...,visit summer perhaps middle july convenient en...,visit summer perhaps middle july convenient en...,visit summer perhaps middle july convenient en...,visit summer perhaps middle july convenient en...
1147,1158,54,agent_2,I think by summer I’ll pass all my exams so I...,Weather_Time,It’s fine and I feel comfortable. \n I’d lik...,I think by summer I’ll pass all my exams so I...,I think by summer I’ll pass all my exams so I...,summer pass exams lot free time fine feel comf...,summer pass exam lot free time fine feel comfo...,summer pass exams lot free time,summer pass exam lot free time
1148,1159,54,agent_1,I’m also well thank you! I like it here tho...,Weather_Time,I’d like to visit you this summer perhaps in...,I’m also well thank you! I like it here tho...,I’m also well thank you! I like it here tho...,well thank though gets chilly humid sometimes ...,well thank though chilly humid sometimes visit...,well thank though gets chilly humid sometimes,well thank though chilly humid sometimes
1149,1160,54,agent_2,You know I’m a Spanish guy.,Weather_Time,I think by summer I’ll pass all my exams so I...,You know I’m a Spanish guy. I think by summe...,You know I’m a Spanish guy.,spanish guy summer pass exams lot free time we...,spanish guy summer pass exam lot free time wel...,spanish guy,spanish guy


## Сохранение

In [436]:
CSV_EXTENSION = '.csv'
EXCEL_EXTENSION = '.xlsx'

In [437]:
def save_dataset(data, file_name, output='csv', start_index=None, end_index=None):
    
    def save(data, file_name=file_name, output=output):
        if output == 'csv':
            data.to_csv(file_name + CSV_EXTENSION, index=False)
        elif output == 'excel':
            data.to_excel(file_name + EXCEL_EXTENSION, index=False)
        else:
            raise ValueError('unknown type')

            
    if start_index is None and end_index is None:
        saving_data = data
    elif start_index is None and end_index is not None:
        saving_data = data[:end_index]
    elif start_index is not None and end_index is None:
        saving_data = data[start_index:]
    else:
        saving_data = data[start_index:end_index]
    
    save(saving_data)

In [438]:
TRAIN_SIZE = 1
TEST_SIZE = 0

In [439]:
data_len = len(data)
train_index = int(data_len * TRAIN_SIZE)
test_index = int(train_index + data_len * TEST_SIZE)

In [440]:
save_dataset(data, 'preparing_train_2')

In [57]:
# save_dataset(data, 'preparing_train', end_index=train_index)

In [58]:
save_dataset(data, '../data/other/temp/preparing_test', start_index=train_index, end_index=test_index)

In [59]:
save_dataset(data, '../data/other/temp/preparing_private_test', start_index=test_index)