In [1]:
import pandas as pd
import numpy as np
from natasha import NamesExtractor
from rutermextract import TermExtractor
import re
from stop_words import get_stop_words
import pymorphy2 as pm2
import nltk
from datetime import datetime
import tzlocal 

In [15]:
# Shared module-level NLP helpers used by the functions defined below.
term_ex = TermExtractor()  # rutermextract extractor, called by find_key_words
names_ex = NamesExtractor()  # natasha extractor, called by find_names
pymorph = pm2.MorphAnalyzer()  # pymorphy2 analyzer, called by normilize_text
# Russian stop-word list, extended with two extra words; read by
# exclude_stop_words.
stop_words = get_stop_words('russian')
stop_words.extend(['ишо', 'вот'])


def find_date(text):
    """Convert a Unix timestamp into a datetime in the local time zone.

    ``text`` is passed through ``float`` first, so numeric values and numeric
    strings are both accepted. The time zone is whatever
    ``tzlocal.get_localzone()`` reports, and it is attached to the returned
    ``datetime``.
    """
    return datetime.fromtimestamp(float(text), tz=tzlocal.get_localzone())


def normilize_text(text):
    """Return ``text`` with every word replaced by its normal form.

    The text is split on whitespace and each word is mapped to the
    ``normal_form`` of the first parse returned by the module-level pymorphy2
    analyzer ``pymorph``. The normal forms are joined with single spaces, so
    the result carries no leading or trailing space and an empty (or
    whitespace-only) input yields an empty string.
    """
    # split() without an argument collapses runs of whitespace, so no empty
    # tokens are ever handed to the analyzer.
    return ' '.join(
        pymorph.parse(word)[0].normal_form for word in text.split()
    )


def generate_collocations(tokens):
    """Return up to 10 top-ranked trigram collocations found in ``tokens``.

    Trigrams are ranked with Student's t score. The result is a list of
    single-entry dicts, each mapping a trigram tuple to the number of times
    it occurs in ``tokens``, in ranking order.
    """
    # The finder counts trigrams, so the scoring function has to come from
    # the trigram measures: the bigram variant normalises the expected count
    # for n-grams of order 2 and therefore ranks trigrams incorrectly.
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
    top_trigrams = finder.nbest(trigram_measures.student_t, 10)
    return [{trigram: finder.ngram_fd[trigram]} for trigram in top_trigrams]


def find_names(text):
    """Extract person names mentioned in ``text``.

    Every match produced by the module-level ``names_ex`` extractor is turned
    into a single string built from the ``first``, ``middle`` and ``last``
    attributes of ``match.fact``, in that order. Missing (``None`` or empty)
    parts are skipped, and the remaining parts are joined with single spaces.
    A match with no parts at all contributes an empty string.

    Returns a list with one string per match.
    """
    names = []
    for match in names_ex(text):
        fact = match.fact
        parts = (fact.first, fact.middle, fact.last)
        # Joining only the parts that are present avoids the doubled or
        # trailing spaces left by formatting None and deleting the word
        # afterwards, and keeps names that themselves contain "None" intact.
        names.append(' '.join(str(part) for part in parts if part))
    return names


def find_key_words(text):
    """Return the key terms of ``text`` as one space-separated string.

    The module-level ``term_ex`` extractor is called with ``limit=3`` and the
    ``normalized`` attribute of each term it yields is joined with spaces.
    """
    return ' '.join(term.normalized for term in term_ex(text, limit=3))


def exclude_stop_words(text):
    """Drop every word of ``text`` that appears in ``stop_words``.

    The text is split on whitespace; the surviving words are joined back
    together with single spaces. Matching is exact (case-sensitive).
    """
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)


# Matches every run of characters that are not lower-case Russian letters.
# "ё" lies outside the contiguous а-я code-point range and is listed
# explicitly so that it is not stripped from words.
_NON_RUSSIAN_LETTERS = re.compile('[^а-яё]+')


def text_prepocess(text):
    """Clean a comment for keyword extraction.

    Steps, in order:

    1. lower-case the text;
    2. replace every run of characters other than Russian letters with a
       single space;
    3. remove stop words via ``exclude_stop_words``;
    4. drop words of two characters or fewer.

    Returns the remaining words joined with single spaces.
    """
    # Lower-casing happens first so that capitalised occurrences of stop
    # words (e.g. "Вот") match lower-case entries such as "вот".
    text = text.lower()
    # Substituting a space (rather than nothing) keeps words separated only
    # by punctuation or a line break from being fused into one token.
    text = _NON_RUSSIAN_LETTERS.sub(' ', text)
    text = exclude_stop_words(text)
    return ' '.join(word for word in text.split() if len(word) > 2)


def make_data(path):
    """Load the comments CSV at ``path`` and derive the analysis columns.

    Only the ``commentlikescount``, ``commenttext`` and ``post_date`` columns
    of the file are kept. The returned DataFrame has, in this order:

    * ``commentlikescount`` - unchanged;
    * ``commenttext`` - the comment cast to ``str`` and cleaned with
      ``text_prepocess``;
    * ``post_date`` - converted with ``find_date``;
    * ``names`` - list produced by ``find_names`` from the uncleaned comment;
    * ``keywords`` - ``find_key_words`` applied to the cleaned comment, then
      passed through ``normilize_text``.
    """
    columns = ['commentlikescount', 'commenttext', 'post_date']
    # Take an explicit copy: the column assignments below must write to an
    # independent frame, not to a selection of the full CSV (which can
    # trigger pandas' SettingWithCopyWarning).
    data = pd.read_csv(path)[columns].copy()

    raw_text = data['commenttext'].apply(str)
    # Names are extracted from the raw text, before cleaning lower-cases it
    # and removes non-Cyrillic characters.
    data['names'] = raw_text.apply(find_names)
    data['commenttext'] = raw_text.apply(text_prepocess)
    data['keywords'] = (
        data['commenttext'].apply(find_key_words).apply(normilize_text)
    )
    data['post_date'] = data['post_date'].apply(find_date)

    return data

    



In [16]:
# NOTE(review): hard-coded absolute path under a specific user's home
# directory; point it at the local copy of rtfull.csv before running.
path = "/Users/ba/Documents/DATA_CULTURE_HACK/rtfull.csv"

In [None]:
# Build the processed comments table from the CSV at `path`.
new_data = make_data(path)

  if (yield from self.run_code(code, result)):
