## Анализ текстов интернет-страниц
### Задача DMP-3643

In [30]:
import datetime
import pandas as pd

In [36]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

# Hive session settings kept as one multi-statement string: vectorized
# execution, MapReduce memory limits, input split sizes, small-file merging,
# dynamic partitioning and output compression.
# NOTE(review): none of the visible notebook cells executes this string;
# presumably it is pasted into a Hive session by hand -- confirm.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
# Make the cell re-runnable: stop the SparkContext left over from a previous
# execution. On the first run `sc` does not exist yet, which raises NameError
# and is deliberately ignored.
try:
    sc.stop()
except NameError:
    pass

# Build the Spark configuration one setting at a time (same keys, values and
# order as before).
# NOTE(review): the Spark configuration docs say spark.driver.memory cannot be
# changed through SparkConf in client mode because the driver JVM is already
# running -- confirm that this setting takes effect here.
conf = SparkConf()
conf.set("spark.executor.instances", 4)
conf.set("spark.driver.maxResultSize", "8g")
conf.set('spark.driver.memory','8g')
conf.set("spark.executor.memory", '4g')
conf.set("spark.yarn.executor.memoryOverhead", 1048)
conf.set("spark.akka.frameSize", '1024')

# Fresh Spark context plus the Hive-enabled SQL context used by later cells.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)


In [None]:
# Provenance of the input file and an earlier loading attempt, kept for reference:
#! mv /data1/share/kosm/data/data_all.csv /data1/share/kosm/data/parsed_urls_cred_scor.csv
#df = pd.read_csv('/data1/share/kosm/data/parsed_urls_cred_scor.csv',sep = '\t',skiprows=2,index_col=0)
#for l in open('/data1/share/kosm/data/parsed_urls_cred_scor.csv','r'):

# Load the crawl results by hand, one row per line of the file.
# The file is opened once inside `with`, so the handle is always closed
# (the previous version opened it twice and never closed either handle).
parsed_urls_path = '/data1/share/kosm/data/parsed_urls_cred_scor.csv'
with open(parsed_urls_path, 'r') as parsed_urls_file:
    # The first line is the header; an empty file yields '' here.
    first_row = next(parsed_urls_file, '')
    # Every later line identical to the header is skipped as well.
    # The first tab-separated field of each row is dropped (presumably a row
    # index, as the commented read_csv call uses index_col=0 -- confirm).
    # Lines are not stripped, so the last field keeps its trailing newline.
    df = pd.DataFrame([e.split('\t')[1:] for e in parsed_urls_file if not e == first_row])
df.columns = 'url init_url response_code title body'.split()

In [None]:
#df['title']

### Предобработка

In [None]:
import re
from pymystem3 import Mystem
from nltk.stem.snowball import SnowballStemmer
import nltk

def preprocess(s):
    """Lower-case ``s`` and replace every run of non-letter characters by one space.

    Letters are the Latin ``a-z`` and the Russian alphabet. ``ё`` is listed
    explicitly because it lies outside the ``а-я`` code-point range; without
    it every ``ё`` was turned into a space and split the word in two.

    Returns the cleaned string; leading/trailing separators become a single
    leading/trailing space (the result is not stripped).
    """
    return re.sub(u'[^а-яёa-z]+', ' ', s.lower())

def stem_ru(s, stemmer):
    """Return ``lemma_POS`` tokens for the words of ``s`` that ``stemmer`` can analyse.

    ``stemmer`` must provide ``analyze(s)`` returning a sequence of dicts
    (a pymystem3.Mystem instance is expected). For every returned item whose
    ``'text'`` is not blank and whose ``'analysis'`` list is present and
    non-empty, the result contains the ``'lex'`` of the first analysis, an
    underscore, and the leading run of upper-case ASCII letters of its
    ``'gr'`` string. Items without an analysis contribute nothing.

    Raises AttributeError if a ``'gr'`` string does not start with an
    upper-case ASCII letter (the regular expression then finds no match).
    """
    tagged_lemmas = []
    for token in stemmer.analyze(s):
        # Whitespace-only fragments between words carry no lemma.
        if len(token['text'].strip()) == 0:
            continue
        # Unanalysed tokens would only produce a bare '_' placeholder, which
        # is never part of the result.
        if 'analysis' not in token or len(token['analysis']) == 0:
            continue
        lemma = token['analysis'][0]['lex']
        pos_tag = re.match('^([A-Z]+)', token['analysis'][0]['gr']).group(0)
        tagged_lemmas.append(lemma + '_' + pos_tag)
    return tagged_lemmas

def stem_eng(s):
    """Tag the English words of ``s`` with their part of speech.

    ``s`` is lower-cased and every run of characters other than ``a-z`` acts
    as a separator. The remaining words are passed to ``nltk.pos_tag`` and
    returned as ``word_TAG`` strings. Despite the name, the words themselves
    are not stemmed.
    """
    tokens = re.sub(u'[^a-z]+', ' ', s.lower()).split()
    tagged = []
    for token, tag in nltk.pos_tag(tokens):
        tagged.append(token + '_' + tag)
    return tagged

def lemmatize_eng(s, stemmer):
    """Return the stems of the English words found in ``s``.

    ``s`` is lower-cased and every run of characters other than ``a-z`` acts
    as a separator; each remaining word is passed through ``stemmer.stem``.
    ``stemmer`` is expected to be an nltk.stem.snowball.SnowballStemmer
    instance, so despite the function name this is stemming, not
    lemmatization.
    """
    cleaned = re.sub(u'[^a-z]+', ' ', s.lower())
    return [stemmer.stem(token) for token in cleaned.split()]




In [None]:
from itertools import chain
from collections import Counter
import sklearn
import cPickle

# Stem every page title: the Russian lemma_POS tokens from Mystem are followed
# by the English Snowball stems of the same title.
stemmer_eng = SnowballStemmer("english")
stemmer_ru = Mystem()
df['title_stemmed'] = df['title'].apply(lambda s: stem_ru(s, stemmer_ru) + lemmatize_eng(s, stemmer_eng))

# Persist the stemmed titles next to their URLs.
df[[u'init_url','title_stemmed']].to_csv('/data1/share/kosm/data/text_analysis/stemmed_titles.csv')
print('titles stemmed')

# Token frequencies over all titles, most frequent first.
# chain.from_iterable avoids unpacking one argument per title.
st_list = list(chain.from_iterable(df['title_stemmed']))
counts = Counter(st_list)
# Indexing the (token, count) pair instead of unpacking it in the lambda
# signature keeps this line valid on both Python 2 and Python 3.
word_stat = sorted(counts.items(), key=lambda item: -item[1])

# Binary mode is what pickle expects, and `with` guarantees the file is
# flushed and closed (the previous handle was never closed).
with open('/data1/share/kosm/data/text_analysis/stemmed_titles.pck', 'wb') as word_stat_file:
    cPickle.dump(word_stat, word_stat_file)

In [None]:
%matplotlib inline
print('\n'.join(u'{}:{}'.format(*e) for e in word_stat[:300]))
len([w for w in word_stat if w[1]>1]) # слова встречающиеся больше раза 113504

#### pymystem3 part-of-speech tags

| Тег | Часть речи |
|---|---|
| A | прилагательное |
| ADV | наречие |
| ADVPRO | местоименное наречие |
| ANUM | числительное-прилагательное |
| APRO | местоимение-прилагательное |
| COM | часть композита - сложного слова |
| CONJ | союз |
| INTJ | междометие |
| NUM | числительное |
| PART | частица |
| PR | предлог |
| S | существительное |
| SPRO | местоимение-существительное |
| V | глагол |

In [None]:
# Part-of-speech tags whose tokens are excluded from the bag of words.
parts_of_speech_to_remove = ['CONJ','PART','PR']
# Tokens seen at most 5 times, plus a fixed list of extra tokens to exclude.
words_to_remove = {token for token, freq in word_stat if freq <= 5}
words_to_remove.update(['not','ru','the','dnslookuperror','timeouterror'])

In [None]:
# print('\n'.join([u'{}:{}'.format(*w) for w in word_stat if '_PR' in w[0]][:100]))

# Title bag of words: keep a token unless its part-of-speech tag is listed in
# parts_of_speech_to_remove, it is in words_to_remove, or it is one character.
# The tag is the text after the last '_' and is compared for equality: the
# previous substring test ('PR' in token) also discarded SPRO, APRO and
# ADVPRO tokens. English stems have no '_' and no upper-case letters, so they
# never match a tag.
df['title_bow'] = df['title_stemmed'].map(lambda stem_list: [
    s for s in stem_list
    if s.rsplit('_', 1)[-1] not in parts_of_speech_to_remove
    and s not in words_to_remove
    and len(s) > 1
])

# URL bag of words: the lower-cased a-z runs of the URL.
# Plain column assignment replaces the deprecated df.ix indexer.
df['url_bow'] = df['init_url'].map(lambda s: re.sub(u'[^a-z]+',' ',str(s).lower()).split())

In [51]:
#df.ix[:,['init_url','title_bow','url_bow']].to_csv('/data1/share/kosm/data/text_analysis/cred_scor_url_titles.csv',index = False)
#print(df.iloc[3,1] )

In [None]:
# Publish the two bag-of-words columns to Hive: each pandas slice becomes a
# Spark DataFrame, is registered as a temporary table, and is then
# materialised as a Hive table.
# Column selection uses df[[...]] (as the stemmed-titles export does) instead
# of df.ix, which is deprecated and removed in pandas 1.0.
df_spark1 = hc.createDataFrame(df[['init_url','url_bow']])
df_spark2 = hc.createDataFrame(df[['init_url','title_bow']])
hc.registerDataFrameAsTable(df_spark1, "df_spark1")
hc.registerDataFrameAsTable(df_spark2, "df_spark2")
hc.sql('create table user_kposminin.cred_scor_url_titles4 as select * from df_spark1')
hc.sql('create table user_kposminin.cred_scor_url_titles5 as select * from df_spark2')
# Join both tables on the URL so each row carries url_bow and title_bow.
hc.sql('''
  create table user_kposminin.cred_scor_url_titles6 as 
  select a.init_url as url,a.url_bow,b.title_bow from 
   user_kposminin.cred_scor_url_titles4 a
   inner join user_kposminin.cred_scor_url_titles5 b on a.init_url = b.init_url
   ''')

# Follow-up Hive statements kept as text:
#   * titles7: title_bow from titles5 with u' / quote / bracket / space
#     characters removed and the remainder split on commas;
#   * titles6 is dropped and rebuilt by joining titles4 with titles7;
#   * titles9: `bow` from titles8 truncated to at most 100 elements with the
#     brickhouse TruncateArrayUDF.
# NOTE(review): this is not a raw string, so Python resolves the backslash
# escapes (\' becomes ', \\[ becomes \[); the text stored in the variable
# therefore differs from what is typed here -- verify before sending it to Hive.
# NOTE(review): cred_scor_url_titles8 is not created in any visible cell, and
# nothing visible executes this string.
ya_queries = '''
create table user_kposminin.cred_scor_url_titles7 as
select 
  init_url,
  split(regexp_replace(title_bow,'(u\')|(\')|"|\\[|\\]| ',''),',') as title_bow
from user_kposminin.cred_scor_url_titles5
;

drop table user_kposminin.cred_scor_url_titles6;
  create table user_kposminin.cred_scor_url_titles6 as 
  select a.init_url as url,a.url_bow,b.title_bow from 
   user_kposminin.cred_scor_url_titles4 a
   inner join user_kposminin.cred_scor_url_titles7 b on a.init_url = b.init_url
;

create temporary function trunc as 'brickhouse.udf.collect.TruncateArrayUDF';

create table user_kposminin.cred_scor_url_titles9 as 
select url, trunc(bow,least(size(bow),100)) as bow
from user_kposminin.cred_scor_url_titles8   
;
'''
# It refuses to be saved to HDFS

In [None]:
#Сохраняем локально
#df.ix[:,['init_url','title_bow','url_bow']].to_csv('/data1/share/kosm/data/text_analysis/cred_scor_url_titles.csv',index = False)