# Text processing and feature extraction

Regular env preparations.

In [1]:
import os
import os.path
import pandas as pd
import datetime
import matplotlib
import pickle
import numpy as np
from sklearn.svm import SVC
%matplotlib inline

In [2]:
matplotlib.rcParams['figure.figsize'] = (14.0, 7.0)

In [3]:
DATA_PATH = 'D://DSworkshop//Avito'

In [4]:
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))

Let's see some data!

In [5]:
train.head(10)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797
5,51e0962387f7,bbfad0b1ad0a,Татарстан,Чистополь,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Авто люлька,В хорошем состоянии,1300.0,9,2017-03-28,Private,eb6ad1231c59d3dc7e4020e724ffe8e4d302023ddcbb99...,796.0,0.80323
6,c4f260a2b48a,08f469d2e6f7,Нижегородская область,Нижний Новгород,Для дома и дачи,Ремонт и строительство,Сантехника и сауна,,,Водонагреватель 100 литров нержавейка плоский,Электро водонагреватель накопительный на 100 л...,11000.0,125,2017-03-23,Private,0330f6ac561f5db1fa8226dd5e7e127b5671d44d075a98...,2823.0,0.0
7,6b71309d6a8a,fef86baa002c,Пермский край,Пермь,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Джинсы,26,Бойфренды colins,Бойфренды в хорошем состоянии.,500.0,61,2017-03-25,Private,9bab29a519e81c14f4582024adfebd4f11a4ac71d323a6...,567.0,0.80323
8,c5b969cb63a2,055825270190,Оренбургская область,Оренбург,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Платья и юбки,> 50 (XXL),Платье,54 раз мер очень удобное,500.0,85,2017-03-17,Private,75ce06d1f939a31dfb2af8ac55f08fa998fa336d13ee05...,415.0,0.0
9,b1570962e68c,f9e8f831d94c,Нижегородская область,Нижний Новгород,Личные вещи,Детская одежда и обувь,Для девочек,Обувь,25,Полу ботиночки замш натур.Бамбини,По стельке 15.5см мерить приокский район. Цвет...,400.0,136,2017-03-22,Company,54fb8521135fda77a860bfd2fac6bf46867ab7c06796e3...,46.0,0.0


How many unique categories and parent categories do we have?

In [6]:
print ("Num of parent categories: ",len(train["parent_category_name"].unique()))
print ("Num of categories: ",len(train["category_name"].unique()))

Num of parent categories:  9
Num of categories:  47


How many unique (distinct) "param"s we have?

In [7]:
print("Num of distinct params: "
      ,len(set(train["param_1"].tolist()) | set(train["param_2"].tolist()) | set(train["param_3"].tolist())))

Num of distinct params:  1841


Nice. We think that combining those with the "params" attributes can generate the most informative features about of the data. But in order to engeneer those, we need to "categorize" it somehow. Lets see how many ads do not have any param mentioned within it.

In [10]:
ads_with_absent_params = len(train[train["param_1"].isnull() & train["param_2"].isnull() & train["param_3"].isnull()])
total_ads = train.shape[0]
print ("Absent params:",ads_with_absent_params)
print ("Total ads:",total_ads)
print ("Ratio of ads with no params at all:",100*ads_with_absent_params / float(total_ads),"%")

Absent params: 61576
Total ads: 1503424
Ratio of ads with no params at all: 4.09571750883317 %


## Basic text feature extraction

### Text length

In [34]:
def merge_params(row):
    sentence = ""
    for att in ["param_1","param_2","param_3"]:
        if not pd.isnull(row[att]):
            sentence += row[att] + " "
    return sentence

def count_words_safe(s):
    if pd.isnull(s):
        return 0
    return len(s.split())

train['title_word_count'] = train.apply(lambda row:count_words_safe(row['title']), axis=1)
train['description_word_count'] = train.apply(lambda row:count_words_safe(row['description']), axis=1)
train['params_total_word_count'] = train.apply(lambda row:count_words_safe(merge_params(row)), axis=1)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability,title len,description len,params total len
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),...,400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789,3,7,2
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,...,3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0,3,7,1
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,...,4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177,2,17,5
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,...,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323,1,3,2
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",...,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797,3,4,5


In [45]:
train.describe()

Unnamed: 0,price,item_seq_number,image_top_1,deal_probability,title len,description len,params total len,title capital letters ratio,description capital letters ratio
count,1418062.0,1503424.0,1390836.0,1503424.0,1503424.0,1503424.0,1503424.0,1503424.0,10.0
mean,316708.1,743.674,1241.932,0.1391306,3.32542,25.12265,3.590326,0.0896385,0.037304
std,66891540.0,5572.522,970.4641,0.2600785,1.847104,40.27831,2.323865,0.0706203,0.017874
min,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,500.0,9.0,425.0,0.0,2.0,6.0,2.0,0.04347826,0.03141
50%,1300.0,29.0,1057.0,0.0,3.0,12.0,3.0,0.06666667,0.04211
75%,7000.0,88.0,2217.0,0.15087,4.0,26.0,5.0,0.122449,0.047949
max,79501010000.0,204429.0,3066.0,1.0,21.0,711.0,9.0,1.0,0.060606


### Capital letters ratio in text columns

In [54]:
def count_capitals_ratio_safe(s):
    if pd.isnull(s) or len(s)==0:
        return 0
    s = s.replace(" ","")
    return len([l for l in s if l.isupper()])/len(s)

train['title capital letters ratio'] = train.apply(lambda row:count_capitals_ratio_safe(row['title']), axis=1)
train['description capital letters ratio'] = train.apply(lambda row:count_capitals_ratio_safe(row['description']), axis=1)

In [48]:
train.describe()

Unnamed: 0,price,item_seq_number,image_top_1,deal_probability,title len,description len,params total len,title capital letters ratio,description capital letters ratio
count,1418062.0,1503424.0,1390836.0,1503424.0,1503424.0,1503424.0,1503424.0,1503424.0,1503424.0
mean,316708.1,743.674,1241.932,0.1391306,3.32542,25.12265,3.590326,0.0896385,0.04231141
std,66891540.0,5572.522,970.4641,0.2600785,1.847104,40.27831,2.323865,0.0706203,0.07094886
min,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,500.0,9.0,425.0,0.0,2.0,6.0,2.0,0.04347826,0.01530612
50%,1300.0,29.0,1057.0,0.0,3.0,12.0,3.0,0.06666667,0.02898551
75%,7000.0,88.0,2217.0,0.15087,4.0,26.0,5.0,0.122449,0.04761905
max,79501010000.0,204429.0,3066.0,1.0,21.0,711.0,9.0,1.0,1.0


Lets see for example how CAPS only ads reflect deal_probability:

In [66]:
train[train['deal_probability']>0.8].corr()

Unnamed: 0,price,item_seq_number,image_top_1,deal_probability,title len,description len,params total len,title capital letters ratio,description capital letters ratio
price,1.0,-0.000214,0.001594,0.010461,-0.002433,0.001805,-0.002446,0.001785,-0.001891
item_seq_number,-0.000214,1.0,0.004064,-0.0099,0.046895,0.048347,-0.020132,0.013317,0.023082
image_top_1,0.001594,0.004064,1.0,0.389754,0.031059,0.053747,-0.37555,0.018078,0.033435
deal_probability,0.010461,-0.0099,0.389754,1.0,0.050423,0.107157,-0.044246,-0.104329,0.041476
title len,-0.002433,0.046895,0.031059,0.050423,1.0,0.270894,0.003494,-0.338729,0.008214
description len,0.001805,0.048347,0.053747,0.107157,0.270894,1.0,-0.062742,-0.03048,-0.021094
params total len,-0.002446,-0.020132,-0.37555,-0.044246,0.003494,-0.062742,1.0,-0.059104,-0.0083
title capital letters ratio,0.001785,0.013317,0.018078,-0.104329,-0.338729,-0.03048,-0.059104,1.0,0.036156
description capital letters ratio,-0.001891,0.023082,0.033435,0.041476,0.008214,-0.021094,-0.0083,0.036156,1.0


In [68]:
train[train['title capital letters ratio'] < 0.1].corr()

Unnamed: 0,price,item_seq_number,image_top_1,deal_probability,title len,description len,params total len,title capital letters ratio,description capital letters ratio
price,1.0,0.079868,0.05326,-0.013547,0.080159,0.068962,-0.031698,-0.091642,-0.005817
item_seq_number,0.079868,1.0,0.100867,-0.038214,0.132168,0.123762,-0.048148,-0.166012,0.017569
image_top_1,0.05326,0.100867,1.0,0.188336,0.224237,0.187152,-0.553924,-0.172553,0.050671
deal_probability,-0.013547,-0.038214,0.188336,1.0,-0.004503,-0.002538,-0.117534,-0.019726,-0.001099
title len,0.080159,0.132168,0.224237,-0.004503,1.0,0.282934,-0.14254,-0.558967,0.006504
description len,0.068962,0.123762,0.187152,-0.002538,0.282934,1.0,-0.156659,-0.161931,0.028826
params total len,-0.031698,-0.048148,-0.553924,-0.117534,-0.14254,-0.156659,1.0,0.055904,-0.03645
title capital letters ratio,-0.091642,-0.166012,-0.172553,-0.019726,-0.558967,-0.161931,0.055904,1.0,0.025068
description capital letters ratio,-0.005817,0.017569,0.050671,-0.001099,0.006504,0.028826,-0.03645,0.025068,1.0


We can see a (very) slight correlation between description_len, and a (slight) negative correlation between deal_probability and "title capital letters ratio".

Iterestingly we also see a negative correlation between "title capital letters ratio" and "image_top_1".

## N-grams count

### Unigrams (bag of words)

Create a bag of words

### Bigrams

### Trigrams

## Tagging and tags analysis

## Semantic analysis

### Is the product popular (how many buy this product)?

We first need somehow to determine what is the product being sold. We'll do this by recognizing the "head" noun in the title.

### Is the product new / good shape?

Look for words with similar embeddings and semantic tags (k, learn this k).

Find a way to enrich popular products lines (stuff like avg. price and/or product condition semantics - see below)

## Word embeddings for clustering params and other keywords.

As most of the ads have set "param", we turn to use word embeddings to represent the "params" description. We've downloaded a ready-to-use russian word embeddings from "Polyglot":
https://sites.google.com/site/rmyeid/projects/polyglot.
Let's load those:

In [41]:
with open(os.path.join(DATA_PATH, 'polyglot-ru.pkl'), 'rb') as f:
    u = pickle._Unpickler(f)
    u.encoding = 'latin1'
    words, embeddings = u.load()
print("Emebddings shape is {}".format(embeddings.shape))

Emebddings shape is (100004, 64)


Some othe Russian word2vec to explore later on:
https://github.com/nlpub/russe-evaluation/tree/master/russe/measures/word2vec

Let's see how many of the (meaningful) words from params appear in the wrodEmbeddings dictionary.
We first filter out Stopwords. We've examined 3 lists of stopwords:
* https://gist.github.com/menzenski/7047705
* https://www.ranks.nl/stopwords/russian
* https://www.symantec.com/connect/downloads/russian-stop-words-list

After consulting with our russian linguist, we've picked the third option because it had the least amount of words (the first two would filter out important semantic info).

In [36]:
stopwords = set()
import codecs
with codecs.open(os.path.join(DATA_PATH, 'stopwords_ru.txt'), encoding='cp1251') as ins:
    for w in ins:
        word = w.strip("\r\n")
        word = word.strip("\n")
        stopwords.add(word)
        stopwords.add(word.upper())
        stopwords.add(word[0].upper() + word[1:])

Now we proceed to check how many meaningful words appear in "params".

In [42]:
# Returns true iff *any* word in sentense is present in words.
def param_meaningfull_word_ratio(row):
    sentence = ""
    for att in ["param_1","param_2","param_3"]:
        if not pd.isnull(row[att]):
            sentence += row[att] + " "

    count = 0
    present = 0
    for w in sentence.split():
        count += 1
        if (w in words) and (w not in stopwords):
            present += 1
    return present / float(count)

# We sample 10,000 ads.
N = 10000
train_present_params = train[~(train["param_1"].isnull() & train["param_2"].isnull() & train["param_3"].isnull())].sample(N)
train_present_params['param_1_has_meaningful'] = train_present_params.apply(param_meaningfull_word_ratio, axis=1)
print("We sampled",N,"rows.")
num_meaningful = len(train_present_params[train_present_params['param_1_has_meaningful'] > 0.0])
print(num_meaningful,"Have meaningful words in their params (That's",num_meaningful/float(N)*100,"%).")

We sampled 10000 rows.
7865 Have meaningful words in their params (That's 78.64999999999999 %).
