# Carga de datos
Los datos de entrenamiento y de test se cargan con Pandas desde archivos CSV.


In [1]:
import nltk
#nltk.download('punkt')
#nltk.download('rslp')
#nltk.download('stopwords')
#nltk.download('omw') #WORDNET
#nltk.download('wordnet')
import pandas as pd
import numpy as np
'''import pyspark
from pyspark import SparkContext

number_cores = 8
memory_gb = 10
conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = SparkContext(appName="Meli", conf=conf)
'''

# Load the labelled training set. The preview printed by this cell shows the
# columns title, label_quality, language and category.
ds = pd.read_csv('../MLChallenge/data/train.csv')
print(ds.head())
# Load the test set from the same data directory.
# NOTE(review): later cells read test rows positionally (index 0 as an id,
# index 1 as the title, index 2 as the language) — confirm against test.csv.
ds_test = pd.read_csv('../MLChallenge/data/test.csv')

                                               title label_quality  \
0  Hidrolavadora Lavor One 120 Bar 1700w  Bomba A...    unreliable   
1                  Placa De Sonido - Behringer Umc22    unreliable   
2               Maquina De Lavar Electrolux 12 Kilos    unreliable   
3  Par Disco De Freio Diant Vent Gol 8v 08/ Frema...    unreliable   
4  Flashes Led Pestañas Luminoso Falso Pestañas P...    unreliable   

     language                   category  
0     spanish  ELECTRIC_PRESSURE_WASHERS  
1     spanish                SOUND_CARDS  
2  portuguese           WASHING_MACHINES  
3  portuguese        VEHICLE_BRAKE_DISCS  
4     spanish            FALSE_EYELASHES  


# Procesamiento de datos

In [2]:
from functools import reduce
import re
from gensim.utils import deaccent
from collections import Counter

# Translation table for str.translate: every punctuation mark, tab and newline
# listed here is replaced by a single space.
trans = {ord(c): ' ' for c in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'}
# The patterns are raw strings so that regex escapes such as \d and \. reach
# the regex engine untouched; in a plain string literal Python reports them as
# invalid escape sequences.
# Decimal numbers written with a dot or a comma, e.g. "1.5" or "1,5".
ref = re.compile(r"\d+[\.,]\d+")
# Any run of digits.
red = re.compile(r"\d+")
# Any token that contains at least one ASCII upper-case letter.
rem = re.compile(r".*[A-Z]+.*")
# Snowball stemmers: keep only the bound .stem method of each language's stemmer.
s_stem = nltk.stem.SnowballStemmer('spanish').stem
# Alternative Portuguese stemmer (RSLP), left disabled:
#p_stem = nltk.stem.RSLPStemmer().stem
p_stem = nltk.stem.SnowballStemmer('portuguese').stem

# Stop-word lists converted to sets for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand.
s_sw = set(nltk.corpus.stopwords.words('spanish'))
p_sw = set(nltk.corpus.stopwords.words('portuguese'))

from nltk.tokenize import word_tokenize


def proc_text(text, stem, sw):
    """Normalise one title into a space-separated string of stemmed tokens.

    Steps, in order: lower-case, replace decimal numbers by ``FLOAT`` and the
    remaining digit runs by ``INT``, turn punctuation into spaces, split on
    whitespace, drop stop words, stem every token that has no upper-case
    letter (so the ``FLOAT``/``INT`` placeholders are left as they are), join
    with single spaces and strip accents.

    Parameters
    ----------
    text : str
        Raw title.
    stem : callable
        Maps one token to its stem.
    sw : container
        Stop words; tokens found in it are discarded.

    Returns
    -------
    str
        The normalised title, or ``'ART'`` when no token survives.
    """
    lowered = text.lower()
    # Decimals are replaced first so "1.5" becomes one FLOAT, not "INT.INT".
    with_placeholders = red.sub('INT', ref.sub('FLOAT', lowered))
    tokens = with_placeholders.translate(trans).split()

    kept = []
    for token in tokens:
        if token in sw:
            continue
        # Upper-case letters can only come from the placeholders inserted
        # above, because the text was lower-cased beforehand.
        if rem.match(token) is None:
            kept.append(stem(token))
        else:
            kept.append(token)

    joined = ' '.join(kept)
    if not joined:
        joined = 'ART'
    return deaccent(joined)

def process_words(x):
    """Pre-process the title of one training row.

    ``x`` is read positionally: index 0 is the title and index 2 the language.
    Rows whose language is ``'spanish'`` use the Spanish stemmer and stop
    words; every other row uses the Portuguese ones.

    Returns a one-element list holding the processed text.
    """
    is_spanish = x[2] == 'spanish'
    stem, sw = (s_stem, s_sw) if is_spanish else (p_stem, p_sw)
    return [proc_text(x[0], stem, sw)]

def process_words_test(x):
    """Pre-process the title of one test row.

    ``x`` is read positionally: index 1 is the title and index 2 the language.
    Rows whose language is ``'spanish'`` use the Spanish stemmer and stop
    words; every other row uses the Portuguese ones.

    Returns a one-element list holding the processed text.
    """
    is_spanish = x[2] == 'spanish'
    stem, sw = (s_stem, s_sw) if is_spanish else (p_stem, p_sw)
    return [proc_text(x[1], stem, sw)]


def count_words(x):
    """Count, for every word, the number of posts that contain it.

    Parameters
    ----------
    x : iterable of iterables
        One collection of tokens per post.

    Returns
    -------
    collections.Counter
        Maps each word to the number of posts in which it appears; a word
        repeated inside a single post is counted once for that post.
    """
    doc_freq = Counter()
    for tokens in tqdm(x):
        # Deduplicate first so each post contributes at most 1 per word.
        doc_freq.update(set(tokens))
    return doc_freq


'''def count_words(x):
    def merge(x, y):
        res = {}
        for k in (x.keys() - y.keys()):
            res[k] = x[k]
        for k in (y.keys() - x.keys()):
            res[k] = y[k]
        for k in (x.keys() & y.keys()):
            res[k] = x[k] + y[k]
        return res
    return reduce(merge,map(lambda x:{w:1 for w in set(x)}, x))
'''



'def count_words(x):\n    def merge(x, y):\n        res = {}\n        for k in (x.keys() - y.keys()):\n            res[k] = x[k]\n        for k in (y.keys() - x.keys()):\n            res[k] = y[k]\n        for k in (x.keys() & y.keys()):\n            res[k] = x[k] + y[k]\n        return res\n    return reduce(merge,map(lambda x:{w:1 for w in set(x)}, x))\n'

In [3]:
# Sanity check: run the pre-processing on the first training row and display the result.
process_words(ds.values[0,:])

['hidrolav lavor one INT bar INTw bomb alumini itali']

In [4]:
from tqdm import tqdm
import pickle
import os.path


# Train set: reuse the cached pre-processed titles when the cache file exists,
# otherwise compute them once and write the cache.
if os.path.exists('post_train.csv'):
    print('Loading train...')
    # dtype=str + keep_default_na=False keep every cached title as the exact
    # string that was written. With pandas' defaults, a title that
    # pre-processes to a reserved NA marker such as "null" is parsed as a
    # float NaN, so the cached path would yield different tokens than the
    # freshly computed path.
    ds_train = pd.read_csv('post_train.csv', dtype=str, keep_default_na=False)
    post_proc = ds_train.values.tolist()
else:
    post_proc = [process_words(x) for x in tqdm(ds.values)]
    ds_train = pd.DataFrame(data=np.asarray(post_proc), columns=['text'])
    ds_train.to_csv('post_train.csv', index=False)

del ds_train
# Tokenise in place, row by row. This is done the ugly way for memory reasons:
# it could be written inline, but that would require duplicating the structures.
for i in tqdm(range(len(post_proc))):
    post_proc[i] = str(post_proc[i][0]).split()

# Test set: same cache-or-compute logic as for the train set.
if os.path.exists('post_test.csv'):
    print('Loading test...')
    # Same NA-safe read as for the train cache.
    ds_test_a = pd.read_csv('post_test.csv', dtype=str, keep_default_na=False)
    post_proc_test = ds_test_a.values.tolist()
else:
    post_proc_test = [process_words_test(x) for x in tqdm(ds_test.values)]
    ds_test_a = pd.DataFrame(data=np.asarray(post_proc_test), columns=['text'])
    ds_test_a.to_csv('post_test.csv', index=False)

del ds_test_a
# Tokenise in place, for the same memory reasons as above.
for i in tqdm(range(len(post_proc_test))):
    post_proc_test[i] = str(post_proc_test[i][0]).split()

Loading train...


100%|██████████████████████████████████████████████████████████████████| 20000000/20000000 [00:30<00:00, 659773.70it/s]


Loading test...


100%|██████████████████████████████████████████████████████████████████████| 246955/246955 [00:00<00:00, 478705.08it/s]


## Separando datasets SP-PT

In [5]:
def separate_train(ds, x):
    """Split the training rows and their processed texts by language.

    The lists are built by appending instead of being pre-sized from
    ``value_counts()``. Pre-sizing raised ``KeyError`` when one language was
    absent and ``IndexError`` when a row had a language other than
    'spanish'/'portuguese' (or a missing one), because such rows are routed
    to the Portuguese group but were not counted for it.

    Parameters
    ----------
    ds : pandas.DataFrame
        Training frame. Each row of ``ds.values`` is read positionally:
        index 1 is the label quality, index 2 the language and index 3 the
        category.
    x : sequence
        Processed texts, aligned row by row with ``ds``.

    Returns
    -------
    tuple
        ``((x_sp, quality_sp, y_sp), (x_pt, quality_pt, y_pt))``. Rows whose
        language is ``'spanish'`` go to the first group, every other row goes
        to the second; the original row order is kept inside each group.
    """
    x_new_sp, quality_sp, y_sp = [], [], []
    x_new_pt, quality_pt, y_pt = [], [], []

    for row, s in tqdm(zip(ds.values, x), total=len(x)):
        if row[2] == 'spanish':
            texts, qualities, labels = x_new_sp, quality_sp, y_sp
        else:
            texts, qualities, labels = x_new_pt, quality_pt, y_pt
        texts.append(s)
        qualities.append(row[1])
        labels.append(row[3])
    return (x_new_sp, quality_sp, y_sp), (x_new_pt, quality_pt, y_pt)


def separate_test(ds, x):
    """Split the test rows and their processed texts by language.

    The lists are built by appending instead of being pre-sized from
    ``value_counts()``. Pre-sizing raised ``KeyError`` when one language was
    absent and ``IndexError`` when a row had a language other than
    'spanish'/'portuguese' (or a missing one), because such rows are routed
    to the Portuguese group but were not counted for it.

    Parameters
    ----------
    ds : pandas.DataFrame
        Test frame. Each row of ``ds.values`` is read positionally: index 0
        is kept as the row identifier and index 2 is the language.
    x : sequence
        Processed texts, aligned row by row with ``ds``.

    Returns
    -------
    tuple
        ``((x_sp, idx_sp), (x_pt, idx_pt))``. Rows whose language is
        ``'spanish'`` go to the first group, every other row goes to the
        second; the original row order is kept inside each group.
    """
    x_new_sp, idx_sp = [], []
    x_new_pt, idx_pt = [], []

    for row, s in tqdm(zip(ds.values, x), total=len(x)):
        if row[2] == 'spanish':
            texts, ids = x_new_sp, idx_sp
        else:
            texts, ids = x_new_pt, idx_pt
        texts.append(s)
        ids.append(row[0])
    return (x_new_sp, idx_sp), (x_new_pt, idx_pt)


In [6]:
# Split train into (texts, label qualities, categories) per language, and test
# into (texts, row ids) per language.
(x_sp, q_sp, y_sp), (x_pt, q_pt, y_pt) = separate_train(ds, post_proc)
(x_test_sp, idx_sp), (x_test_pt, idx_pt) = separate_test(ds_test, post_proc_test)

100%|██████████████████████████████████████████████████████████████████| 20000000/20000000 [00:21<00:00, 927348.56it/s]
100%|██████████████████████████████████████████████████████████████████████| 246955/246955 [00:00<00:00, 804356.42it/s]


In [7]:
# Drop the names of the unsplit frames and token lists; only the per-language
# variables are used from here on. Re-running earlier cells that read these
# names requires reloading the data first.
del ds
del ds_test
del post_proc
del post_proc_test

In [8]:
def filter_words(c, m, min_len=0):
    """Select the words that are frequent enough and long enough.

    Parameters
    ----------
    c : mapping
        Word -> count.
    m : number
        Count threshold; a word is kept only when its count is strictly
        greater than ``m``.
    min_len : int, optional
        Length threshold; a word is kept only when it has strictly more than
        ``min_len`` characters.

    Returns
    -------
    set
        The words that pass both thresholds.
    """
    return {word for word, freq in c.items() if freq > m and len(word) > min_len}


def non_empty_post(post, words):
    """Count the posts that contain at least one word of ``words``.

    Parameters
    ----------
    post : iterable of iterables
        One collection of tokens per post.
    words : container
        Vocabulary used for the membership test.

    Returns
    -------
    int
        Number of posts with at least one token found in ``words``.
    """
    # any() stops at the first matching token, like the break in a manual loop.
    return sum(1 for tokens in post if any(token in words for token in tokens))

def how_many_word(x, c=None, mini=2, maxi=100):
    """Print how vocabulary size and post coverage change with the count threshold.

    For every threshold ``i`` from ``mini`` to ``maxi`` (inclusive) one line
    is printed with the number of words whose count is above ``i`` and the
    number of posts that still contain at least one of those words.

    The leftover debug ``print(c)`` was removed: it dumped the whole counter
    (or ``None``) before the report.

    Parameters
    ----------
    x : iterable of iterables
        One collection of tokens per post.
    c : mapping, optional
        Pre-computed word -> count mapping; computed with ``count_words(x)``
        when omitted.
    mini, maxi : int, optional
        First and last threshold to report.

    Returns
    -------
    mapping
        The word counts that were used (``c`` itself when it was supplied).
    """
    if c is None:
        c = count_words(x)
    print('Words: {}'.format(len(c)))
    for i in range(mini, maxi + 1):
        w = filter_words(c, i)
        print('Repeats: {} Words:{} Posts: {}'.format(i, len(w), non_empty_post(x, w)))
    return c

In [9]:
# Per-language counts: for each word, the number of posts containing it.
c_sp = count_words(x_sp)
c_pt = count_words(x_pt)

100%|██████████████████████████████████████████████████████████████████| 10000000/10000000 [00:52<00:00, 188814.42it/s]
100%|██████████████████████████████████████████████████████████████████| 10000000/10000000 [00:42<00:00, 233417.79it/s]


## Preparación de los datos para entrenar el modelo

In [10]:
from scipy.sparse import dok_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
import numpy as np


def map_words(c, min_rep=24, min_len=0):
    """Build the vocabulary: a word -> integer id mapping.

    A word is kept when its count is strictly greater than ``min_rep`` and it
    has strictly more than ``min_len`` characters (see ``filter_words``).

    The kept words are sorted before ids are assigned. Enumerating the set
    directly gives an order that depends on Python's per-process string hash
    seed, so the same data could produce different ids on every kernel start.

    Parameters
    ----------
    c : mapping
        Word -> count.
    min_rep : number, optional
        Count threshold.
    min_len : int, optional
        Length threshold.

    Returns
    -------
    dict
        Word -> id, with ids ``0 .. n-1`` assigned in sorted word order.
    """
    return {w: i for i, w in enumerate(sorted(filter_words(c, min_rep, min_len)))}


def word_vectors(posts, words):
    """Encode every post as a list of vocabulary ids.

    Tokens that are not in ``words`` are dropped. Each kept token is replaced
    by its id plus one, so the emitted ids start at 1.
    NOTE(review): id 0 is presumably reserved for padding downstream — confirm.

    Parameters
    ----------
    posts : iterable of iterables
        One collection of tokens per post.
    words : mapping
        Word -> id.

    Returns
    -------
    list of list
        One list of shifted ids per post, in the original token order.
    """
    encoded = []
    for tokens in tqdm(posts):
        ids = []
        for token in tokens:
            if token in words:
                ids.append(words[token] + 1)
        encoded.append(ids)
    return encoded


def map_classes(y):
    """Build a class label -> integer id mapping from the labels in ``y``.

    The distinct labels are sorted before ids are assigned. Enumerating the
    set directly gives an order that depends on Python's per-process string
    hash seed, so the same data could produce different class ids on every
    kernel start. ``key=str`` keeps the sort usable if labels of different
    types ever appear together.

    Parameters
    ----------
    y : iterable
        Class labels, one per sample (repetitions allowed).

    Returns
    -------
    dict
        Label -> id, with ids ``0 .. n-1`` assigned in sorted label order.
    """
    return {k: i for i, k in tqdm(enumerate(sorted(set(y), key=str)))}


def y_create(y, classes):
    """Translate class labels into their ids.

    Parameters
    ----------
    y : iterable
        Class labels, one per sample.
    classes : mapping
        Label -> id; every label in ``y`` must be a key.

    Returns
    -------
    numpy.ndarray
        The ids, in the same order as ``y``.
    """
    ids = []
    for label in tqdm(y):
        ids.append(classes[label])
    return np.asarray(ids)


def weights(quality):
    """Turn label-quality flags into per-sample weights.

    Samples flagged ``'reliable'`` get weight 1; every other value gets 0.5.

    Parameters
    ----------
    quality : iterable
        Label-quality value of each sample.

    Returns
    -------
    numpy.ndarray
        One weight per sample, in the same order as ``quality``.
    """
    values = []
    for flag in tqdm(quality):
        if flag == 'reliable':
            values.append(1)
        else:
            values.append(0.5)
    return np.asarray(values)

In [11]:
# Encode the Spanish training data; the prints label the progress bars below.
print('Spanish')
print('Words')
# Vocabulary: words whose count is above 8 and that have more than 1 character.
words_sp = map_words(c_sp, 8, 1)
print('X')
# Replace tokens by vocabulary ids. x_sp is overwritten, so re-running this
# cell alone would encode already-encoded data.
x_sp = word_vectors(x_sp, words_sp)
print('Classes')
classes_sp = map_classes(y_sp)
print('y')
# y_sp is overwritten too: labels become class ids.
y_sp = y_create(y_sp, classes_sp)
print('Weights')
w_sp = weights(q_sp)

Spanish
Words
X


100%|███████████████████████████████████████████████████████████████████| 10000000/10000000 [02:27<00:00, 67871.63it/s]


Classes


1574it [00:00, 261800.95it/s]


y


100%|█████████████████████████████████████████████████████████████████| 10000000/10000000 [00:05<00:00, 1878628.69it/s]


Weights


100%|█████████████████████████████████████████████████████████████████| 10000000/10000000 [00:03<00:00, 3073143.95it/s]


In [12]:
# Encode the Portuguese training data; the prints label the progress bars below.
print('Portuguese')
print('Words')
# Vocabulary: words whose count is above 8 and that have more than 1 character.
words_pt = map_words(c_pt, 8, 1)
print('X')
# Replace tokens by vocabulary ids. x_pt is overwritten, so re-running this
# cell alone would encode already-encoded data.
x_pt = word_vectors(x_pt, words_pt)
print('Classes')
classes_pt = map_classes(y_pt)
print('y')
# y_pt is overwritten too: labels become class ids.
y_pt = y_create(y_pt, classes_pt)
print('Weights')
w_pt = weights(q_pt)

Portuguese
Words
X


100%|███████████████████████████████████████████████████████████████████| 10000000/10000000 [09:32<00:00, 17469.29it/s]


Classes


1576it [00:00, 225097.84it/s]


y


100%|█████████████████████████████████████████████████████████████████| 10000000/10000000 [00:04<00:00, 2372360.26it/s]


Weights


100%|█████████████████████████████████████████████████████████████████| 10000000/10000000 [00:03<00:00, 3017491.40it/s]


In [13]:
# Encode the Spanish test posts with the vocabulary built from the Spanish training data.
x_test_sp = word_vectors(x_test_sp, words_sp)

100%|███████████████████████████████████████████████████████████████████████| 124987/124987 [00:01<00:00, 95262.12it/s]


In [14]:
# Encode the Portuguese test posts with the vocabulary built from the Portuguese training data.
x_test_pt = word_vectors(x_test_pt, words_pt)

100%|██████████████████████████████████████████████████████████████████████| 121968/121968 [00:00<00:00, 169394.55it/s]


In [15]:
# Longest encoded post in each split (train sp, train pt, test sp, test pt).
print(max(map(len, x_sp)))
print(max(map(len, x_pt)))
print(max(map(len, x_test_sp)))
print(max(map(len, x_test_pt)))
# Shortest encoded post in each split; 0 means at least one post has no
# token left in the vocabulary.
print(min(map(len, x_sp)))
print(min(map(len, x_pt)))
print(min(map(len, x_test_sp)))
print(min(map(len, x_test_pt)))

32
31
17
17
0
0
0
0


In [16]:
import os
# Output directory for the encoded datasets, relative to the working directory;
# created on first use.
base_dir = 'separated_seq'
if not os.path.exists(base_dir):
    os.mkdir(base_dir)

In [17]:
import json

# Persist the encoded datasets, one set of files per language.
# Every file is opened in a `with` block so its handle is flushed and closed
# deterministically instead of being left to the garbage collector.

# Spanish: encoded posts (train + test) as JSON, labels / sample weights /
# test row ids as a compressed npz, and the two lookup tables as pickles.
with open(os.path.join(base_dir, 'x_sp.json'), 'w', encoding='utf-8') as f:
    json.dump({'x_sp': x_sp, 'x_test_sp': x_test_sp}, f)
np.savez_compressed(os.path.join(base_dir, 'y_w_idx_sp.npz'), y_sp=y_sp, w_sp=w_sp, idx_sp=np.asarray(idx_sp))
with open(os.path.join(base_dir, 'classes_sp.p'), 'wb') as f:
    pickle.dump(classes_sp, f)
with open(os.path.join(base_dir, 'words_sp.p'), 'wb') as f:
    pickle.dump(words_sp, f)

# Portuguese: same layout as the Spanish files.
with open(os.path.join(base_dir, 'x_pt.json'), 'w', encoding='utf-8') as f:
    json.dump({'x_pt': x_pt, 'x_test_pt': x_test_pt}, f)
np.savez_compressed(os.path.join(base_dir, 'y_w_idx_pt.npz'), y_pt=y_pt, w_pt=w_pt, idx_pt=np.asarray(idx_pt))
with open(os.path.join(base_dir, 'classes_pt.p'), 'wb') as f:
    pickle.dump(classes_pt, f)
with open(os.path.join(base_dir, 'words_pt.p'), 'wb') as f:
    pickle.dump(words_pt, f)