# Funciones y modelos

In [1]:
from funciones import get_nontrivial_words, matriz_confusion, confusiones_principales, categorias_para_mejorar
from modelos import WordMagic, EnsambleSuma

# Leemos los datos

In [2]:
import pandas as pd
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Procesamos los títulos para extraer palabras

In [3]:
from tqdm.auto import tqdm
tqdm.pandas()
df_train["title"] = df_train['title'].progress_apply(get_nontrivial_words)
df_test["title"] = df_test['title'].progress_apply(get_nontrivial_words)

HBox(children=(IntProgress(value=0, max=20000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=246955), HTML(value='')))




# Para validar tomamos un pedazo reliable del mismo tamaño que test

In [4]:
from sklearn.model_selection import train_test_split
X, X_val = train_test_split(df_train, test_size=0.21, random_state=42)

In [5]:
X_val_reliable = X_val[X_val["label_quality"]=="reliable"]

In [6]:
len(X_val_reliable)

248809

# Class weights (elegidos para optimizar balanced accuracy)

In [7]:
def cantidad_apariciones(df):
    f ={}
    l = list(df["category"])
    for c in l:
        if c in f:
            f[c]+=1
        else:
            f[c]=1
    return f

def frequencies(df):
    f = cantidad_apariciones(df)
    for c in f:
        f[c]/=len(df)
    return f

freq = frequencies(df_train)
class_weights = lambda c : 1/freq[c]

# Entrenamos

In [8]:
model1 = WordMagic(
    n = 1,
    word_getter = lambda x : x,
    class_weights = class_weights,
    normalizar_predict_proba = False,
    sacar_repetidos = False,
)
model2 = WordMagic(
    n = 2,
    word_getter = lambda x : x,
    class_weights = class_weights,
    normalizar_predict_proba = False,
)
model3 = WordMagic(
    n = 3,
    word_getter = lambda x : x,
    class_weights = class_weights,
    normalizar_predict_proba = False,
)

model1.fit(X)
model2.fit(X)
model3.fit(X)

ensamble = EnsambleSuma( [model1, model2, model3], pesos = [1,1,1], random_guess = "ICE_CREAM_MACHINES" )

Extrayendo palabras de los títulos:


HBox(children=(IntProgress(value=0, max=15800000), HTML(value='')))


Entrenando modelo en  1 -uplas...


HBox(children=(IntProgress(value=0, max=15800000), HTML(value='')))


Normalizando...


HBox(children=(IntProgress(value=0, max=1761475), HTML(value='')))


Entrenamiento completo.

Extrayendo palabras de los títulos:


HBox(children=(IntProgress(value=0, max=15800000), HTML(value='')))


Entrenando modelo en  2 -uplas...


HBox(children=(IntProgress(value=0, max=15800000), HTML(value='')))


Normalizando...


HBox(children=(IntProgress(value=0, max=59727215), HTML(value='')))


Entrenamiento completo.

Extrayendo palabras de los títulos:


HBox(children=(IntProgress(value=0, max=15800000), HTML(value='')))


Entrenando modelo en  3 -uplas...


HBox(children=(IntProgress(value=0, max=15800000), HTML(value='')))


Normalizando...


HBox(children=(IntProgress(value=0, max=399091098), HTML(value='')))


Entrenamiento completo.



# Score de validación

In [9]:
from sklearn.metrics import balanced_accuracy_score
X_val_reliable["predicted_category"] = ensamble.predict(X_val_reliable)
print("score val: ", balanced_accuracy_score(X_val_reliable["category"], X_val_reliable["predicted_category"]))

HBox(children=(IntProgress(value=0, max=248809), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


score val:  0.884491774075475




# Estudiamos la matriz de confusiones

In [10]:
categories = sorted(list(set(df_train["category"])))
CM = matriz_confusion(X_val_reliable["category"], X_val_reliable["predicted_category"], categories)

  CM = ( N/N.sum(axis=1)[:,None], number_cat)


In [11]:
categorias_para_mejorar(CM,categories, threshold=0.8)

Hay  228 categorias para mejorar.


['ACTION_FIGURES',
 'AFTERSHAVES',
 'AIRGUN_PELLETS',
 'AIR_FRESHENERS',
 'ANTIQUE_CHAIRS',
 'ANTIQUE_TAPE_RECORDERS',
 'APERITIFS',
 'ARTIFICIAL_PLANTS',
 'AUDIO_AMPLIFIERS',
 'AUDIO_AND_VIDEO_CABLES_AND_ADAPTERS',
 'AUTOMOTIVE_ARMRESTS',
 'AUTOMOTIVE_BATTERIES',
 'AUTOMOTIVE_BUMPER_GRILLES',
 'AUTOMOTIVE_CLUTCH_MASTER_CYLINDERS',
 'AUTOMOTIVE_CV_JOINT_BOOTS',
 'AUTOMOTIVE_DOORS',
 'AUTOMOTIVE_FRONT_BUMPERS',
 'AUTOMOTIVE_MIRROR_COVERS',
 'AUTOMOTIVE_SHOCK_ABSORBERS',
 'AUTOMOTIVE_TRANSMISSION_GEARS',
 'BABY_BLANKETS',
 'BARBECUE_TOOL_SETS',
 'BAR_SOAPS',
 'BATHROOM_ACCESSORIES_SETS',
 'BEACH_BALLS',
 'BEACH_PADDLES',
 'BICYCLE_BAGS',
 'BINDING_SPINES',
 'BODY_SKIN_CARE_PRODUCTS',
 'BOOKS',
 'BOXING_HEADGEARS',
 'BRICKS',
 'BUMPER_IMPACT_ABSORBERS',
 'BUTT_PLUGS',
 'CAKE_TOPPERS',
 'CAMERA_CASES',
 'CAMERA_FLASHES',
 'CAMERA_STRAPS',
 'CAMPING_STOVES',
 'CANDIES',
 'CANDLE_HOLDERS',
 'CARDS_AND_INVITATIONS',
 'CAR_AC_HOSE_ASSEMBLIES',
 'CAR_AIR_FRESHENERS',
 'CAR_CENTER_CONSOLES',
 'C

In [12]:
componentes_complicadas = confusiones_principales(CM,categories, threshold = 0.1)
componentes_complicadas

Hay  1293  componentes.
Hay  167  componentes complicadas


[['ACTION_CAMERA_MOUNTS', 'CAMERA_STRAPS', 'DOG_LEASHES'],
 ['ACTION_FIGURES', 'BEACH_PADDLES', 'PADDLE_TENNIS_RACKETS'],
 ['AFTERSHAVES', 'MAKEUP_REMOVERS'],
 ['AIRGUN_PELLETS', 'PAINTBALLS'],
 ['AIR_FRESHENERS',
  'CAR_AIR_FRESHENERS',
  'DEHUMIDIFIERS',
  'ESSENTIAL_OILS',
  'FABRIC_SOFTENERS',
  'OIL_DIFFUSERS',
  'PORTABLE_EVAPORATIVE_AIR_COOLERS',
  'SOLDERING_IRONS',
  'WELDING_BLOWTORCHES'],
 ['ALARMS_AND_SENSORS', 'DOORBELLS'],
 ['ALARM_CLOCKS', 'TABLE_CLOCKS'],
 ['ALTERNATORS', 'ALTERNATOR_PULLEYS'],
 ['ANALOG_CAMERAS',
  'CAMERA_FLASHES',
  'ELECTRICAL_TIMERS',
  'ELECTRICITY_METERS',
  'MULTIMETERS',
  'TOILET_PAPER_HOLDERS',
  'TOWEL_HOLDERS',
  'VARIABLE_FREQUENCY_DRIVES',
  'WATER_FLOW_SENSORS'],
 ['ANTIQUE_CHAIRS',
  'DINING_CHAIRS',
  'DINING_SETS',
  'DINING_TABLES',
  'LIVING_ROOM_SETS',
  'OFFICE_CHAIRS',
  'SOFAS',
  'STYLING_CHAIRS'],
 ['ANTIQUE_TAPE_RECORDERS',
  'DIGITAL_VOICE_RECORDERS',
  'DVR_RECORDERS',
  'SURVEILLANCE_CAMERAS',
  'VIDEO_CAPTURE_DEVICES',
  

# Pequeño experimento  para ver como varía el score al mirar el 30% de val

In [13]:
for i in range(30):
    d_pub = X_val_reliable.sample(int(0.3*len(X_val_reliable)))
    print("score: ", balanced_accuracy_score(d_pub["category"],d_pub["predicted_category"]),flush=True)

score:  0.8847224735054979
score:  0.8860426511132183
score:  0.8881233314625434
score:  0.8818649611445452
score:  0.8805947485616139
score:  0.8827867207534649
score:  0.8924734404816727
score:  0.8916701070900045
score:  0.8850331775160967
score:  0.8889415814364929
score:  0.8885417195151634
score:  0.8918364979272584
score:  0.889058829565412
score:  0.8923461079292518
score:  0.8799306366016727
score:  0.8838946674831127
score:  0.8884432001445541
score:  0.8896127599430754
score:  0.8848145872023151
score:  0.8862081927126527
score:  0.8838939029278858
score:  0.8803165817619425
score:  0.8842597320236532
score:  0.8874650771399375
score:  0.8885771450789351
score:  0.8929524504936402
score:  0.8869331002936438
score:  0.8879312015877099
score:  0.8882427739324336
score:  0.8882047061954639


# Predicciones de test (lo ideal es entrenar de nuevo usando todo train)

In [14]:
preds_test = ensamble.predict(df_test)
df_test["category"] = preds_test
#df_test[["id","category"]].to_csv("solution.csv",index=None)
df_test.head(40)

HBox(children=(IntProgress(value=0, max=246955), HTML(value='')))




Unnamed: 0,id,title,language,category
0,0,"[kit, maternidade, bolsa, mala, baby, bebe, vi...",portuguese,DIAPER_BAGS
1,1,"[trocador, de, fraldas, fisher, price, feminin...",portuguese,BABY_CHANGING_PADS
2,2,"[motor, ventoinha, fiat, idea, palio, 1, 8, a,...",portuguese,ENGINE_COOLING_FAN_MOTORS
3,3,"[amortecedor, mola, batente, d, dir, new, civi...",portuguese,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS
4,4,"[cadeirinha, de, carro, bebe, princesa, prince...",portuguese,BABY_CAR_SEATS
5,5,"[cabo, freio, mao, tras, direito, vw, up, cod,...",portuguese,HAND_BRAKE_CABLES
6,6,"[mini, pc, dell, optiplex, fx160, atom, 2gb, r...",portuguese,MINI_PCS
7,7,"[kit, bi, xenon, lampada, h4, 8000k]",portuguese,XENON_KITS
8,8,"[protetor, pe, botinha, kickboxing, karate, ta...",portuguese,MARTIAL_ARTS_FOOT_GUARDS
9,9,"[disco, rigido, externo, western, digital, ele...",spanish,HARD_DRIVES_AND_SSDS


# Ideas para probar

1) jugar con los pesos de las filas utilizando el parámetro 'row_weights' (pesos distintos para reliable y unreliable)

2) hacer CV para quedarse con filas de unreliable que parezcan bien etiquetadas