In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up before each cell execution.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from expvocabulario import *
from knnpca import PCAKneighboursClasifier

In [18]:
# Load the data:

df = pd.read_csv("../data/imdb_small.csv", index_col=0)

print("Cantidad de documentos: {}".format(len(df)))

# The `type` column tells the training rows apart from the test rows; for each
# split keep the review text and its label.
text_train = df.loc[df["type"] == 'train', "review"]
label_train = df.loc[df["type"] == 'train', "label"]

text_test = df.loc[df["type"] == 'test', "review"]
label_test = df.loc[df["type"] == 'test', "label"]

print("Cantidad de instancias de entrenamiento = {}".format(text_train.shape[0]))
print("Cantidad de instancias de test = {}".format(text_test.shape[0]))

# Fraction of training labels equal to 'pos' and to 'neg', in that order.
print("Class balance : {} pos {} neg".format(
    *((label_train == cls).sum() / len(label_train) for cls in ('pos', 'neg'))
))

Cantidad de documentos: 12500
Cantidad de instancias de entrenamiento = 6225
Cantidad de instancias de test = 6275
Class balance : 0.49493975903614457 pos 0.5050602409638554 neg


In [None]:
# Fit the bag-of-words vocabulary on the training reviews only.
vectorizer = CountVectorizer(max_df=0.90, min_df=0.01, max_features=5000)
vectorizer.fit(text_train)

# Term-count matrices for both splits, built with the vocabulary fitted above.
X_train = vectorizer.transform(text_train)
X_test = vectorizer.transform(text_test)

# Boolean targets: True where the label is 'pos'.
y_train = (label_train == 'pos').values
y_test = (label_test == 'pos').values

---

## Variación del accuracy para distintos tamaños de set de entrenamiento

Del enunciado:

> Analizar la calidad de los resultados obtenidos al combinar kNN con PCA, para un rango
> amplio de instancias de entrenamiento. Utilizar desde muy pocas hasta todas las
> disponibles para identificar en qué situación se comporta mejor cada uno de los
> métodos.

Se diseñó el experimento de modo que, para cada par $(k, \alpha)$ fijo, se varía el tamaño muestral del set de entrenamiento.

In [49]:
# Experiment grid: every combination of subsampling ratio, k and alpha is
# handed to exp_grid_train_subsample below.
# NOTE(review): the last ratio is 0.999 rather than 1.0 -- presumably the
# subsampling helper does not accept 1.0; confirm in exp_grid_train_subsample.
# NOTE(review): the saved results report n_train = 5602 and 4980 (out of 6225
# training rows) for the first grid points, so a ratio looks like the fraction
# held out rather than the fraction kept -- confirm against the helper.
SUBSAMPLING_RATIOS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.999]
KS = [5, 100, 300]
ALPHAS = [10, 50, 100]

# ==============================================================================

# Vectorize only once. This experiment was run for a single
# (`max_df`, `min_df`) pair.
vectorizer = CountVectorizer(max_df=0.90, min_df=0.01, max_features=5000)
vectorizer.fit(text_train)

# Densify the count matrices right after transforming each split.
X_train = vectorizer.transform(text_train).todense()
X_test = vectorizer.transform(text_test).todense()

# Boolean targets: True where the label is 'pos'.
y_train = (label_train == 'pos').values
y_test = (label_test == 'pos').values

# Materialise whatever the helper yields so the results can be reused by the
# following cells.
results_train_subsampling = list(
    exp_grid_train_subsample(X_train, y_train, X_test, y_test,
                             SUBSAMPLING_RATIOS, KS, ALPHAS)
)

ratio: 0.1, k: 5, alpha: 10
ratio: 0.1, k: 5, alpha: 50
ratio: 0.1, k: 5, alpha: 100
ratio: 0.1, k: 100, alpha: 10
ratio: 0.1, k: 100, alpha: 50
ratio: 0.1, k: 100, alpha: 100
ratio: 0.1, k: 300, alpha: 10
ratio: 0.1, k: 300, alpha: 50
ratio: 0.1, k: 300, alpha: 100
ratio: 0.2, k: 5, alpha: 10
ratio: 0.2, k: 5, alpha: 50
ratio: 0.2, k: 5, alpha: 100
ratio: 0.2, k: 100, alpha: 10
ratio: 0.2, k: 100, alpha: 50
ratio: 0.2, k: 100, alpha: 100
ratio: 0.2, k: 300, alpha: 10
ratio: 0.2, k: 300, alpha: 50
ratio: 0.2, k: 300, alpha: 100
ratio: 0.3, k: 5, alpha: 10
ratio: 0.3, k: 5, alpha: 50
ratio: 0.3, k: 5, alpha: 100
ratio: 0.3, k: 100, alpha: 10
ratio: 0.3, k: 100, alpha: 50
ratio: 0.3, k: 100, alpha: 100
ratio: 0.3, k: 300, alpha: 10
ratio: 0.3, k: 300, alpha: 50
ratio: 0.3, k: 300, alpha: 100
ratio: 0.4, k: 5, alpha: 10
ratio: 0.4, k: 5, alpha: 50
ratio: 0.4, k: 5, alpha: 100
ratio: 0.4, k: 100, alpha: 10
ratio: 0.4, k: 100, alpha: 50
ratio: 0.4, k: 100, alpha: 100
ratio: 0.4, k: 300, alp

In [52]:
# Wrap the collected experiment results in a DataFrame. The name is rebound, so
# from this cell on it refers to the DataFrame instead of the original results.
results_train_subsampling = pd.DataFrame(results_train_subsampling)

In [53]:
# Display the results table (bare last expression, rendered by the notebook).
results_train_subsampling

Unnamed: 0,acc,alpha,k,n_train,time_fit,time_predict
0,0.551554,10,5,5602,0.000045,2.885083
1,0.611793,50,5,5602,0.000042,7.997496
2,0.609084,100,5,5602,0.000041,12.176606
3,0.590757,10,100,5602,0.000044,2.827453
4,0.653068,50,100,5602,0.000044,6.655037
5,0.658486,100,100,5602,0.000042,12.800264
6,0.587092,10,300,5602,0.000046,2.843198
7,0.654343,50,300,5602,0.000043,7.723997
8,0.655936,100,300,5602,0.000052,13.084599
9,0.554263,10,5,4980,0.000043,2.557228


In [54]:
# Persist the results table as CSV, relative to the current working directory.
# `index` is left at its default, so the row index is written as well.
results_train_subsampling.to_csv('subsampling_results.csv')