# Carga de archivos

In [1]:
import pickle
import numpy as np
from tqdm import tqdm
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive

In [2]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

downloaded = drive.CreateFile({'id':"1-01jDt6eGboCV5ZidWMKiN71cKFUD6X3"})   
downloaded.GetContentFile('sparseImages') 

downloaded = drive.CreateFile({'id':"1-3pzoQCAspxGnOiFBGQTHg_wxx9Eu9VU"})   
downloaded.GetContentFile('labels')


In [3]:
with open('sparseImages', 'rb') as f:
    images = pickle.load(f)

In [4]:
with open('labels', 'rb') as f:
    labels = pickle.load(f)

In [5]:
len(labels)

100000

In [6]:
pd.DataFrame(labels).value_counts()

potato        17140
banana        16126
apple          8009
grapes         7653
asparagus      7612
broccoli       6862
onion          6692
pineapple      6626
watermelon     6368
blueberry      6045
blackberry     5580
strawberry     5287
dtype: int64

In [7]:
images = images.todense()

In [8]:
x_train = images[:int(0.80*len(images))]
y_train = labels[:int(0.80*len(images))]

x_valid = images[int(0.80*len(images)):int(0.90*len(images))]
y_valid = labels[int(0.80*len(images)):int(0.90*len(images))]

x_test = images[int(0.90*len(images)):]
y_test = labels[int(0.90*len(images)):]

In [9]:
len(x_train), len(x_valid), len(x_test)

(80000, 10000, 10000)

# Baseline

In [10]:
from sklearn.metrics import classification_report, top_k_accuracy_score
from random import choice
from sklearn.linear_model import LogisticRegression

### Matriz sin reduccion

In [11]:
model = LogisticRegression(random_state = 42)

In [12]:
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [13]:
preds = model.predict(x_valid)



In [14]:
proba_preds = model.predict_proba(x_valid)



In [15]:
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

       apple       0.64      0.69      0.67       845
   asparagus       0.76      0.79      0.77       820
      banana       0.69      0.71      0.70      1570
  blackberry       0.44      0.41      0.42       548
   blueberry       0.30      0.25      0.27       617
    broccoli       0.58      0.58      0.58       689
      grapes       0.40      0.38      0.39       713
       onion       0.45      0.38      0.41       686
   pineapple       0.57      0.61      0.59       652
      potato       0.73      0.81      0.77      1699
  strawberry       0.48      0.47      0.47       535
  watermelon       0.46      0.40      0.43       626

    accuracy                           0.59     10000
   macro avg       0.54      0.54      0.54     10000
weighted avg       0.58      0.59      0.59     10000



In [16]:
top_k_accuracy_score(y_true = y_valid, y_score=proba_preds, k=3)

0.8443

### PCA

In [17]:
from sklearn.decomposition import PCA

In [18]:
pca = PCA(random_state=42, n_components = 200)

In [19]:
model = LogisticRegression(random_state = 42)

In [20]:
x_train_pca = pca.fit_transform(x_train)
x_valid_pca = pca.transform(x_valid)



In [21]:
model.fit(x_train_pca, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [22]:
preds = model.predict(x_valid_pca)

In [23]:
proba_preds = model.predict_proba(x_valid_pca)

In [24]:
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

       apple       0.70      0.79      0.74       845
   asparagus       0.79      0.82      0.81       820
      banana       0.71      0.80      0.75      1570
  blackberry       0.64      0.47      0.55       548
   blueberry       0.48      0.29      0.36       617
    broccoli       0.63      0.68      0.66       689
      grapes       0.56      0.53      0.54       713
       onion       0.60      0.51      0.55       686
   pineapple       0.66      0.70      0.68       652
      potato       0.75      0.88      0.81      1699
  strawberry       0.60      0.53      0.57       535
  watermelon       0.58      0.45      0.50       626

    accuracy                           0.67     10000
   macro avg       0.64      0.62      0.63     10000
weighted avg       0.66      0.67      0.66     10000



In [25]:
top_k_accuracy_score(y_true = y_valid, y_score=proba_preds, k=3)

0.8865

### SVD

In [26]:
from sklearn.decomposition import TruncatedSVD

In [27]:
model = LogisticRegression(random_state = 42)

In [28]:
svd = TruncatedSVD(random_state = 42, n_components = 200)

In [29]:
x_train_svd = svd.fit_transform(x_train)
x_valid_svd = svd.transform(x_valid)



In [30]:
model.fit(x_train_svd, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [31]:
preds = model.predict(x_valid_svd)

In [32]:
proba_preds = model.predict_proba(x_valid_svd)

In [33]:
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

       apple       0.70      0.79      0.74       845
   asparagus       0.80      0.82      0.81       820
      banana       0.71      0.80      0.75      1570
  blackberry       0.65      0.48      0.55       548
   blueberry       0.47      0.28      0.35       617
    broccoli       0.63      0.67      0.65       689
      grapes       0.55      0.54      0.55       713
       onion       0.60      0.50      0.55       686
   pineapple       0.66      0.69      0.67       652
      potato       0.75      0.88      0.81      1699
  strawberry       0.59      0.53      0.56       535
  watermelon       0.59      0.45      0.51       626

    accuracy                           0.67     10000
   macro avg       0.64      0.62      0.63     10000
weighted avg       0.66      0.67      0.66     10000



In [34]:
top_k_accuracy_score(y_true = y_valid, y_score=proba_preds, k=3)

0.8866