# Challenge 1 -- Buscando el mejor Modelo

En esta notebook se busca el mejor modelo para lograr predecir si un artículo es nuevo o usado.


In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from Classifiers import (
    Clas_DecisionTree,
    Clas_GNB,
    Clas_KNN,
    Clas_LDA,
    Clas_LogisticRegression,
    Clas_QDA,
    Clas_RFOREST,
)


In [2]:
import warnings
warnings.simplefilter("ignore")

In [3]:
# Load the listings dataset and drop two columns before modelling:
# 'Unnamed: 0' (the unnamed first CSV column, presumably a saved row index) and 'seller_id'.
# drop(columns=...) replaces the original `drop('Unnamed: 0', 1)`: passing the axis
# positionally is rejected by pandas >= 2.0.
X = pd.read_csv('MLA_100k.csv').drop(columns=['Unnamed: 0', 'seller_id'])
X.head()

Unnamed: 0,condition,base_price,price,accepts_mercadopago,automatic_relist,initial_quantity,sold_quantity,available_quantity,shipping.local_pick_up,shipping.free_shipping
0,1,80.0,80.0,1,0,1,0,1,1,0
1,0,2650.0,2650.0,1,0,1,0,1,1,0
2,0,60.0,60.0,1,0,1,0,1,1,0
3,1,580.0,580.0,1,0,1,0,1,1,0
4,0,30.0,30.0,1,0,1,0,1,1,0


## Testing algorithms <a class="anchor" id="second-bullet"></a>

In [5]:
# Separate the target column from the feature matrix.
# NOTE: this cell rebinds X, so running it a second time raises KeyError
# because 'condition' has already been removed.
y = X.loc[:, 'condition']
X = X.drop(columns=['condition'])

### Without reduction <a class="anchor" id="without_red"></a>

In [6]:
# Hold out 34% of the rows as a test set.
# The split is seeded so that Restart & Run All reproduces the same partition;
# without random_state every run shuffles the rows differently.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.34, random_state=RANDOM_STATE
)

In [7]:
# Sanity check: row counts of the four pieces produced by the split.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(66000, 34000, 66000, 34000)

#### CLASSIFIERS

In [9]:
# Logistic regression, evaluated from the training split only.
# Clas_LogisticRegression is a project-local helper imported in the top import cell.
# NOTE(review): X_test / y_test are not passed, so the helper presumably
# cross-validates on the training data internally — confirm in Classifiers.py.
# NOTE(review): the recorded run reports AUC_ROC of about 0.43 for this model (below 0.5);
# worth checking convergence / feature scaling, since warnings are suppressed globally.
(
    LR_fit_time,
    LR_score_time,
    LR_accuracy,
    LR_precision,
    LR_recall,
    LR_f1,
    LR_roc,
) = Clas_LogisticRegression(X_train, y_train)

In [10]:
# Decision tree, evaluated from the training split only.
# Clas_DecisionTree is a project-local helper imported in the top import cell.
# NOTE(review): X_test / y_test are not passed, so the helper presumably
# cross-validates on the training data internally — confirm in Classifiers.py.
(
    dtree_fit_time,
    dtree_score_time,
    dtree_accuracy,
    dtree_precision,
    dtree_recall,
    dtree_f1,
    dtree_roc,
) = Clas_DecisionTree(X_train, y_train)

In [11]:
# Linear discriminant analysis, evaluated from the training split only.
# Clas_LDA is a project-local helper imported in the top import cell.
# NOTE(review): X_test / y_test are not passed, so the helper presumably
# cross-validates on the training data internally — confirm in Classifiers.py.
(
    LDA_fit_time,
    LDA_score_time,
    LDA_accuracy,
    LDA_precision,
    LDA_recall,
    LDA_f1,
    LDA_roc,
) = Clas_LDA(X_train, y_train)

In [12]:
# Quadratic discriminant analysis, evaluated from the training split only.
# Clas_QDA is a project-local helper imported in the top import cell.
# NOTE(review): X_test / y_test are not passed, so the helper presumably
# cross-validates on the training data internally — confirm in Classifiers.py.
(
    QDA_fit_time,
    QDA_score_time,
    QDA_accuracy,
    QDA_precision,
    QDA_recall,
    QDA_f1,
    QDA_roc,
) = Clas_QDA(X_train, y_train)

In [13]:
# Random forest, evaluated from the training split only.
# Clas_RFOREST is a project-local helper imported in the top import cell.
# NOTE(review): X_test / y_test are not passed, so the helper presumably
# cross-validates on the training data internally — confirm in Classifiers.py.
(
    forest_fit_time,
    forest_score_time,
    forest_accuracy,
    forest_precision,
    forest_recall,
    forest_f1,
    forest_roc,
) = Clas_RFOREST(X_train, y_train)

In [14]:
# K-nearest neighbours, evaluated from the training split only.
# Clas_KNN is a project-local helper imported in the top import cell.
# NOTE(review): X_test / y_test are not passed, so the helper presumably
# cross-validates on the training data internally — confirm in Classifiers.py.
(
    KNN_fit_time,
    KNN_score_time,
    KNN_accuracy,
    KNN_precision,
    KNN_recall,
    KNN_f1,
    KNN_roc,
) = Clas_KNN(X_train, y_train)

In [15]:
# Gaussian naive Bayes, evaluated from the training split only.
# Clas_GNB is a project-local helper imported in the top import cell.
# NOTE(review): X_test / y_test are not passed, so the helper presumably
# cross-validates on the training data internally — confirm in Classifiers.py.
(
    bayes_fit_time,
    bayes_score_time,
    bayes_accuracy,
    bayes_precision,
    bayes_recall,
    bayes_f1,
    bayes_roc,
) = Clas_GNB(X_train, y_train)

### Comparación

In [16]:
# Gather the per-model results into a single comparison table, one row per classifier.
# Each (column, values) pair lists its values in the same model order as the 'Model' column.
_summary = [
    ('Model', [
        'Logistic Regression',
        'Decision Tree',
        'Linear Discriminant Analysis',
        'Quadratic Discriminant Analysis',
        'Random Forest',
        'K-Nearest Neighbors',
        'Bayes',
    ]),
    ('Fitting time', [LR_fit_time, dtree_fit_time, LDA_fit_time, QDA_fit_time, forest_fit_time, KNN_fit_time, bayes_fit_time]),
    ('Scoring time', [LR_score_time, dtree_score_time, LDA_score_time, QDA_score_time, forest_score_time, KNN_score_time, bayes_score_time]),
    ('Accuracy', [LR_accuracy, dtree_accuracy, LDA_accuracy, QDA_accuracy, forest_accuracy, KNN_accuracy, bayes_accuracy]),
    ('Precision', [LR_precision, dtree_precision, LDA_precision, QDA_precision, forest_precision, KNN_precision, bayes_precision]),
    ('Recall', [LR_recall, dtree_recall, LDA_recall, QDA_recall, forest_recall, KNN_recall, bayes_recall]),
    ('F1_score', [LR_f1, dtree_f1, LDA_f1, QDA_f1, forest_f1, KNN_f1, bayes_f1]),
    ('AUC_ROC', [LR_roc, dtree_roc, LDA_roc, QDA_roc, forest_roc, KNN_roc, bayes_roc]),
]

models_initial = pd.DataFrame(
    dict(_summary),
    columns=[label for label, _ in _summary],
)

# Display the table ranked from most to least accurate (models_initial itself stays unsorted).
models_initial.sort_values('Accuracy', ascending=False)

Unnamed: 0,Model,Fitting time,Scoring time,Accuracy,Precision,Recall,F1_score,AUC_ROC
4,Random Forest,5.640021,0.181594,0.794758,0.802615,0.80045,0.79439,0.861538
1,Decision Tree,0.179996,0.010244,0.787152,0.799225,0.794171,0.78625,0.835743
5,K-Nearest Neighbors,0.992482,0.221711,0.777545,0.780049,0.780724,0.777686,0.83612
3,Quadratic Discriminant Analysis,0.041093,0.022256,0.569318,0.736801,0.597949,0.497789,0.793211
2,Linear Discriminant Analysis,0.210975,0.014342,0.559152,0.560054,0.536107,0.5091,0.7674
0,Logistic Regression,0.103412,0.011643,0.484864,0.544718,0.512995,0.324541,0.426267
6,Bayes,0.027484,0.009143,0.478258,0.719163,0.510454,0.322234,0.747665


## CONCLUSIÓN

Analizando las variables y los distintos modelos, la conclusión obtenida es:

Random Forest sería el mejor modelo, pero en cuanto al tiempo es el que más se demoró de todos.