![image info](https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2023/main/images/banner_1.png)

# Proyecto 1 - Predicción de precios de vehículos usados

En este proyecto podrán poner en práctica sus conocimientos sobre modelos predictivos basados en árboles y ensambles, y sobre la disponibilización de modelos. Para su desarrollo tengan en cuenta las instrucciones dadas en la "Guía del proyecto 1: Predicción de precios de vehículos usados".

**Entrega**: La entrega del proyecto deberán realizarla durante la semana 4. Sin embargo, es importante que avancen en la semana 3 en el modelado del problema y en parte del informe, tal y como se les indicó en la guía.

Para hacer la entrega, deberán adjuntar el informe autocontenido en PDF a la actividad de entrega del proyecto que encontrarán en la semana 4, y subir el archivo de predicciones a la [competencia de Kaggle](https://www.kaggle.com/t/b8be43cf89c540bfaf3831f2c8506614).

## Datos para la predicción de precios de vehículos usados

En este proyecto se usará el conjunto de datos de Car Listings de Kaggle, donde cada observación representa el precio de un automóvil teniendo en cuenta distintas variables como: año, marca, modelo, entre otras. El objetivo es predecir el precio del automóvil. Para más detalles puede visitar el siguiente enlace: [datos](https://www.kaggle.com/jpayne/852k-used-car-listings).

## Ejemplo predicción conjunto de test para envío a Kaggle

En esta sección encontrarán el formato en el que deben guardar los resultados de la predicción para que puedan subirlos a la competencia en Kaggle.

In [None]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
warnings.filterwarnings('ignore')

In [None]:
# Importación librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

%matplotlib inline
# Global plotting look: seaborn's dark grid first, then matplotlib's ggplot
# style sheet applied on top of it.
sns.set_style('darkgrid')
plt.style.use('ggplot')

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

from flask import Flask
from flask_restx import Api, Resource, fields, reqparse

import joblib
import os
# Move the working directory one level up; the relative path used later when
# saving the model ('model_deployment/carpricing_reg.pkl') resolves from here.
# NOTE(review): re-running this cell moves up one more level each time —
# confirm the notebook is always executed once, top to bottom.
os.chdir('..')

In [None]:
# Load the data from the zipped .csv files hosted on GitHub.
dataTraining = pd.read_csv('https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2023/main/datasets/dataTrain_carListings.zip')
# The test file's first column is used as the DataFrame index.
dataTesting = pd.read_csv('https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2023/main/datasets/dataTest_carListings.zip', index_col=0)

In [None]:
# Overview of the training data.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
dataTraining.info()
dataTraining.head()

In [None]:
# Overview of the test data.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
dataTesting.info()
dataTesting.head()

In [None]:
def grafica_distribucion(df: pd.DataFrame):
    '''
    Plot the distribution of every column of ``df``.

    Three figures are produced, in this order:

    1. one row of histograms, one per non-object column;
    2. a seaborn pairplot of the whole frame;
    3. one column of count plots, one per object-dtype column.

    A histogram/count-plot figure is skipped when the frame has no column
    of the corresponding kind.

    Parameters
    ----------
    df : pd.DataFrame
        Data to visualize.

    Returns
    -------
    None
    '''
    df_num = df.select_dtypes(exclude=[object])
    df_cat = df.select_dtypes(include=[object])

    var_num = df_num.columns.to_list()
    if var_num:
        # squeeze=False keeps `axes` two-dimensional even for a single column,
        # so it can always be iterated.
        fig, axes = plt.subplots(1, len(var_num), figsize=(18, 4), squeeze=False)
        fig.suptitle('DISTRIBUCIÓN DE VARIABLES NUMERICAS')

        for ax, name in zip(axes[0], var_num):
            ax.hist(df_num[name], bins=37, color="green", alpha=0.65, rwidth=0.85)
            ax.set_xlabel(f'{name}')
            # Label the axis being drawn (previously always the second one).
            ax.set_ylabel('cantidad')

    sns.pairplot(df)

    var_cat = df_cat.columns.to_list()
    if var_cat:
        fig, axes = plt.subplots(len(var_cat), 1, figsize=(17, 7), squeeze=False)
        fig.suptitle('DISTRIBUCIÓN DE VARIABLES CATAGÓRICAS')

        for ax, name in zip(axes[:, 0], var_cat):
            sns.countplot(x=name, data=df_cat, ax=ax)
            ax.set_xlabel(f'{name}')
            ax.set_ylabel('cantidad')
            ax.tick_params(axis='x', rotation=45, labelsize=7)

    return

def stats(df:pd.DataFrame):
    '''
    Print a summary of the object-dtype columns and describe the others.

    For every object-dtype column the name, the number of distinct classes
    and the 10 most frequent classes (with their counts) are printed.

    Parameters
    ----------
    df : pd.DataFrame
        Data to summarize.

    Returns
    -------
    pd.DataFrame
        Transposed ``describe()`` of the non-object columns (one row per
        column).
    '''
    categorical = df.select_dtypes(include=[object])

    for name in categorical.columns:
        values = categorical[name]
        top_classes = Counter(values).most_common(10)
        print(f'Variable: {name}')
        print(f'Cantidad de clases: {values.nunique()}')
        print(f'Top 10 de las clases: {top_classes}')
        print('------------------------------------')

    numeric = df.select_dtypes(exclude=[object])
    summary = numeric.describe()
    return summary.T

def remover(df, columns, threshold=3):
    '''
    Drop the rows that are z-score outliers in any of the given columns.

    A row is kept only when, for every column in ``columns``, the absolute
    z-score of its value is at most ``threshold``. Rows whose z-score is NaN
    (missing value, or a column with undefined/zero standard deviation) fail
    that comparison and are dropped.

    The previous implementation concatenated one filtered copy of ``df`` per
    column, which duplicated rows when more than one column was given; the
    per-column conditions are now combined into a single mask.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Names of the numeric columns to check. With no columns, every row
        is kept.
    threshold : float, default 3
        Largest absolute z-score that is still kept.

    Returns
    -------
    pd.DataFrame
        The surviving rows, in their original order, with a fresh 0..n-1
        index and the original column dtypes.
    '''
    # Positional boolean mask, so a non-unique index on `df` cannot cause
    # alignment surprises.
    keep = np.ones(len(df), dtype=bool)

    for column in columns:
        z_scores = np.abs((df[column] - df[column].mean()) / df[column].std())
        keep &= (z_scores <= threshold).to_numpy()

    return df.loc[keep].reset_index(drop=True)

def metrics(test,predd):
    '''
    Print the root mean squared error between ``test`` and ``predd``.

    Parameters
    ----------
    test : array-like
        Observed target values.
    predd : array-like
        Predicted target values.

    Returns
    -------
    None
        The score is only printed.
    '''
    mse = mean_squared_error(test, predd)
    RMSE = np.sqrt(mse)
    message = f"La métrica de evaluación del modelo regresión es:\nRMSE : {RMSE:,.3f}"
    print(message)

def df_to_dict(df):
    '''
    Group the distinct ``Model`` values of ``df`` by their ``Make``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns ``Make`` and ``Model``.

    Returns
    -------
    dict
        Maps every make to a list of its distinct models. The order inside
        each list is not defined, because the models are collected in a set.
    '''
    models_by_make = {}
    for _, record in df.iterrows():
        models_by_make.setdefault(record['Make'], set()).add(record['Model'])

    grouped = {}
    for make, models in models_by_make.items():
        grouped[make] = list(models)
    return grouped

In [None]:
#stats(dataTraining)

In [None]:
#grafica_distribucion(dataTraining)

In [None]:
# Drop the rows whose Mileage z-score is above 30, then plot the pairwise
# relationships of what is left.
# NOTE(review): 30 is far looser than remover's default threshold of 3 —
# confirm it is intentional.
dataTraining_noo = remover(dataTraining, ['Mileage'], threshold=30)
sns.pairplot(dataTraining_noo);

In [None]:
# Separate the predictors from the target (Price).
X = dataTraining_noo.drop(['Price'], axis=1)
y = dataTraining_noo['Price']

# Split the predictors by dtype: non-object columns vs. object columns.
data_num = X.select_dtypes(exclude=[object]).copy()
data_cat = X.select_dtypes(include=[object]).copy()

# Standardize the numeric columns.
# NOTE(review): the scaler, PCA and encoder in this cell are fitted on all
# rows, before the train/test split done in the next cell, so the hold-out
# RMSE may be optimistic — confirm this is acceptable.
scaler = StandardScaler()
data_num_scaled = scaler.fit_transform(data_num)

#decomposer = PLSRegression(n_components=2).fit(data_num_scaled,y)
#data_num_pls = decomposer.transform(data_num_scaled)

# Project the scaled numeric columns onto 2 principal components.
decomposer = PCA(n_components=2).fit(data_num_scaled)
data_num_pca = decomposer.transform(data_num_scaled)

# Cast every object column to pandas' category dtype.
for i in data_cat.columns:
    data_cat[i] = data_cat[i].astype('category')

#data_cat_encoded = pd.get_dummies(data_cat,columns=data_cat.columns, drop_first=True)
# One-hot encode the categorical columns (first level of each dropped,
# categories unseen during fit ignored) and densify the result.
encoder = OneHotEncoder(handle_unknown='ignore', drop='first').fit(data_cat)
data_cat_encoded = encoder.transform(data_cat).toarray()


In [None]:
# Final feature matrix: the 2 PCA components followed by the one-hot columns.
X_Training = np.concatenate((data_num_pca, data_cat_encoded), axis=1)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X_Training, y, test_size=0.20, random_state=42)

#X_train = pd.DataFrame(X_train, columns=['PLS1', 'PLS2'] + data_cat_encoded.columns.tolist())
#X_test = pd.DataFrame(X_test, columns=['PLS1', 'PLS2'] + data_cat_encoded.columns.tolist())

# Display the shapes of the four splits as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Hyper-parameters passed to the XGBoost regressor.
# NOTE(review): presumably the result of an earlier tuning run that is not
# part of this notebook — confirm their origin.
params = {'learning_rate': 0.8054,
           'n_estimators': 244,
           'max_depth': 6,
           'max_leaves': 37, 
           'subsample': 1.0, 
           'colsample_bytree': 1.0,
           'gamma': 0.0, 
           'reg_alpha': 0.1020408}

# Squared-error regression, RMSE as evaluation metric, all cores, fixed seed.
model_xgb = XGBRegressor(objective ='reg:squarederror', eval_metric='rmse',random_state=42, n_jobs=-1, ** params)

# Fit on the training split and predict the hold-out split.
model_xgb.fit(X_train, y_train)
prediction = model_xgb.predict(X_test)

# Print the hold-out RMSE.
metrics(y_test, prediction)

In [None]:
# Rebuild the cleaned training set and fit, on all of its rows, the
# end-to-end pipeline (preprocessing + XGBoost) used for serving predictions.
dataTraining_ = remover(dataTraining, ['Mileage'], threshold=30)

X_ = dataTraining_.drop(['Price'], axis=1)
y_ = dataTraining_['Price']

#X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, test_size=0.20, random_state=42)

# Column groups by dtype; their names tell the ColumnTransformer what to route where.
data_num = X_.select_dtypes(exclude=[object]).copy()
data_cat = X_.select_dtypes(include=[object]).copy()

# Numeric columns: standardize, then project onto 2 principal components.
num_pipeline = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=2))])
# Categorical columns: one-hot encode, dropping the first level of each.
# handle_unknown='ignore' mirrors the encoder of the exploratory cell: a
# category that was not seen during fit is encoded as all zeros instead of
# making predict() raise.
cat_pipeline = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))])

preprocessor = ColumnTransformer([('numerical', num_pipeline, data_num.columns), ('categorical', cat_pipeline, data_cat.columns)])

# Hyper-parameters passed to the XGBoost regressor.
params = {'learning_rate': 0.8054054054054054,
           'n_estimators': 244,
           'max_depth': 6,
           'max_leaves': 37,
           'subsample': 1.0,
           'colsample_bytree': 1.0,
           'gamma': 0.0,
           'reg_alpha': 0.10204081632653061}

cars_pipeline = Pipeline([('preprocessor', preprocessor), ('model', XGBRegressor(objective='reg:squarederror', eval_metric='rmse', random_state=42, n_jobs=-1, **params))])
cars_pipeline.fit(X_, y_)


In [None]:
# Smoke test: a single hand-written car, shaped as a one-row DataFrame with
# the same column names the pipeline was fitted on.
dict_ = {
    'Year': [2017],
    'Mileage': [9913],
    'State': [' FL'],
    'Make': ['Jeep'],
    'Model': ['Wrangler']
}

cars_prediction = cars_pipeline.predict(pd.DataFrame(dict_))

# Display the predicted price as the cell output.
cars_prediction 


In [None]:
# Values the API accepts for the `State` argument. Every code carries a
# leading space.
# NOTE(review): presumably copied from the unique values of the training
# data — verify it stays in sync with the fitted model.
states = [' FL', ' OH', ' TX', ' CO', ' ME', ' WA', ' CT', ' CA', ' LA',' NY', ' PA', ' SC', ' ND', ' NC', ' GA', ' AZ', ' TN', ' KY',
       ' NJ', ' UT', ' IA', ' AL', ' NE', ' IL', ' OK', ' MD', ' NV',' WV', ' MI', ' VA', ' WI', ' MA', ' OR', ' IN', ' NM', ' MO',
       ' HI', ' KS', ' AR', ' MN', ' MS', ' MT', ' AK', ' VT', ' SD',
       ' NH', ' DE', ' ID', ' RI', ' WY', ' DC']
# Values the API accepts for the `Make` argument.
# NOTE(review): presumably copied from the unique values of the training
# data — verify it stays in sync with the fitted model.
makes = ['Jeep', 'Chevrolet', 'BMW', 'Cadillac', 'Mercedes-Benz', 'Toyota','Buick', 'Dodge', 'Volkswagen', 'GMC', 'Ford', 'Hyundai',
       'Mitsubishi', 'Honda', 'Nissan', 'Mazda', 'Volvo', 'Kia', 'Subaru','Chrysler', 'INFINITI', 'Land', 'Porsche', 'Lexus', 'MINI',
       'Lincoln', 'Audi', 'Ram', 'Mercury', 'Tesla', 'FIAT', 'Acura','Scion', 'Pontiac', 'Jaguar', 'Bentley', 'Suzuki', 'Freightliner']   
# Values the API accepts for the `Model` argument.
# NOTE(review): presumably copied from the unique values of the training
# data — verify it stays in sync with the fitted model.
models = ['Wrangler', 'Tahoe4WD', 'X5AWD', 'SRXLuxury', '3', 'C-ClassC300','CamryL', 'TacomaPreRunner', 'LaCrosse4dr', 'ChargerSXT',
       'CamryLE', 'Jetta', 'AcadiaFWD', 'EscapeSE', 'SonataLimited','Santa', 'Outlander', 'CruzeSedan', 'Civic', 'CorollaL', '350Z2dr',
       'EdgeSEL', 'F-1502WD', 'FocusSE', 'PatriotSport', 'Accord','MustangGT', 'FusionHybrid', 'ColoradoCrew', 'Wrangler4WD',
       'CR-VEX-L', 'CTS', 'CherokeeLimited', 'Yukon', 'Elantra', 'New','CorollaLE', 'Canyon4WD', 'Golf', 'Sonata4dr', 'Elantra4dr',
       'PatriotLatitude', 'Mazda35dr', 'Tacoma2WD', 'Corolla4dr','Silverado', 'TerrainFWD', 'EscapeFWD', 'Grand', 'RAV4FWD',
       'Liberty4WD', 'FocusTitanium', 'DurangoAWD', 'S60T5', 'CivicLX','MuranoAWD', 'ForteEX', 'TraverseAWD', 'CamaroConvertible',
       'Sportage2WD', 'Pathfinder4WD', 'Highlander4dr', 'WRXSTI', 'Ram','F-150XLT', 'SiennaXLE', 'LaCrosseFWD', 'RogueFWD', 'CamaroCoupe',
       'JourneySXT', 'AccordEX-L', 'Escape4WD', 'OptimaEX', 'FusionSE','5', 'F-150SuperCrew', '200Limited', 'Malibu', 'CompassSport',
       'G37', 'CanyonCrew', 'Malibu1LT', 'MustangPremium', 'MustangBase','Sierra', 'FlexLimited', 'Tahoe2WD', 'Transit', 'Outback2.5i',
       'TucsonLimited', 'Rover', 'CayenneAWD', 'MalibuLT', 'TucsonFWD','F-150FX2', 'Camaro2dr', 'Colorado4WD', 'SonataSE', 'ESES',
       'EnclavePremium', 'CR-VEX', 'F-150STX', 'Impreza', 'EquinoxFWD','Cooper', 'Super', 'Passat4dr', '911', 'CivicEX', 'CamrySE',
       'Highlander4WD', 'Corvette2dr', '200S', 'PilotLX', 'SorentoEX','RioLX', 'ExplorerXLT', 'CorvetteCoupe', 'EnclaveLeather',
       'Avalanche4WD', 'TacomaBase', 'Versa5dr', 'MKXFWD','SL-ClassSL500', 'VeracruzFWD', 'CorollaS', 'PriusTwo', 'CR-V2WD',
       'Lucerne4dr', '4Runner4dr', 'PilotTouring', 'CR-VLX','CompassLatitude', 'Altima4dr', 'OptimaLX', 'Focus5dr',
       'Charger4dr', 'AcadiaAWD', 'JourneyFWD', '7', 'RX', 'MalibuLS','LSLS', 'SportageLX', 'Yukon4WD', 'SorentoLX', 'TiguanSEL',
       'Camry4dr', 'F-1504WD', 'PriusBase', 'AccordLX', 'Q7quattro','ExplorerLimited', '4RunnerSR5', 'OdysseyEX-L', 'C-ClassC',
       'CX-9FWD', 'JourneyAWD', 'Sorento2WD', 'F-250Lariat', 'Prius','TahoeLT', '25004WD', 'Escalade4dr', 'GTI4dr', '4RunnerRWD',
       'FX35AWD', 'XC90T6', 'Taurus4dr', 'AvalonXLE', '300300S', 'G35','F-150Platinum', 'TerrainAWD', 'GXGX', 'MKXAWD', 'Town',
       'CamryXLE', 'VeracruzAWD', 'FusionS', 'Challenger2dr', 'Tundra','Navigator4WD', 'Legacy3.6R', 'GS', 'E-ClassE350', 'Suburban2WD',
       'A44dr', 'RegalTurbo', 'Outback3.6R', '4Runner4WD', 'Legacy2.5i','1', 'Yukon2WD', 'Explorer', 'PilotEX-L', '200LX', 'M-ClassML350',
       'RAV4XLE', 'WranglerSport', 'Model', 'FJ', 'Titan', 'Titan4WD','FlexSEL', 'OdysseyTouring', 'SorentoSX', 'RAV4Base', 'OdysseyEX',
       'Explorer4WD', 'Mustang2dr', 'EdgeLimited', 'FusionSEL','Yukon4dr', 'Touareg4dr', 'Matrix5dr', 'CTCT', 'CherokeeSport',
       '6', 'Maxima4dr', 'Frontier4WD', 'PriusThree', 'F-350XL', '500Pop','RDXAWD', 'Tacoma4WD', 'Optima4dr', 'Q5quattro', 'X3xDrive28i',
       'RDXFWD', 'X5xDrive35i', 'Malibu4dr', 'ExpeditionXLT', 'Ranger2WD','Patriot4WD', 'Quest4dr', 'TaurusSE', 'PathfinderS', 'Murano2WD',
       'LS', 'SiennaLimited', 'ES', 'SiennaLE', 'F-150Lariat', 'Titan2WD','Durango2WD', 'Tahoe4dr', 'Focus4dr', 'YarisBase', 'TaurusLimited',
       'RAV44WD', 'C-Class4dr', 'Soul+', 'TundraBase', 'Expedition','ImpalaLT', 'SedonaLX', 'Sequoia4WD', 'ElantraLimited', '15002WD',
       'Suburban4WD', 'FiestaSE', '15004WD', 'TundraSR5', 'Camry','RAV4Limited', 'RangerSuperCab', 'MDXAWD', 'RAV4LE',
       'ChallengerR/T', 'FlexSE', 'ForteLX', 'TraverseFWD','LibertySport', 'ISIS', 'Impala4dr', 'Tundra4WD', 'F-250XLT',
       'RXRX', 'Armada2WD', 'Frontier', 'WranglerRubicon', 'EquinoxAWD','PilotEX', 'TiguanS', 'EscaladeAWD', 'DTS4dr', 'Pilot2WD',
       'Express', 'PacificaLimited', 'CanyonExtended', 'MX5', 'EscapeS','IS', 'C-ClassC350', 'Compass4WD', 'SportageEX', 'Legacy',
       'E-ClassE', 'Dakota4WD', '300300C', 'Forte', 'SportageAWD','TaurusSEL', 'Xterra4WD', 'GSGS', 'Explorer4dr', 'F-150XL',
       'SportageSX', 'xB5dr', 'TundraLimited', 'CruzeLT', 'Wrangler2dr','HighlanderFWD', 'Sprinter', 'Highlander', 'Prius5dr', 'CX-9Grand',
       'CTS4dr', 'Econoline', 'AccordEX', 'RAV4Sport', '35004WD','ChargerSE', 'OdysseyLX', 'TucsonAWD', 'CX-7FWD', 'AccordLX-S',
       'Navigator4dr', 'EscapeXLT', 'TiguanSE', 'Cayman2dr', 'TaurusSHO','F-150FX4', 'Ranger4WD', 'OptimaSX', 'SequoiaSR5', 'G64dr',
       'HighlanderLimited', 'ExplorerFWD', 'F-350King', 'PriusFive','Yaris4dr', 'PatriotLimited', 'Lancer4dr', 'HighlanderSE',
       'CompassLimited', 'S2000Manual', 'F-250King', 'Forester2.5X','Fusion4dr', 'Frontier2WD', 'FocusST', 'Pathfinder2WD',
       'Sentra4dr', 'XF4dr', 'F-250XL', 'PacificaTouring','MustangDeluxe', 'Caliber4dr', 'GTI2dr', 'Mazda34dr', 'FocusS',
       'Sienna5dr', 'CR-V4WD', 'CX-9Touring', 'Mazda64dr', 'Forester4dr','1500Tradesman', 'MDX4WD', 'Escalade', 'TL4dr', 'CX-9AWD',
       'Canyon2WD', 'A64dr', 'A8', 'Armada4WD', 'Impreza2.0i', 'GX','QX564WD', 'CC4dr', 'MKZ4dr', 'Yaris', 'FitSport', 'Regal4dr',
       'Tundra2WD', 'X3AWD', 'SonicSedan', 'Cobalt4dr', 'RidgelineRTL','CivicSi', 'AvalonLimited', 'XC90FWD', 'Outlander2WD', 'RAV44dr',
       'ColoradoExtended', 'ExpeditionLimited', '3004dr', '200Touring','SC', 'X1xDrive28i', 'SonicHatch', 'GLI4dr', 'PilotSE', 'Savana',
       'RegalPremium', 'CR-VSE', 'RegalGS', 'XC90AWD', 'EdgeSport','PriusFour', 'SiennaSE', '1500Laramie', '300Base', 'Pilot4WD',
       'A34dr', 'HighlanderBase', 'Expedition4WD', 'STS4dr', 'SoulBase','Xterra2WD', 'CT', 'tC2dr', 'Tiguan2WD', 'CR-ZEX', 'MustangShelby',
       'C702dr', 'WranglerX', 'WranglerSahara', 'DurangoSXT','Sequoia4dr', 'Outlander4WD', 'Expedition2WD', 'Navigator',
       '9112dr', 'Vibe4dr', 'F-150King', '300Limited', 'XC60T6','CivicEX-L', 'Avalanche2WD', 'F-350XLT', 'ExplorerBase', 'MuranoS',
       'LXLX', 'EdgeSE', 'ImpalaLS', 'Land', 'E-ClassE320', 'Milan4dr','Boxster2dr', 'RAV4', 'Eos2dr', 'SedonaEX', 'xD5dr', 'Colorado2WD',
       'Monte', 'Escape4dr', 'LX', 'FiestaS', 'F-350Lariat', 'Galant4dr','TT2dr', 'Xterra4dr', 'SequoiaLimited', '4RunnerLimited',
       'Genesis', 'Suburban4dr', 'EnclaveConvenience', 'LaCrosseAWD','Versa4dr', 'Cobalt2dr', 'XC60FWD', 'F-150Limited', 'Dakota2WD',
       'S44dr', '4Runner2WD', 'Sedona4dr', 'RidgelineSport','TSXAutomatic', 'ImprezaSport', 'SLK-ClassSLK350', 'Accent4dr',
       'CorvetteConvertible', 'Avalon4dr', 'Passat', '25002WD','ExplorerEddie', 'LibertyLimited', 'CTS-V', '4RunnerTrail',
       'Eclipse3dr', 'Azera4dr', 'TahoeLS', 'Continental', 'XJ4dr','ForteSX', 'SequoiaPlatinum', 'FocusSEL', 'Durango4dr',
       'CamryBase', 'XC704dr', 'S804dr', 'Element4WD', 'YarisLE','WRXBase', 'TLAutomatic', 'AvalonTouring', 'XK2dr', 'PT',
       'PathfinderSE', '300Touring', 'Navigator2WD', 'XC60AWD','EscapeLimited', 'WRXLimited', 'AccordSE', 'QX562WD',
       'Escalade2WD', 'EscapeLImited', 'PriusOne', 'Element2WD','Excursion137"', 'WRXPremium', 'RX-84dr']

In [None]:
# Persist the fitted pipeline (compression level 3). The path is relative to
# the current working directory, so the `model_deployment` folder must exist there.
joblib.dump(cars_pipeline, 'model_deployment/carpricing_reg.pkl', compress=3)

In [None]:
# Flask application wrapped by a Flask-RESTX Api; the version, title and
# description below are the API's metadata.
app = Flask(__name__)

api = Api(
    app, 
    version='0.0a', 
    title='Pre-own Car Price Prediction API',
    description='API for prediction of pre-own cars in USA. Developed by Team 8.')

# Namespace under which the prediction resource is registered.
ns = api.namespace('Prediction', description='Pre-own Car Price Predictor')

def prediction(year, mileage, state, make, model):
    '''
    Predict the price of one car with the fitted ``cars_pipeline``.

    The five values are packed into a one-row DataFrame whose columns are
    ``Year``, ``Mileage``, ``State``, ``Make`` and ``Model``, which is then
    passed to ``cars_pipeline.predict``.

    Parameters
    ----------
    year, mileage, state, make, model
        Attributes of the car, one value each.

    Returns
    -------
    The first (only) predicted price, cast to an integer type.
    '''
    features = pd.DataFrame({
        'Year': [year],
        'Mileage': [mileage],
        'State': [state],
        'Make': [make],
        'Model': [model],
    })
    estimate = cars_pipeline.predict(features)[0]
    return estimate.astype(int)

# Response schema: a single `result` field rendered as a string.
resource_fields = api.model('Resource', {'result': fields.String})

# Query-string parser: all five arguments are required. The categorical ones
# are restricted to the values listed in `states`, `makes` and `models`.
parser1 = reqparse.RequestParser()
parser1.add_argument(
    'Year', 
    type=int, 
    required=True, 
    help='Car Year', 
    location='args')
parser1.add_argument(
    'Mileage', 
    type=int, 
    required=True, 
    help='Car Mileage', 
    location='args')
parser1.add_argument(
    'State', 
    type=str, 
    required=True, 
    help='USA State', 
    location='args',
    choices=states)
parser1.add_argument(
    'Make', 
    type=str, 
    required=True, 
    help='Car Make', 
    location='args',
    choices=makes)
parser1.add_argument(
    'Model', 
    type=str, 
    required=True, 
    help='Car Model', 
    location='args', 
    choices=models)

# Resource served at the namespace root. A GET request parses the query
# string with `parser1` and answers with the predicted price under `result`.
@ns.route('/')
class CarPricingApi(Resource):

    @api.doc(parser=parser1)
    @api.marshal_with(resource_fields)
    def get(self):
        # Validate and extract the five query-string arguments.
        query = parser1.parse_args()
        feature_names = ('Year', 'Mileage', 'State', 'Make', 'Model')
        # Same positional order as prediction(year, mileage, state, make, model).
        price = prediction(*[query[name] for name in feature_names])
        return {'result': price}, 200
    
if __name__ == '__main__':
    # The server listens on every network interface (0.0.0.0), port 5050.
    # Debug mode is kept off: with debug=True Flask enables the interactive
    # Werkzeug debugger, which lets anyone who can reach the port run
    # arbitrary code on this machine.
    app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5050)