In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Import necessary libraries


# Load the training dataset
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

# Explore and preprocess the data
# ... (handle missing values, encode categorical features, normalize/standardize numeric features)

numeric_cols = train_data.select_dtypes(include=['number']).columns
categorical_cols = train_data.select_dtypes(include=['object']).columns

# Split the data into features (X) and target variable (y)
X = train_data.drop(['id', 'pret'], axis=1)
y = train_data['pret']


In [None]:
from sklearn.feature_selection import RFE
train_data_numerical = train_data.select_dtypes(exclude = "O")
X_num = train_data_numerical.drop(['id', 'pret'], axis=1)
y_num = train_data_numerical['pret']
model = RandomForestRegressor()
rfe = RFE(model, n_features_to_select = 6)
fit = rfe.fit(X_num,y_num)

In [None]:
rfe = pd.DataFrame(list(zip(X.columns,rfe.support_,rfe.ranking_)), columns = ["feature","rfe_support","ranking"])

In [None]:
rfe

Unnamed: 0,feature,rfe_support,ranking
0,marca,True,1
1,model,True,1
2,an,True,1
3,km,True,1


In [None]:
print("Remaining Columns based on RFE:", rfe[rfe.rfe_support == True].feature.to_list())

Remaining Columns based on RFE: ['marca', 'model', 'an', 'km']


In [None]:
test_data_numerical = test_data.select_dtypes(exclude = "O")
X_test = test_data_numerical.drop(['id', 'pret'], axis=1)
y_pred = rfe.predict(X_test)

AttributeError: 'DataFrame' object has no attribute 'predict'

In [None]:
train_data.head()


Unnamed: 0,marca,model,an,km,putere,cutie_de_viteze,combustibil,capacitate_cilindrica,transmisie,caroserie,culoare,optiuni_culoare,addons,pret,id
0,Opel,Opel_Astra,2008,283100,115,Manuala,Benzina + CNG/GPL,1598,Fata,Masina de oras,Gri,Metalizata,"[Radio, Sistem audio, Control vocal, Climatron...",3650,0
1,Volkswagen,Volkswagen_Tiguan,2017,215994,190,Automata,Diesel,1968,4x4 (automat),SUV,Negru,Metalizata,"[Apple Carplay, Bluetooth, Radio, Sistem navig...",17950,1
2,Ford,Ford_Transit,2011,226000,116,Manuala,Diesel,2402,Fata,Monovolum,Alb,Fara,"[Radio, Aer conditionat, Cotiera (fata), Volan...",9500,2
3,Audi,Audi_A6,2013,290000,180,Automata,Benzina,1984,Fata,Sedan,Negru,Fara,"[Bluetooth, Radio, Sistem hands-free, Port USB...",11750,3
4,Land Rover,Land Rover_Discovery,2018,116628,258,Automata,Diesel,2993,4x4 (automat),SUV,Negru,Metalizata,"[Bluetooth, Radio, Sistem hands-free, Port USB...",38675,4


In [None]:
train_data.shape
train_data.info

<bound method DataFrame.info of                marca                    model    an      km  putere  \
0               Opel               Opel_Astra  2008  283100     115   
1         Volkswagen        Volkswagen_Tiguan  2017  215994     190   
2               Ford             Ford_Transit  2011  226000     116   
3               Audi                  Audi_A6  2013  290000     180   
4         Land Rover     Land Rover_Discovery  2018  116628     258   
...              ...                      ...   ...     ...     ...   
20240     Volkswagen          Volkswagen_Polo  2018   52000      95   
20241           Fiat               Fiat_Doblo  2017  124000      95   
20242          Skoda            Skoda_Octavia  2015  235170     150   
20243  Mercedes-Benz  Mercedes-Benz_GLE Coupe  2019   64390     390   
20244           Opel               Opel_Corsa  2002  157000      75   

      cutie_de_viteze        combustibil  capacitate_cilindrica  \
0             Manuala  Benzina + CNG/GPL        

In [None]:
train_data.describe(include='O') # for categorical variables

Unnamed: 0,marca,model,cutie_de_viteze,combustibil,transmisie,caroserie,culoare,optiuni_culoare,addons
count,20245,20245,20245,20245,20245,20245,20245,20245,20245
unique,52,576,2,4,4,9,11,4,16290
top,Volkswagen,Volkswagen_Golf,Automata,Diesel,Fata,SUV,Negru,Metalizata,[]
freq,2765,641,11852,13865,10531,7706,5141,12628,1406


In [None]:
train_data.describe(exclude='O') # for numerical variables

Unnamed: 0,an,km,putere,capacitate_cilindrica,pret,id
count,20245.0,20245.0,20245.0,20245.0,20245.0,20245.0
mean,2015.582267,139900.639269,168.854779,1956.352531,22342.395752,10122.0
std,4.511354,86728.856122,75.138026,599.236987,19466.371337,5844.372436
min,2000.0,1.0,5.0,400.0,250.0,0.0
25%,2012.0,68450.0,115.0,1560.0,8500.0,5061.0
50%,2017.0,143000.0,150.0,1968.0,15690.0,10122.0
75%,2019.0,203594.0,191.0,1998.0,29500.0,15183.0
max,2023.0,450000.0,600.0,7273.0,99980.0,20244.0


In [None]:
train_data["an"].mean()

2015.5822672264758

In [None]:

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'X_test' is not defined

In [None]:
print(numeric_cols)
print(categorical_cols)

Index(['an', 'km', 'putere', 'capacitate_cilindrica', 'pret', 'id'], dtype='object')
Index(['marca', 'model', 'cutie_de_viteze', 'combustibil', 'transmisie',
       'caroserie', 'culoare', 'optiuni_culoare', 'addons'],
      dtype='object')


In [None]:
# Preprocessing using ColumnTransformer and Pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'  # Include the remaining columns as is
)

In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
print(train_data.columns)
print(y)
print(y_train)

Index(['marca', 'model', 'an', 'km', 'putere', 'cutie_de_viteze',
       'combustibil', 'capacitate_cilindrica', 'transmisie', 'caroserie',
       'culoare', 'optiuni_culoare', 'addons', 'pret', 'id'],
      dtype='object')
0         3650
1        17950
2         9500
3        11750
4        38675
         ...  
20240    14756
20241    10750
20242     9485
20243    62951
20244     1990
Name: pret, Length: 20245, dtype: int64
18877    19350
1097     62000
17401      995
20130    94512
14313    33000
         ...  
11284     9250
11964    22000
5390      6490
860      16190
15795     3200
Name: pret, Length: 16196, dtype: int64


In [None]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', model)])
print(X_train)
print(y_train)


               marca                    model    an      km  putere  \
18877        Renault           Renault_Megane  2021    8900     140   
1097   Mercedes-Benz        Mercedes-Benz_CLS  2018   90000     435   
17401           Ford               Ford_Focus  2002  400000      90   
20130  Mercedes-Benz  Mercedes-Benz_GLE Coupe  2022   17000     330   
14313           Ford                Ford_F150  2016  119800     324   
...              ...                      ...   ...     ...     ...   
11284     Volkswagen         Volkswagen_Jetta  2013  193900     122   
11964         Jaguar            Jaguar_F-Pace  2016  125000     180   
5390      Volkswagen           Volkswagen_up!  2013  122000      75   
860       Volkswagen          Volkswagen_Golf  2020  120000     150   
15795          Skoda            Skoda_Octavia  2005  306716     140   

      cutie_de_viteze combustibil  capacitate_cilindrica     transmisie  \
18877        Automata     Benzina                   1332           Fata 

In [None]:
pipeline.fit(X_train, y_train)


ValueError: A given column is not a column of the dataframe

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Încarcă datele de antrenare
train_data = pd.read_json("train.json")

#elimin randurile cu NaN complet
train_data.fillna(0,inplace=True)

# Împarte setul de antrenare în set de antrenare și set de validare
train_set, val_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Definirea atributelor și a variabilei țintă
features = train_set.drop(['id', 'pret'], axis=1)
target = train_set['pret']

# Extrage toate opțiunile de addons din setul de antrenare
all_addons = train_data['addons'].explode().unique()

# Separă caracteristicile categorice și numerice
categorical_features = features.select_dtypes(include=['object']).columns.tolist()
numeric_features = features.select_dtypes(exclude=['object']).columns.tolist()



# Aplică one-hot encoding pe caracteristicile categorice, cu excepția câmpului "addons"
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_categorical_features = pd.DataFrame(encoder.fit_transform(features[categorical_features].drop('addons', axis=1)))
encoded_categorical_features.columns = encoder.get_feature_names_out(categorical_features[:-1])

# Procesează câmpul "addons" pentru a crea noi caracteristici binare
for option in all_addons:
    features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)


# Concatenează caracteristicile numerice, categorice și cele create din câmpul "addons"



  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = feature

  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = feature

  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = features['addons'].apply(lambda x: 1 if option in x else 0)
  features[option] = feature

In [None]:
features[numeric_features]

Unnamed: 0,an,km,putere,capacitate_cilindrica
18877,2021,8900,140,1332
1097,2018,90000,435,2999
17401,2002,400000,90,1753
20130,2022,17000,330,2925
14313,2016,119800,324,2694
...,...,...,...,...
11284,2013,193900,122,1390
11964,2016,125000,180,1999
5390,2013,122000,75,999
860,2020,120000,150,1968


In [None]:
encoded_categorical_features

Unnamed: 0,marca_Abarth,marca_Aixam,marca_Alfa Romeo,marca_Altul,marca_Aston Martin,marca_Audi,marca_BMW,marca_Bentley,marca_Cadillac,marca_Chevrolet,...,culoare_Galben/Auriu,culoare_Gri,culoare_Maro,culoare_Negru,culoare_Rosu,culoare_Verde,optiuni_culoare_Fara,optiuni_culoare_Mat,optiuni_culoare_Metalizata,optiuni_culoare_Perlat
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
16193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
features[all_addons]

Unnamed: 0,Radio,Sistem audio,Control vocal,Climatronic,Tapiterie stofa,Scaune sport fata,Cotiera (fata),Volan piele,Volan sport,Volan cu comenzi,...,Faruri laser,Suspensie hidropneumatica,Scaune spate cu masaj,Cablu incarcare masina electrica,Cauciucuri off road,Frane carbo-ceramice,Functie incarcare rapida,Autocolant,Trapa manuala,Jante aliaj 22
18877,1,1,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1097,1,1,0,0,0,1,1,1,1,1,...,0,1,0,0,0,0,0,0,0,0
17401,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20130,0,0,0,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
14313,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,1,1,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11964,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
860,1,1,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
features1 = pd.concat([features[numeric_features], features[all_addons]], axis=1)
features1 = pd.concat([features1, encoded_categorical_features], axis=1)


In [None]:
features = pd.concat([features[numeric_features], features[all_addons]], axis=1)

In [None]:
features


Unnamed: 0,an,km,putere,capacitate_cilindrica,Radio,Sistem audio,Control vocal,Climatronic,Tapiterie stofa,Scaune sport fata,...,Faruri laser,Suspensie hidropneumatica,Scaune spate cu masaj,Cablu incarcare masina electrica,Cauciucuri off road,Frane carbo-ceramice,Functie incarcare rapida,Autocolant,Trapa manuala,Jante aliaj 22
18877,2021,8900,140,1332,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1097,2018,90000,435,2999,1,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
17401,2002,400000,90,1753,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20130,2022,17000,330,2925,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
14313,2016,119800,324,2694,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,2013,193900,122,1390,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11964,2016,125000,180,1999,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,2013,122000,75,999,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,2020,120000,150,1968,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.impute import SimpleImputer

# Assume 'features' is your DataFrame containing numeric features with NaN values

# Identify numeric features with NaN values
numeric_features_with_nan = features[numeric_features].columns[features[numeric_features].isnull().any()].tolist()

# Create an instance of SimpleImputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on the numeric features with NaN values
features[numeric_features_with_nan] = imputer.fit_transform(features[numeric_features_with_nan])


ValueError: at least one array or dtype is required

In [None]:
# Elimină câmpul "addons" din caracteristicile finale
#features = features.drop('addons', axis=1)

# Inițializarea modelului Random Forest
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Antrenarea modelului pe setul de antrenare
random_forest_model.fit(features, target)

# Efectuarea prezicerilor pe setul de validare
val_features = val_set.drop(['id', 'pret'], axis=1)

# Aplică aceleași transformări pe setul de validare
encoded_categorical_val_features = pd.DataFrame(encoder.transform(val_features[categorical_features[:-1]]))
encoded_categorical_val_features.columns = encoder.get_feature_names_out(categorical_features[:-1])
val_features = pd.concat([val_features[numeric_features], encoded_categorical_val_features, val_features[all_addons]], axis=1)

# Elimină câmpul "addons" din caracteristicile finale de validare
val_features = val_features.drop('addons', axis=1)

val_predictions = random_forest_model.predict(val_features)

# Evaluarea performanței modelului pe setul de validare
mse = mean_squared_error(val_set['pret'], val_predictions)
print(f'Mean Squared Error pe setul de validare: {mse}')

# ... Restul codului rămâne neschimbat pentru predicții pe setul de test

KeyError: "None of [Index(['Radio', 'Sistem audio', 'Control vocal', 'Climatronic',\n       'Tapiterie stofa', 'Scaune sport fata', 'Cotiera (fata)', 'Volan piele',\n       'Volan sport', 'Volan cu comenzi',\n       ...\n       'Faruri laser', 'Suspensie hidropneumatica', 'Scaune spate cu masaj',\n       'Cablu incarcare masina electrica', 'Cauciucuri off road',\n       'Frane carbo-ceramice', 'Functie incarcare rapida', 'Autocolant',\n       'Trapa manuala', 'Jante aliaj 22'],\n      dtype='object', length=183)] are in the [columns]"

In [3]:
# Load the training dataset
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

# Explore and preprocess the data
# ... (handle missing values, encode categorical features, normalize/standardize numeric features)

numeric_cols = train_data.select_dtypes(include=['number']).columns
categorical_cols = train_data.select_dtypes(include=['object']).columns

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Load the training dataset
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

# Explore and preprocess the data
# ... (handle missing values, encode categorical features, normalize/standardize numeric features)

numeric_cols = train_data.select_dtypes(include=['number']).columns
categorical_cols = train_data.select_dtypes(include=['object']).columns

# Split the data into features (X) and target variable (y)
X = train_data.drop(['id', 'pret'], axis=1)
y = train_data['pret']

# Initialize the Random Forest model
model = RandomForestRegressor()

# # Fit the model on the training data
# model.fit(X, y)

# # Predict the price in the test data
# y_pred = model.predict(test_data.drop('id', axis=1))

# # Print the predicted prices
# print(y_pred)


ValueError: could not convert string to float: 'Opel'