In [41]:
import pandas as pd

In [42]:
# Read the parquet file and replace each registration_date value with just its year component.
usedcars_df = pd.read_parquet(r'data/transformed_data.parquet').assign(
    registration_date=lambda frame: frame['registration_date'].dt.year
)

In [43]:
# Print the column names, dtypes and non-null counts of the loaded frame as a quick sanity check.
usedcars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 221199 entries, 0 to 251076
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   brand                     221199 non-null  object 
 1   model                     221199 non-null  object 
 2   color                     221199 non-null  object 
 3   registration_date         221199 non-null  int32  
 4   year                      221199 non-null  object 
 5   price_in_euro             221199 non-null  float64
 6   power_kw                  221199 non-null  float64
 7   power_ps                  221199 non-null  float64
 8   transmission_type         221199 non-null  object 
 9   fuel_type                 221199 non-null  object 
 10  fuel_consumption_l_100km  221199 non-null  float64
 11  fuel_consumption_g_km     221199 non-null  object 
 12  mileage_in_km             221199 non-null  float64
 13  offer_description         221199 non-null  object

In [44]:
from sklearn.model_selection import train_test_split

# 60 / 20 / 20 partition: hold out 40% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets.
train_df, temp_df = train_test_split(
    usedcars_df,
    test_size=0.4,
    random_state=1,
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=1,
)

In [45]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Snapshot each split with the target column still attached, before it is stripped below.
full_train_df = train_df.copy()
full_valid_df = valid_df.copy()
full_test_df = test_df.copy()

# pop() returns the price column and removes it from the feature frame in one step.
y_train = train_df.pop('price_in_euro').values
y_valid = valid_df.pop('price_in_euro').values
y_test = test_df.pop('price_in_euro').values

In [46]:
# Feature groups used to select model inputs.
# NOTE(review): registration_date shows as int32 in the info() listing, so
# DictVectorizer will presumably pass it through as a number rather than
# one-hot encode it — confirm that listing it as categorical is intended.
categorical_columns = [
    "brand",
    "model",
    "color",
    "registration_date",
    "year",
    "transmission_type",
    "fuel_type",
]
numerical_columns = [
    "power_kw",
    "power_ps",
    "fuel_consumption_l_100km",
    "mileage_in_km",
]

In [47]:
# Show the (rows, columns) shape of the selected features for each split, space-separated on one line.
print(*(
    split[categorical_columns + numerical_columns].shape
    for split in (train_df, valid_df, test_df)
))

(132719, 11) (44240, 11) (44240, 11)


In [48]:
# Turn every split into a list of per-row {column: value} records.
train_dict, valid_dict, test_dict = (
    split[categorical_columns + numerical_columns].to_dict(orient='records')
    for split in (train_df, valid_df, test_df)
)

# The vectorizer is fitted on the training records only; the validation and
# test records are transformed with that same fitted mapping.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_valid = dv.transform(valid_dict)
X_test = dv.transform(test_dict)

In [49]:
from sklearn.linear_model import LinearRegression

# y_train holds price_in_euro, a continuous float64 target, so this is a
# regression problem. LogisticRegression is a classifier: it either rejects a
# continuous target or treats every distinct price as its own class, neither
# of which yields a price prediction. Fit an ordinary least-squares model instead.
model = LinearRegression()

model.fit(X_train, y_train)