In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB


In [4]:
# Feature Engineering: Encode categorical features and standardize numerical features
categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col','engine','accident','clean_title']
encoder = LabelEncoder()

for feature in categorical_features:
    train_df[feature] = encoder.fit_transform(train_df[feature])

In [6]:
# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['id', 'model_year', 'milage','price']
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])

In [8]:
# Splitting the Data
X = train_df.drop(['price', 'id'], axis=1)
y = train_df['price']
vehicle_ids = train_df['id']

In [9]:
# Split the data
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(X, y, vehicle_ids, test_size=0.2, random_state=42)

In [10]:
# Model Selection and Training: Using Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [11]:
# Model Evaluation
y_pred = rf_model.predict(X_test)
print('Random Forest:')
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

Random Forest:
MAE: 0.2549991968460909
MSE: 0.5704005333194409
R-squared: -0.029492191562107806


In [12]:
# Saving Predictions to CSV
# Create a DataFrame with IDs and predicted prices
predictions_df = pd.DataFrame({
    'ID': ids_test,
    'Actual Price': y_test,
    'Predicted Price': y_pred
})

# Save the DataFrame to a CSV file
predictions_df.to_csv('predicted_prices_with_id.csv', index=False)

print("Predictions have been saved to 'predicted_prices_with_id.csv'")

Predictions have been saved to 'predicted_prices_with_id.csv'
