In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, count

In [2]:
spark = SparkSession.builder.master('local[*]').appName('used-car-price')\
.config("spark.executor.instances", "1")\
.config("spark.executor.memory", "6g")\
.config("spark.driver.memory", "6g")\
.config("spark.executor.memoryOverhead", "8g")\
.getOrCreate()

24/09/11 13:30:09 WARN Utils: Your hostname, langchain resolves to a loopback address: 127.0.1.1; using 192.168.0.103 instead (on interface wlp3s0)
24/09/11 13:30:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/11 13:30:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/11 13:30:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/11 13:30:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
csv_file_path = '../data/cleaned_test'
test_data = spark.read.csv(csv_file_path, header=True, inferSchema=True)

In [4]:
test_data.show()

+------+-------------+--------------------+----------+------+-------------+--------------------+--------------------+------------+-----------+---------------+-----------+------+
|    id|        brand|               model|model_year|milage|    fuel_type|              engine|        transmission|     ext_col|    int_col|       accident|clean_title| price|
+------+-------------+--------------------+----------+------+-------------+--------------------+--------------------+------------+-----------+---------------+-----------+------+
|114046|Mercedes-Benz|     AMG GT AMG GT S|      2022|  4106|     Gasoline|40L V8 32V GDI DO...|  10-speed automatic| Ultra Black|      Black|  None reported|        Yes| 78350|
|114057|      Porsche|           911 Turbo|      2022| 13800|     Gasoline|5720HP 38L Flat 6...|   8-speed automatic|       Black|          –|  None reported|        Yes| 94995|
|114068|         Ford|     Transit-350 XLT|      2020| 85000|E85 Flex Fuel|3550HP 53L 8 Cyli...|           aut

In [5]:
data = test_data.drop('id')

In [6]:
from sklearn.impute import KNNImputer
import pandas as pd
def knn_impute(df, n_neighbors=5):   
    df_encoded = df.copy()
    for col in df_encoded.select_dtypes(include='object').columns:
        df_encoded[col] = df_encoded[col].astype('category').cat.codes
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed = pd.DataFrame(knn_imputer.fit_transform(df_encoded), columns=df_encoded.columns)
    for col in df.select_dtypes(include='object').columns:
        df_imputed[col] = df_imputed[col].round().astype(int).map(
            dict(enumerate(df[col].astype('category').cat.categories)))
    return df_imputed

In [7]:
test_data = knn_impute(data.toPandas(), n_neighbors=25)

In [8]:
test_data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Mercedes-Benz,AMG GT AMG GT S,2022.0,4106.0,Gasoline,40L V8 32V GDI DOHC Twin Turbo,10-speed automatic,Ultra Black,Black,None reported,Yes,78350.0
1,Porsche,911 Turbo,2022.0,13800.0,Gasoline,5720HP 38L Flat 6 Cylinder Engine Gasoline Fuel,8-speed automatic,Black,–,None reported,Yes,94995.0
2,Ford,Transit-350 XLT,2020.0,85000.0,E85 Flex Fuel,3550HP 53L 8 Cylinder Engine Flex Fuel Capability,automatic,Beige,Black,None reported,Yes,45000.0
3,Buick,LaCrosse Leather,2016.0,50000.0,Gasoline,3040HP 36L V6 Cylinder Engine Gasoline Fuel,automatic,White,Beige,damage reported,Yes,12000.0
4,MINI,Cooper S Base,2011.0,121250.0,Gasoline,1720HP 16L 4 Cylinder Engine Gasoline Fuel,automatic,Black,Black,damage reported,Yes,8900.0
...,...,...,...,...,...,...,...,...,...,...,...,...
37460,GMC,Yukon Denali,2019.0,34672.0,Gasoline,4200HP 62L 8 Cylinder Engine Gasoline Fuel,10-speed automatic,Black,Black,None reported,Yes,49900.0
37461,Mercedes-Benz,GLC 300 GLC 300,2022.0,18031.0,Gasoline,20 Liter Turbo,automatic,White,Black,None reported,,39998.0
37462,BMW,M240 i xDrive,2019.0,32000.0,Gasoline,3350HP 30L Straight 6 Cylinder Engine Gasoline...,6-speed manual,White,Black,None reported,Yes,36300.0
37463,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018.0,28600.0,Gasoline,3850HP 30L V6 Cylinder Engine Gasoline Fuel,8-speed automatic,White,Black,damage reported,Yes,30000.0


In [9]:
from sklearn.preprocessing import OrdinalEncoder
cat_cols_train = test_data.select_dtypes(include=['object']).columns
cat_cols_train = cat_cols_train[cat_cols_train != 'class']
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

test_data[cat_cols_train] = ordinal_encoder.fit_transform(test_data[cat_cols_train].astype(str))


In [10]:
test_data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,35.0,222.0,2022.0,4106.0,2.0,789.0,1.0,285.0,14.0,0.0,0.0,78350.0
1,42.0,151.0,2022.0,13800.0,2.0,974.0,18.0,29.0,151.0,0.0,0.0,94995.0
2,14.0,1666.0,2020.0,85000.0,1.0,652.0,23.0,22.0,14.0,0.0,0.0,45000.0
3,7.0,906.0,2016.0,50000.0,2.0,513.0,23.0,293.0,10.0,1.0,0.0,12000.0
4,31.0,478.0,2011.0,121250.0,2.0,101.0,23.0,29.0,14.0,1.0,0.0,8900.0


In [12]:
test_data = spark.createDataFrame(test_data)

In [17]:
from pyspark.ml.feature import VectorAssembler, StandardScalerModel
from pyspark.sql.functions import col
from pyspark.ml.functions import vector_to_array

scaler_model = StandardScalerModel.load("scaler_model")
feature_columns = [column for column in test_data.columns if column != "price"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_test_df = assembler.transform(test_data)
scaled_test_df = scaler_model.transform(assembled_test_df)
scaled_test_df = scaled_test_df.withColumn("scaled_features_array", vector_to_array(scaled_test_df["scaled_features"]))
num_features = len(feature_columns)
for i in range(num_features):
    scaled_test_df = scaled_test_df.withColumn(f"scaled_feature_{i+1}", col("scaled_features_array")[i])
scaled_test_df_to_save = scaled_test_df.select([f"scaled_feature_{i+1}" for i in range(num_features)] + ["price"])

scaled_test_df_to_save.write.csv("../data/scaled_test_data_new", header=True, mode='overwrite')
scaled_test_df_to_save.show()


                                                                                

+-------------------+-------------------+------------------+-------------------+-------------------+------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------+
|   scaled_feature_1|   scaled_feature_2|  scaled_feature_3|   scaled_feature_4|   scaled_feature_5|  scaled_feature_6|    scaled_feature_7|    scaled_feature_8|    scaled_feature_9|  scaled_feature_10|  scaled_feature_11|   price|
+-------------------+-------------------+------------------+-------------------+-------------------+------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------+
|-1.6016252503957296| -1.321891010074043|126.18545808246164|  374.1786953036393|-1.4357612502500614|1113.7990365515723| -2.1512891093070188|  1.1191147331975215| -0.5225270695999898|-0.5485619277171502|-0.3472294237818045| 78350.0|
| -1.588015992561963|-1.5846022271426754|126.18545808246164| 2109.455313

In [None]:
# from pyspark.ml.feature import VectorAssembler, StandardScalerModel
# from pyspark.ml.functions import vector_to_array
# from pyspark.sql.functions import col

# # Load the saved scaler model
# scaler_model = StandardScalerModel.load("scaler_model")

# # Define feature columns for the test data
# feature_columns = [column for column in Sp_data.columns]  # Adjust if needed

# # Assemble features for test data
# assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# test_assembled_df = assembler.transform(Sp_data)

# # Use the pre-fitted scaler model to transform the test data
# scaled_test_df = scaler_model.transform(test_assembled_df)

# # Convert the 'scaled_features' vector to an array for easier handling
# scaled_test_df = scaled_test_df.withColumn("scaled_features_array", vector_to_array(scaled_test_df["scaled_features"]))

# # Get the number of features (same as training data)
# num_features = len(feature_columns)

# # Create individual columns for each feature in the array
# for i in range(num_features):
#     scaled_test_df = scaled_test_df.withColumn(f"scaled_feature_{i+1}", col("scaled_features_array")[i])

# # Select only the individual scaled feature columns for saving
# scaled_test_to_save = scaled_test_df.select([f"scaled_feature_{i+1}" for i in range(num_features)])

# # Save the scaled test data to CSV
# scaled_test_to_save.write.csv("../data/scaled_test_data.csv", header=True)


                                                                                