In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, count

In [14]:
spark = SparkSession.builder.master('local[*]').appName('used-car-price').config('spark.executor.memory', '6g').getOrCreate()
spark = SparkSession.builder.master('local[*]').appName('used-car-price')\
.config("spark.executor.instances", "1")\
.config("spark.executor.memory", "6g")\
.config("spark.driver.memory", "6g")\
.config("spark.executor.memoryOverhead", "8g")\
.getOrCreate()

In [15]:
csv_file_path = '../data/cleaned_test.csv'
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

In [16]:
df.show()

+------+-------------+--------------------+----------+------+---------+--------------------+--------------------+--------------------+-------+---------------+-----------+
|    id|        brand|               model|model_year|milage|fuel_type|              engine|        transmission|             ext_col|int_col|       accident|clean_title|
+------+-------------+--------------------+----------+------+---------+--------------------+--------------------+--------------------+-------+---------------+-----------+
|218052|Mercedes-Benz|       E-Class E 350|      2014| 82900| Gasoline|302.0HP 3.5L V6 C...|   7-speed automatic|               White|  Beige|  None reported|        Yes|
|218053|      Genesis|      G80 3.3T Sport|      2009|140401| Gasoline|365.0HP 3.3L V6 C...|   6-speed automatic|                Gray|  Black|  None reported|        Yes|
|218054|         Ford|        F-150 Lariat|      2021|  3055| Gasoline|375.0HP 3.5L V6 C...|  10-speed automatic|               Green|  Black|  N

In [17]:
data = df.drop('id')

In [18]:
from sklearn.impute import KNNImputer
import pandas as pd
def knn_impute(df, n_neighbors=5):   
    df_encoded = df.copy()
    for col in df_encoded.select_dtypes(include='object').columns:
        df_encoded[col] = df_encoded[col].astype('category').cat.codes
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed = pd.DataFrame(knn_imputer.fit_transform(df_encoded), columns=df_encoded.columns)
    for col in df.select_dtypes(include='object').columns:
        df_imputed[col] = df_imputed[col].round().astype(int).map(
            dict(enumerate(df[col].astype('category').cat.categories)))
    return df_imputed

In [19]:
df_train_imputed = knn_impute(data.toPandas(), n_neighbors=25)

In [20]:
from sklearn.preprocessing import OrdinalEncoder
cat_cols_train = df_train_imputed.select_dtypes(include=['object']).columns
cat_cols_train = cat_cols_train[cat_cols_train != 'class']
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

df_train_imputed[cat_cols_train] = ordinal_encoder.fit_transform(df_train_imputed[cat_cols_train].astype(str))


In [25]:
df_train_imputed.isnull().sum()

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
dtype: int64

In [21]:
Sp_data = spark.createDataFrame(df_train_imputed)

In [10]:
Sp_data.show()

+-----+------+----------+--------+---------+------+------------+-------+-------+--------+-----------+
|brand| model|model_year|  milage|fuel_type|engine|transmission|ext_col|int_col|accident|clean_title|
+-----+------+----------+--------+---------+------+------------+-------+-------+--------+-----------+
| 36.0| 549.0|    2014.0| 82900.0|      2.0| 583.0|        16.0|  302.0|   10.0|     0.0|        0.0|
| 16.0| 754.0|    2009.0|140401.0|      2.0| 738.0|        11.0|  127.0|   14.0|     0.0|        0.0|
| 14.0| 660.0|    2021.0|  3055.0|      2.0| 750.0|         1.0|  128.0|   14.0|     0.0|        0.0|
| 36.0| 206.0|    2022.0|  3921.0|      2.0| 769.0|        22.0|   29.0|   71.0|     0.0|        0.0|
|  9.0|1627.0|    2018.0|  1025.0|      2.0| 717.0|        11.0|  261.0|   14.0|     1.0|        0.0|
| 14.0| 650.0|    2017.0|161000.0|      2.0| 478.0|        11.0|  302.0|   10.0|     1.0|        0.0|
| 19.0|1743.0|    2012.0|183000.0|      2.0|  38.0|        30.0|   29.0|   71.0|  

                                                                                

In [24]:
from pyspark.ml.feature import VectorAssembler, StandardScalerModel
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# Load the saved scaler model
scaler_model = StandardScalerModel.load("scaler_model")

# Define feature columns for the test data
feature_columns = [column for column in Sp_data.columns]  # Adjust if needed

# Assemble features for test data
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
test_assembled_df = assembler.transform(Sp_data)

# Use the pre-fitted scaler model to transform the test data
scaled_test_df = scaler_model.transform(test_assembled_df)

# Convert the 'scaled_features' vector to an array for easier handling
scaled_test_df = scaled_test_df.withColumn("scaled_features_array", vector_to_array(scaled_test_df["scaled_features"]))

# Get the number of features (same as training data)
num_features = len(feature_columns)

# Create individual columns for each feature in the array
for i in range(num_features):
    scaled_test_df = scaled_test_df.withColumn(f"scaled_feature_{i+1}", col("scaled_features_array")[i])

# Select only the individual scaled feature columns for saving
scaled_test_to_save = scaled_test_df.select([f"scaled_feature_{i+1}" for i in range(num_features)])

# Save the scaled test data to CSV
scaled_test_to_save.write.csv("../data/scaled_test_data.csv", header=True)


                                                                                

In [12]:
output_path = "../data/preprocessed_test.csv"
scaled_df.write.csv(output_path, header=True, mode='overwrite')


AnalysisException: [UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE] The CSV datasource doesn't support the column `features` of the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>".