In [118]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import col, lit, concat, lower, upper, substring, min, max

In [68]:
# NOTE(review): findspark.init() is normally expected to run before the first
# pyspark import, but the import cell above already imports pyspark — confirm
# the intended cell order on a fresh kernel.
import findspark

findspark.init()

# Create (or reuse) a Spark session running with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# With eager evaluation on, a bare DataFrame expression at the end of a cell
# is rendered as a table without calling .show().
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [69]:
# Display the session object created in the previous cell.
spark

In [70]:
# Load the semicolon-separated cars file, treating the first line as the header
# and letting Spark infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .option("inferSchema", True)
    .csv("cars.csv")
)

In [71]:
# Print the first five rows as a text table.
df.show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|  3504|        12.0|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|  3693|        11.5|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|  3436|        11.0|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|  3433|        12.0|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0|  3449|        10.5|   70|    US|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows



df.take(5) 

will return a list of five row objects

df.collect() 

will get all of the data from the entire dataframe.
Be really careful when using it: with a large data set you can easily crash the driver node.

df.show() 

is the most commonly used method to view a dataframe. There are a few parameters we can pass to this method, such as the number of rows and truncation.
For example, df.show(5, False) or df.show(5, truncate=False) will show the first five rows with full column values, without any truncation.

df.limit(5) 

will return a new dataframe by taking the first n rows. Because Spark is distributed in nature, there is no guarantee that df.limit() will give you the same results each time.

truncate = True

long string values are truncated to 20 characters, ending in ...

truncate = False

string values are shown at their full length

In [72]:
# List the column names of the loaded dataframe.
df.columns

['Car',
 'MPG',
 'Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model',
 'Origin']

In [73]:
# List (column name, type string) pairs for the dataframe read with inferSchema=True.
df.dtypes

[('Car', 'string'),
 ('MPG', 'double'),
 ('Cylinders', 'int'),
 ('Displacement', 'double'),
 ('Horsepower', 'double'),
 ('Weight', 'decimal(4,0)'),
 ('Acceleration', 'double'),
 ('Model', 'int'),
 ('Origin', 'string')]

In [74]:
# Print the schema as a tree, including the nullable flag of each column.
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [75]:
# (column name, Spark data type) pairs used to build an explicit schema for
# cars.csv. Weight is declared as a double here.
labels = [
    ("Car", StringType()),
    ("MPG", DoubleType()),
    ("Cylinders", IntegerType()),
    ("Displacement", DoubleType()),
    ("Horsepower", DoubleType()),
    ("Weight", DoubleType()),
    ("Acceleration", DoubleType()),
    ("Model", IntegerType()),
    ("Origin", StringType()),
]

In [76]:
# Turn each (name, type) pair into a nullable field of the schema.
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in labels]
)

In [77]:
# Display the StructType built in the previous cell.
schema

StructType([StructField('Car', StringType(), True), StructField('MPG', DoubleType(), True), StructField('Cylinders', IntegerType(), True), StructField('Displacement', DoubleType(), True), StructField('Horsepower', DoubleType(), True), StructField('Weight', DoubleType(), True), StructField('Acceleration', DoubleType(), True), StructField('Model', IntegerType(), True), StructField('Origin', StringType(), True)])

In [78]:
# Re-read the file, this time applying the explicit schema instead of inferring types.
df = spark.read.csv(
    "cars.csv",
    schema=schema,
    sep=";",
    header=True,
)
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [79]:
# Column selection via attribute access; show() defaults spelled out.
df.select(df.Car).show(n=20, truncate=True)

+--------------------+
|                 Car|
+--------------------+
|Chevrolet Chevell...|
|   Buick Skylark 320|
|  Plymouth Satellite|
|       AMC Rebel SST|
|         Ford Torino|
|    Ford Galaxie 500|
|    Chevrolet Impala|
|   Plymouth Fury iii|
|    Pontiac Catalina|
|  AMC Ambassador DPL|
|Citroen DS-21 Pallas|
|Chevrolet Chevell...|
|    Ford Torino (sw)|
|Plymouth Satellit...|
|  AMC Rebel SST (sw)|
| Dodge Challenger SE|
|  Plymouth 'Cuda 340|
|Ford Mustang Boss...|
|Chevrolet Monte C...|
|Buick Estate Wago...|
+--------------------+
only showing top 20 rows



In [80]:
# Column selection via bracket access; values are printed without truncation.
df.select(df["car"]).show(n=20, truncate=False)

+--------------------------------+
|car                             |
+--------------------------------+
|Chevrolet Chevelle Malibu       |
|Buick Skylark 320               |
|Plymouth Satellite              |
|AMC Rebel SST                   |
|Ford Torino                     |
|Ford Galaxie 500                |
|Chevrolet Impala                |
|Plymouth Fury iii               |
|Pontiac Catalina                |
|AMC Ambassador DPL              |
|Citroen DS-21 Pallas            |
|Chevrolet Chevelle Concours (sw)|
|Ford Torino (sw)                |
|Plymouth Satellite (sw)         |
|AMC Rebel SST (sw)              |
|Dodge Challenger SE             |
|Plymouth 'Cuda 340              |
|Ford Mustang Boss 302           |
|Chevrolet Monte Carlo           |
|Buick Estate Wagon (sw)         |
+--------------------------------+
only showing top 20 rows



In [81]:
# Column selection via the col() function; values are printed without truncation.
df.select(col("car")).show(n=20, truncate=False)

+--------------------------------+
|car                             |
+--------------------------------+
|Chevrolet Chevelle Malibu       |
|Buick Skylark 320               |
|Plymouth Satellite              |
|AMC Rebel SST                   |
|Ford Torino                     |
|Ford Galaxie 500                |
|Chevrolet Impala                |
|Plymouth Fury iii               |
|Pontiac Catalina                |
|AMC Ambassador DPL              |
|Citroen DS-21 Pallas            |
|Chevrolet Chevelle Concours (sw)|
|Ford Torino (sw)                |
|Plymouth Satellite (sw)         |
|AMC Rebel SST (sw)              |
|Dodge Challenger SE             |
|Plymouth 'Cuda 340              |
|Ford Mustang Boss 302           |
|Chevrolet Monte Carlo           |
|Buick Estate Wagon (sw)         |
+--------------------------------+
only showing top 20 rows



In [82]:
# Select two columns at once using attribute access.
df.select(
    df.Car,
    df.Cylinders,
).show(n=20, truncate=True)

+--------------------+---------+
|                 Car|Cylinders|
+--------------------+---------+
|Chevrolet Chevell...|        8|
|   Buick Skylark 320|        8|
|  Plymouth Satellite|        8|
|       AMC Rebel SST|        8|
|         Ford Torino|        8|
|    Ford Galaxie 500|        8|
|    Chevrolet Impala|        8|
|   Plymouth Fury iii|        8|
|    Pontiac Catalina|        8|
|  AMC Ambassador DPL|        8|
|Citroen DS-21 Pallas|        4|
|Chevrolet Chevell...|        8|
|    Ford Torino (sw)|        8|
|Plymouth Satellit...|        8|
|  AMC Rebel SST (sw)|        8|
| Dodge Challenger SE|        8|
|  Plymouth 'Cuda 340|        8|
|Ford Mustang Boss...|        8|
|Chevrolet Monte C...|        8|
|Buick Estate Wago...|        8|
+--------------------+---------+
only showing top 20 rows



In [83]:
# Add a column holding the constant 1 on every row.
df = df.withColumn("first_column", lit(1))
df.show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|first_column|
+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|           1|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|           1|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|           1|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|3433.0|        12.0|   70|    US|           1|
|         Ford Torino|17.0|        8|       302.0|     140.0|3449.0|        10.5|   70|    US|           1|
+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+
only showing top 5 rows



In [84]:
# Add two more constant columns: an integer one and a string one.
df = (
    df
    .withColumn("second_column", lit(2))
    .withColumn("third_column", lit("a"))
)
df.show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+-------------+------------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|first_column|second_column|third_column|
+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+-------------+------------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|           1|            2|           a|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|           1|            2|           a|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|           1|            2|           a|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|3433.0|        12.0|   70|    US|           1|            2|           a|
|         Ford Torino|17.0|        8|       302.0|     

In [85]:
# Build "car_model" by joining the car name and the model value with a single space.
df = df.withColumn(
    "car_model",
    concat(
        col("car"),
        lit(" "),
        col("model"),
    ),
)
df.show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+-------------+------------+--------------------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|first_column|second_column|third_column|           car_model|
+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+-------------+------------+--------------------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|           1|            2|           a|Chevrolet Chevell...|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|           1|            2|           a|Buick Skylark 320 70|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|           1|            2|           a|Plymouth Satellit...|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|343

In [86]:
# Rename each of the nine original columns to its lower-case form, one rename per column.
for original_name in (
    "Car", "MPG", "Cylinders",
    "Displacement", "Horsepower", "Weight",
    "Acceleration", "Model", "Origin",
):
    df = df.withColumnRenamed(original_name, original_name.lower())
df.show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+-------------+------------+--------------------+
|                 car| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|first_column|second_column|third_column|           car_model|
+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+-------------+------------+--------------------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|           1|            2|           a|Chevrolet Chevell...|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|           1|            2|           a|Buick Skylark 320 70|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|           1|            2|           a|Plymouth Satellit...|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|343

In [87]:
# Number of rows per origin; displayed through eager evaluation.
(
    df
    .groupBy("origin")
    .count()
)

origin,count
Europe,73
US,254
Japan,79


In [88]:
# Same per-origin count, printed as a text table (at most five rows).
(
    df
    .groupBy("origin")
    .count()
    .show(n=5)
)

+------+-----+
|origin|count|
+------+-----+
|Europe|   73|
|    US|  254|
| Japan|   79|
+------+-----+



In [89]:
# Number of rows per (origin, model) combination.
(
    df
    .groupBy("origin", "model")
    .count()
)

origin,model,count
Europe,71,5
Europe,80,9
Europe,79,4
Japan,75,4
US,72,18
US,80,7
Europe,74,6
Japan,79,2
Europe,76,8
US,75,20


In [90]:
# Same (origin, model) count, printed as a text table limited to five rows.
(
    df
    .groupBy("origin", "model")
    .count()
    .show(n=5)
)

+------+-----+-----+
|origin|model|count|
+------+-----+-----+
|Europe|   71|    5|
|Europe|   80|    9|
|Europe|   79|    4|
| Japan|   75|    4|
|    US|   72|   18|
+------+-----+-----+
only showing top 5 rows



In [91]:
# Remove the three constant demo columns in a single drop call.
df = df.drop("first_column", "second_column", "third_column")
df.columns

['car',
 'mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model',
 'origin',
 'car_model']

In [95]:
# Size of the whole dataset.
total_count = df.count()
print("total record count:", total_count)

# Size of the subset whose origin equals "Europe".
europe_filtered_count = df.where(col("origin") == "Europe").count()
print("europe filtered record count:", europe_filtered_count)

total record count: 406
europe filtered record count: 73


In [96]:
# First five rows whose origin equals "Europe".
df.where(col("origin") == "Europe").show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|                 car| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|           car_model|
+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|Citroen DS-21 Pallas| 0.0|        4|       133.0|     115.0|3090.0|        17.5|   70|Europe|Citroen DS-21 Pal...|
|Volkswagen 1131 D...|26.0|        4|        97.0|      46.0|1835.0|        20.5|   70|Europe|Volkswagen 1131 D...|
|         Peugeot 504|25.0|        4|       110.0|      87.0|2672.0|        17.5|   70|Europe|      Peugeot 504 70|
|         Audi 100 LS|24.0|        4|       107.0|      90.0|2430.0|        14.5|   70|Europe|      Audi 100 LS 70|
|            Saab 99e|25.0|        4|       104.0|      95.0|2375.0|        17.5|   70|Europe|         Saab 99e 70|
+--------------------+----+---------+------------+----------+------+----

In [100]:
# Size of the whole dataset.
total_count = df.count()
print("total record count:", total_count)

# Size of the subset that is both European and four-cylinder.
# Each comparison is parenthesised because & binds tighter than ==.
europe_filtered_count = df.where(
    (col("origin") == "Europe")
    & (col("cylinders") == 4)
).count()
print("europe filtered record count:", europe_filtered_count)

total record count: 406
europe filtered record count: 66


In [101]:
# First five European four-cylinder rows.
df.where(
    (col("origin") == "Europe")
    & (col("cylinders") == 4)
).show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|                 car| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|           car_model|
+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|Citroen DS-21 Pallas| 0.0|        4|       133.0|     115.0|3090.0|        17.5|   70|Europe|Citroen DS-21 Pal...|
|Volkswagen 1131 D...|26.0|        4|        97.0|      46.0|1835.0|        20.5|   70|Europe|Volkswagen 1131 D...|
|         Peugeot 504|25.0|        4|       110.0|      87.0|2672.0|        17.5|   70|Europe|      Peugeot 504 70|
|         Audi 100 LS|24.0|        4|       107.0|      90.0|2430.0|        14.5|   70|Europe|      Audi 100 LS 70|
|            Saab 99e|25.0|        4|       104.0|      95.0|2375.0|        17.5|   70|Europe|         Saab 99e 70|
+--------------------+----+---------+------------+----------+------+----

In [102]:
# Unique values of the origin column; displayed through eager evaluation.
(
    df
    .select("origin")
    .distinct()
)

origin
Europe
US
Japan


In [103]:
# Unique values of the origin column, printed as a text table.
(
    df
    .select("origin")
    .distinct()
    .show()
)

+------+
|origin|
+------+
|Europe|
|    US|
| Japan|
+------+



In [104]:
# Unique (origin, model) combinations.
(
    df
    .select("origin", "model")
    .distinct()
)

origin,model
Europe,71
Europe,80
Europe,79
Japan,75
US,72
US,80
Europe,74
Japan,79
Europe,76
US,75


In [106]:
# Ten rows with the fewest cylinders (ascending sort).
df.sort("cylinders").show(n=10)

+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|                 car| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|           car_model|
+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|     Mazda RX2 Coupe|19.0|        3|        70.0|      97.0|2330.0|        13.5|   72| Japan|  Mazda RX2 Coupe 72|
|           Mazda RX3|18.0|        3|        70.0|      90.0|2124.0|        13.5|   73| Japan|        Mazda RX3 73|
|          Mazda RX-4|21.5|        3|        80.0|     110.0|2720.0|        13.5|   77| Japan|       Mazda RX-4 77|
|       Mazda RX-7 GS|23.7|        3|        70.0|     100.0|2420.0|        12.5|   80| Japan|    Mazda RX-7 GS 80|
|        Datsun PL510|27.0|        4|        97.0|      88.0|2130.0|        14.5|   71| Japan|     Datsun PL510 71|
| Toyota Corolla 1200|31.0|        4|        71.0|      65.0|1773.0|    

In [107]:
# Ten rows with the most cylinders (descending sort).
df.orderBy(col("cylinders").desc()).show(n=10)

+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|                 car| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|           car_model|
+--------------------+----+---------+------------+----------+------+------------+-----+------+--------------------+
|Chevrolet Chevell...| 0.0|        8|       350.0|     165.0|4142.0|        11.5|   70|    US|Chevrolet Chevell...|
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|Chevrolet Chevell...|
|Buick Estate Wago...|14.0|        8|       455.0|     225.0|3086.0|        10.0|   70|    US|Buick Estate Wago...|
|    Ford Torino (sw)| 0.0|        8|       351.0|     153.0|4034.0|        11.0|   70|    US| Ford Torino (sw) 70|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|Plymouth Satellit...|
|Plymouth Satellit...| 0.0|        8|       383.0|     175.0|4166.0|    

In [108]:
# Rows per origin, largest group first.
(
    df
    .groupBy("origin")
    .count()
    .orderBy(col("count").desc())
)

origin,count
US,254
Japan,79
Europe,73


union() 
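- it is used to merge two dataframes with the same structure (schema)
- columns are matched by position, not by name, so differently ordered columns are merged silently; it returns an error when the two dataframes have a different number of columns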

unionAll() 
- this function was deprecated in spark 2.0.0 in favour of union(); since spark 3.0 it is no longer deprecated and is simply an alias for union()

unionByName()
- this function is used to merge two dataframes by matching columns on name rather than on position

In [109]:
# Split out two origins and stack them back together with union().
europe_cars = df.where(col("origin") == "Europe")
japan_cars = df.where(col("origin") == "Japan")
union = europe_cars.union(japan_cars)

# Report the size of each part and of the combined frame.
for label, frame in (
    ("europe cars:", europe_cars),
    ("japan cars:", japan_cars),
    ("after union:", union),
):
    print(label, frame.count())

europe cars: 73
japan cars: 79
after union: 152


In [110]:
# Two single-row frames with the same column names in a different order.
df1 = spark.createDataFrame(data=[[1, 2, 3]], schema=["col0", "col1", "col2"])
df2 = spark.createDataFrame(data=[[4, 5, 6]], schema=["col1", "col2", "col0"])

# unionByName lines the columns up by name before stacking the rows.
(
    df1
    .unionByName(df2)
    .show()
)

+----+----+----+
|col0|col1|col2|
+----+----+----+
|   1|   2|   3|
|   6|   4|   5|
+----+----+----+



In [111]:
# Same two single-row frames as in the unionByName example.
df1 = spark.createDataFrame(data=[[1, 2, 3]], schema=["col0", "col1", "col2"])
df2 = spark.createDataFrame(data=[[4, 5, 6]], schema=["col1", "col2", "col0"])

# union stacks rows purely by column position; df2's column names are ignored.
(
    df1
    .union(df2)
    .show()
)

+----+----+----+
|col0|col1|col2|
+----+----+----+
|   1|   2|   3|
|   4|   5|   6|
+----+----+----+



In [113]:
# List every name exposed by the pyspark.sql.functions module (exploration only; the output is long).
dir(functions)

['Any',
 'ArrayType',
 'Callable',
 'Column',
 'DataFrame',
 'DataType',
 'Dict',
 'Iterable',
 'JVMView',
 'List',
 'Optional',
 'PandasUDFType',
 'PySparkTypeError',
 'PySparkValueError',
 'SparkContext',
 'StringType',
 'StructType',
 'TYPE_CHECKING',
 'Tuple',
 'Type',
 'Union',
 'UserDefinedFunction',
 'UserDefinedTableFunction',
 'ValuesView',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_create_column_from_literal',
 '_create_lambda',
 '_create_py_udf',
 '_create_py_udtf',
 '_from_numpy_type',
 '_get_jvm_function',
 '_get_lambda_parameters',
 '_invoke_binary_math_function',
 '_invoke_function',
 '_invoke_function_over_columns',
 '_invoke_function_over_seq_of_columns',
 '_invoke_higher_order_function',
 '_options_to_str',
 '_test',
 '_to_java_column',
 '_to_seq',
 '_unresolved_named_lambda_variable',
 'abs',
 'acos',
 'acosh',
 'add_months',
 'aes_decrypt',
 'aes_encrypt',
 'aggregate',
 'any_value',
 'approxC

In [116]:
# Print the built-in documentation for the substring function.
help(substring)

Help on function substring in module pyspark.sql.functions:

substring(str: 'ColumnOrName', pos: int, len: int) -> pyspark.sql.column.Column
    Substring starts at `pos` and is of length `len` when str is String type or
    returns the slice of byte array that starts at `pos` in byte and is of length `len`
    when str is Binary type.

    .. versionadded:: 1.5.0

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    Notes
    -----
    The position is not zero based, but 1 based index.

    Parameters
    ----------
    str : :class:`~pyspark.sql.Column` or str
        target column to work on.
    pos : int
        starting position in str.
    len : int
        length of chars.

    Returns
    -------
    :class:`~pyspark.sql.Column`
        substring of given value.

    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
    [Row(s='ab')]



In [117]:
# Show the car name next to its lower-case and upper-case versions.
df.select(
    col("car"),
    lower("car"),
    upper("car"),
)

car,lower(car),upper(car)
Chevrolet Chevell...,chevrolet chevell...,CHEVROLET CHEVELL...
Buick Skylark 320,buick skylark 320,BUICK SKYLARK 320
Plymouth Satellite,plymouth satellite,PLYMOUTH SATELLITE
AMC Rebel SST,amc rebel sst,AMC REBEL SST
Ford Torino,ford torino,FORD TORINO
Ford Galaxie 500,ford galaxie 500,FORD GALAXIE 500
Chevrolet Impala,chevrolet impala,CHEVROLET IMPALA
Plymouth Fury iii,plymouth fury iii,PLYMOUTH FURY III
Pontiac Catalina,pontiac catalina,PONTIAC CATALINA
AMC Ambassador DPL,amc ambassador dpl,AMC AMBASSADOR DPL


In [120]:
# Smallest and largest weight in the dataset. The aggregates are called through
# the functions module so this cell does not rely on min/max being imported
# over Python's builtins.
df.select(
    functions.min("weight"),
    functions.max("weight"),
)

min(weight),max(weight)
1613.0,5140.0


In [121]:
# Arithmetic on aggregate columns: add 10 to both the minimum and the maximum weight.
# The aggregates are called through the functions module rather than the
# imported names that shadow Python's builtin min/max.
df.select(
    functions.min("weight") + lit(10),
    functions.max("weight") + lit(10),
)

(min(weight) + 10),(max(weight) + 10)
1623.0,5150.0


In [129]:
# Small lookup frame: car id -> car name.
cars_df = spark.createDataFrame(
    data=[[1, "car a"], [2, "car b"], [3, "car c"]],
    schema=["id", "car_name"],
)
cars_df.show()

# Small lookup frame: car id -> car price.
car_price_df = spark.createDataFrame(
    data=[[1, 1000], [2, 2000], [3, 3000]],
    schema=["id", "car_price"],
)
car_price_df.show()

+---+--------+
| id|car_name|
+---+--------+
|  1|   car a|
|  2|   car b|
|  3|   car c|
+---+--------+

+---+---------+
| id|car_price|
+---+---------+
|  1|     1000|
|  2|     2000|
|  3|     3000|
+---+---------+



In [130]:
# Joining without a condition pairs every car with every price (3 x 3 = 9 rows).
# crossJoin states that intent explicitly, so the Cartesian product cannot be
# mistaken for a forgotten join key.
cars_df.crossJoin(car_price_df)

id,car_name,id.1,car_price
1,car a,1,1000
1,car a,2,2000
1,car a,3,3000
2,car b,1,1000
2,car b,2,2000
2,car b,3,3000
3,car c,1,1000
3,car c,2,2000
3,car c,3,3000


In [131]:
# Inner join on the shared "id" column; joining by column name keeps a single
# id column, giving id, car_name and car_price.
cars_df.join(
    car_price_df,
    on="id",
    how="inner",
)

id,car_name,car_price
1,car a,1000
2,car b,2000
3,car c,3000
