In [1]:
from pyspark.sql import SparkSession

In [5]:
# Create the SparkSession for this notebook, or reuse one that already
# exists; 'Practice' is the application name passed to the builder.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [6]:
# Evaluate the session object as the cell's last expression so the notebook
# displays it (a quick check that the session was created).
spark

In [11]:
## Read the dataset (first way: configure the reader with .option() calls)
# header      -> take the column names from the first line of the file
# inferSchema -> let Spark guess each column's data type from the data
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('Test1.csv')
)
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Illia| 20|        10|
| Artem| 21|         8|
|  Vlad| 30|        20|
| Lesia| 45|        10|
|Nastya| 20|         2|
+------+---+----------+



In [12]:
## Check the schema: prints each column's name, inferred type and nullability
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
#### Second way to read the dataset: pass the reader settings straight to
#### csv() as keyword arguments instead of chaining .option() calls.
df_pyspark = spark.read.csv(
    'Test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Illia| 20|        10|
| Artem| 21|         8|
|  Vlad| 30|        20|
| Lesia| 45|        10|
|Nastya| 20|         2|
+------+---+----------+



In [14]:
## Check the schema again to confirm the keyword-argument read inferred
## the same column types as the .option() read above.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [15]:
### Get the column names (returned as a plain Python list of strings)
df_pyspark.columns

['Name', 'Age', 'Experience']

In [16]:
### .head(n) returns the first n rows as a Python list of Row objects,
### not as a DataFrame.
df_pyspark.head(3)

[Row(Name='Illia', Age=20, Experience=10),
 Row(Name='Artem', Age=21, Experience=8),
 Row(Name='Vlad', Age=30, Experience=20)]

In [18]:
### Select a single column; select() returns a new DataFrame, which is
### then displayed with show().
df_pyspark.select(df_pyspark['Name']).show()

+------+
|  Name|
+------+
| Illia|
| Artem|
|  Vlad|
| Lesia|
|Nastya|
+------+



In [19]:
## Select multiple columns by passing each column name as its own argument
df_pyspark.select('Name', 'Experience').show()

+------+----------+
|  Name|Experience|
+------+----------+
| Illia|        10|
| Artem|         8|
|  Vlad|        20|
| Lesia|        10|
|Nastya|         2|
+------+----------+



In [20]:
## Check the data types: a list of (column name, type string) pairs
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [21]:
## Summary statistics (count, mean, stddev, min, max) for every column.
## mean/stddev come back as null for the string column Name.
df_pyspark.describe().show()

+-------+-----+-----------------+----------------+
|summary| Name|              Age|      Experience|
+-------+-----+-----------------+----------------+
|  count|    5|                5|               5|
|   mean| null|             27.2|            10.0|
| stddev| null|10.80277742064512|6.48074069840786|
|    min|Artem|               20|               2|
|    max| Vlad|               45|              20|
+-------+-----+-----------------+----------------+



In [24]:
### Adding a column: withColumn() returns a new DataFrame that carries an
### extra column computed from the existing 'Experience' column.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)

In [26]:
# Display the DataFrame to verify that the new column was added.
df_pyspark.show()

+------+---+----------+------------------------+
|  Name|Age|Experience|Experience After 2 years|
+------+---+----------+------------------------+
| Illia| 20|        10|                      12|
| Artem| 21|         8|                      10|
|  Vlad| 30|        20|                      22|
| Lesia| 45|        10|                      12|
|Nastya| 20|         2|                       4|
+------+---+----------+------------------------+



In [29]:
#### Drop the column
# Use exactly the name that was given to withColumn(), including its case.
# Spark matches column names case-insensitively only while
# spark.sql.caseSensitive is false (the default), and drop() is a silent
# no-op for a name it cannot find, so a mis-cased name would leave the
# column in place without raising any error.
df_pyspark = df_pyspark.drop('Experience After 2 years')

In [30]:
# Display the DataFrame to verify that the extra column is gone.
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Illia| 20|        10|
| Artem| 21|         8|
|  Vlad| 30|        20|
| Lesia| 45|        10|
|Nastya| 20|         2|
+------+---+----------+



In [32]:
#### Rename a column: withColumnRenamed() returns a new DataFrame in which
#### the existing column 'Name' is called 'New Name'.
df_pyspark = df_pyspark.withColumnRenamed(
    existing='Name',
    new='New Name',
)


In [35]:
# Display the DataFrame to verify that the column was renamed.
df_pyspark.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|   Illia| 20|        10|
|   Artem| 21|         8|
|    Vlad| 30|        20|
|   Lesia| 45|        10|
|  Nastya| 20|         2|
+--------+---+----------+

