In [None]:
import pyspark
from pyspark.sql import SparkSession
# Build the Spark session used by every later cell; getOrCreate() reuses a
# session that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('SimpleDF')
    .getOrCreate()
)

In [None]:
# Column names applied to the sample data when it becomes a DataFrame.
cols = ['currency', 'value']

# Sample rows: one (currency name, integer value) tuple per row.
inputdata = [
    ('Euro', 90),
    ('Pound', 100),
    ('Yuan', 11),
    ('Yen', 2),
    ('US Dollar', 84),
    ('K Dinar', 242),
]

### Creating DataFrame (df) using RDD (parallelize method)

In [None]:
# Hand the local Python list to the SparkContext to get an RDD.
rdd = (
    spark
    .sparkContext
    .parallelize(inputdata)
)

# Bring every element back to the driver so the cell displays the RDD contents.
rdd.collect()

In [None]:
# Convert the RDD to a DataFrame. No column names are supplied, so the columns
# get Spark's default positional names ("_1", "_2", ...).
df = rdd.toDF()

# withColumnRenamed(existing_name, new_name) returns a DataFrame with that one
# column renamed. The result is only displayed here; `df` is not reassigned.
(
    df
    .withColumnRenamed("_1", "Currency")
    .show()
)

## Create DataFrame using createDataFrame method

In [None]:
# Step 1: createDataFrame(rdd) builds a DataFrame from the RDD.
# Step 2: toDF(*cols) assigns the names in `cols` to its columns, in order.
df = (
    spark
    .createDataFrame(rdd)
    .toDF(*cols)
)
df.show()

In [None]:
# Build the DataFrame straight from the Python list, giving the column names
# as the schema argument.
df = spark.createDataFrame(inputdata, cols)
df.printSchema()

In [None]:
# Schema written as a DDL string: each column name followed by its data type.
myschema = "`Currency` STRING, `Value` INT"

df = spark.createDataFrame(inputdata, myschema)

# printSchema() displays the column names with their corresponding data types.
df.printSchema()

### Create DataFrame using the `spark.read` reader (e.g. `spark.read.csv`)

In [None]:
# Read the CSV with its first line as the header and with column types
# inferred from the data. Change the path to your own location.
df = spark.read.csv(
    'D:/Dataset/FIFA2022.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

In [None]:
# Each helper is imported once (regexp_extract was previously listed twice).
from pyspark.sql.functions import col, expr, regexp_extract, udf
from pyspark.sql.types import IntegerType

# Load the movie CSV: first line is the header, column types are inferred.
movie = spark.read.csv('D:/Dataset/movie.csv', header=True, inferSchema=True)  # change path to your location
movie.printSchema()

In [None]:
# Display the first 20 rows (show()'s default) without truncating long values.
movie.show(n=20, truncate=False)

In [None]:
# select() picks column(s) from the DataFrame; here it is used to count how
# many different values the 'type' column holds.
(
    movie
    .select('type')
    .distinct()
    .count()
)

In [None]:
# Display only the 'movieId' and 'type' columns.
(
    movie
    .select('movieId', 'type')
    .show()
)

In [None]:
# col('movieId') refers to the movieId column as an expression; between(20, 30)
# keeps the rows whose id lies in that range, bounds included.
(
    movie
    .where(col('movieId').between(20, 30))
    .select('movieId', 'type')
    .show()
)

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Location of FIFA2022.csv on the author's machine; point this at your own copy
# (an earlier cell reads the same file name from 'D:/Dataset/FIFA2022.csv').
f_path = 'D:/deepak/daytoday/Sept 2022/ABD/QPs/Semester/Lab/Datasets/FIFA2022.csv'

# Read with a header row and inferred column types, then drop every row that
# contains a null value.
fifa_df = (
    spark.read
    .csv(f_path, header=True, inferSchema=True)
    .na.drop()
)
# FIFA World Cup Qatar 2022 group-stage draw, keyed by group letter.
# Team spellings are kept exactly as this notebook matches them against the
# 'Team' column. Built once here so it is not re-created for every row.
_WORLD_CUP_2022_GROUPS = {
    'A': ('Netherlands', 'Senegal', 'Ecuador', 'Qatar'),
    'B': ('England', 'United States', 'Iran', 'Wales'),
    'C': ('Argentina', 'Mexico', 'Poland', 'Saudi Arabia'),
    'D': ('France', 'Denmark', 'Tunisia', 'Australia'),
    'E': ('Spain', 'Germany', 'Japan', 'Costa Rica'),
    'F': ('Belgium', 'Croatia', 'Morocco', 'Canada'),
    'G': ('Brazil', 'Switzerland', 'Serbia', 'Cameroon'),
    'H': ('Portugal', 'Uruguay', 'South Korea', 'Ghana'),
}


def mapGroup(country):
    """Return the 2022 World Cup group letter for a team.

    Parameters
    ----------
    country : str
        Team name, spelled exactly as in the lookup table (matching is
        case-sensitive, e.g. 'United States', 'South Korea').

    Returns
    -------
    str or None
        One of 'A'..'H', or None when the name is not in the table. When this
        function is wrapped as a Spark UDF, None becomes a null in the column.

    Notes
    -----
    The letters follow the official draw. Earlier revisions of this cell had
    the labels for groups B/H and E/G swapped.
    """
    for group_letter, teams in _WORLD_CUP_2022_GROUPS.items():
        # Membership test on a tuple works for any input value, including None.
        if country in teams:
            return group_letter
    return None
    
# Wrap mapGroup as a Spark UDF that returns a string column.
group_udf = udf(f=mapGroup, returnType=StringType())

# Add a 'Group' column computed from each row's 'Team' value.
fifa_group = fifa_df.withColumn(
    'Group',
    group_udf(col('Team')),
)

In [None]:
# Display each team next to the group label assigned to it.
(
    fifa_group
    .select('Team', 'Group')
    .show()
)