# CSV Files

## Pandas

In [1]:
import pandas as pd

# Location of the Titanic dataset, relative to this notebook
filepath = '../data/titanic.csv'

# Load the CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=filepath)

**To view the top 5 rows**

In [2]:
# Preview the first five rows (5 is the default for head())
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Save dataframe to CSV**

In [3]:
# index=False keeps the auto-generated row index out of the file; otherwise it
# is written as an extra unnamed first column that shows up as 'Unnamed: 0'
# when the file is read back with pd.read_csv().
df.to_csv('file-pandasDF.csv', index=False)

## PySpark

In [4]:
# SparkSession is the entry point used throughout this section
from pyspark.sql import SparkSession

# Build the session for the CSV examples (getOrCreate() reuses an
# already-running session if there is one)
spark = (
    SparkSession.builder
    .master('local')
    .appName('csvFileHandling')
    .config('spark.executor.memory', '1gb')
    .config("spark.cores.max", "2")
    .getOrCreate()
)

# Handle to the underlying SparkContext
sc = spark.sparkContext

In [5]:
# CSV file to load with Spark (path is relative to this notebook)
filepath = "../data/titanic.csv"

In [6]:
# Read the CSV through the SparkSession's built-in reader.
# The separate SQLContext entry point is deprecated, and the
# 'com.databricks.spark.csv' format name belongs to the external Spark 1.x
# package; since Spark 2.0 the CSV source ships with Spark itself.
#   header=True      -> use the first line as column names
#   inferSchema=True -> let Spark infer the column types from the data
df = spark.read.csv(filepath, header=True, inferSchema=True)

In [7]:
# Print the first five rows as a text table
df.show(n=5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+

**Save dataframe to CSV**

In [8]:
# mode='overwrite' avoids the 'path already exists' error on re-runs.
# header=True writes the column names as the first line; without it the
# saved CSV has no header row and the column names are lost.
# Note: Spark writes a directory of part files at this path, not a single file.
df.write.csv('file-pysparkDF.csv', mode='overwrite', header=True)

In [9]:
# Stop the Spark session now that the CSV examples are done
spark.stop()

# Text Files

## Pandas

In [10]:
import pandas as pd

filepath = '../data/frogFox.txt'

# Read the text file so that every non-empty line becomes one row.
# pd.read_csv(filepath, sep='\n', header=None) is not used here because
# current pandas releases reject '\n' as a separator with a ValueError
# (the line terminator cannot double as the field separator).
with open(filepath, encoding='utf-8') as fh:
    lines = [text for text in (raw.rstrip('\n') for raw in fh) if text]

# Single column labelled 0, matching what header=None would produce
df = pd.DataFrame({0: lines})

In [11]:
# Display the DataFrame (one row per line of the text file)
df

Unnamed: 0,0
0,crazy crazy fox jumped over the fence
1,crazy fox jumped
2,the fence is high for fox
3,crazy fox is smart
4,fox jumped very high


**Save as text file**

In [12]:
# Write the first column back out, one row per line, with no header and no
# index. The lines are written directly rather than through
# df.to_csv(..., sep='\n'): the CSV writer would quote/escape any line that
# contains a double quote, so the saved text could differ from the original.
with open('file-pandas.txt', 'w', encoding='utf-8') as fh:
    fh.writelines(f'{line}\n' for line in df.iloc[:, 0].astype(str))

## PySpark

In [13]:
# SparkSession is the entry point used throughout this section
from pyspark.sql import SparkSession

# Build the session for the text-file examples (getOrCreate() reuses an
# already-running session if there is one)
spark = (
    SparkSession.builder
    .master('local')
    .appName('txtFileHandling')
    .config('spark.executor.memory', '1gb')
    .config("spark.cores.max", "2")
    .getOrCreate()
)

# Handle to the underlying SparkContext
sc = spark.sparkContext

In [14]:
filepath = '../data/frogFox.txt'

# sc.textFile() already yields one string per line of the file, so no extra
# splitting on '\n' is needed. Each line is wrapped in a 1-tuple so that
# toDF() can infer a single-column schema (the column is auto-named '_1').
df = (
    sc.textFile(filepath)
    .map(lambda line: (line,))
    .toDF()
)

In [15]:
# Print the rows as a text table (show() defaults spelled out explicitly)
df.show(n=20, truncate=True)

+--------------------+
|                  _1|
+--------------------+
|crazy crazy fox j...|
|    crazy fox jumped|
|the fence is high...|
|  crazy fox is smart|
|fox jumped very high|
+--------------------+



In [16]:
# Save as plain text: one line per row, written verbatim.
# The text writer is used instead of the CSV writer because CSV output
# quotes/escapes lines that contain commas or quotes. The text writer needs
# a DataFrame with a single string column, which is what df holds here.
# mode('overwrite') avoids the 'path already exists' error on re-runs.
df.write.mode('overwrite').text('file-pysparkDF.txt')

In [17]:
# Stop the Spark session now that the text-file examples are done
spark.stop()

# Parquet Files

## Pandas

In [18]:
import pandas as pd

# Location of the Titanic dataset, relative to this notebook
filepath = '../data/titanic.csv'

# Load the CSV into a pandas DataFrame; it is saved as parquet below
df = pd.read_csv(filepath_or_buffer=filepath)

In [19]:
# Preview the first five rows (5 is the default for head())
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
# Write the DataFrame out in parquet format
df.to_parquet("file-pandas.parquet")

In [21]:
# Load the parquet file back into a new DataFrame.
# Notes:
#   - pass engine='pyarrow' when reading a parquet file saved using PySpark
#   - pyarrow has to be installed for that: "pip install pyarrow"
df1 = pd.read_parquet(path='file-pandas.parquet')

In [22]:
# Preview the first five rows read back from parquet
df1.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## PySpark

In [23]:
# SparkSession is the entry point used throughout this section
from pyspark.sql import SparkSession

# Build the session for the parquet examples (getOrCreate() reuses an
# already-running session if there is one).
# The application name now matches this section; it previously carried the
# name copied over from the text-file section.
spark = (
    SparkSession.builder
    .master('local')
    .appName('parquetFileHandling')
    .config('spark.executor.memory', '1gb')
    .config("spark.cores.max", "2")
    .getOrCreate()
)

# Handle to the underlying SparkContext
sc = spark.sparkContext

In [24]:
# CSV file to load with Spark (path is relative to this notebook)
filepath = "../data/titanic.csv"

In [25]:
from pyspark.sql import SQLContext

# SQLContext is the legacy (pre-SparkSession) entry point. It is still
# created here so that any later cell referring to `sqlContext` keeps working.
sqlContext = SQLContext(sc)

# Read the CSV through the SparkSession's built-in reader.
# The 'com.databricks.spark.csv' format name belongs to the external
# Spark 1.x package; since Spark 2.0 the CSV source ships with Spark itself.
#   header=True      -> use the first line as column names
#   inferSchema=True -> let Spark infer the column types from the data
df = spark.read.csv(filepath, header=True, inferSchema=True)

In [26]:
# Print the first five rows as a text table
df.show(n=5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+

In [27]:
# Write the DataFrame in parquet format, replacing any earlier output
df.write.mode('overwrite').parquet('file-pyspark.parquet')

In [28]:
# Read the parquet output back through the SparkSession reader
# (the SQLContext entry point is deprecated in favour of SparkSession).
df1 = spark.read.parquet('file-pyspark.parquet')

In [29]:
# Print the first five rows read back from parquet
df1.show(n=5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+