# Connecting and inserting data from CSV in Hadoop to MongoDB

This notebook reads the tweets CSV file stored in Hadoop (HDFS) with PySpark and inserts its rows into a MongoDB collection.

Start PySpark with the following command so that the MongoDB Connector for Spark is available to the session:
<code>pyspark --packages org.mongodb.spark:mongo-spark-connector_2.12:10.1.1</code>
- Uses MongoDB 7.0.3
- Uses PySpark 3.2.4
- Uses Mongo Spark Connector 10.1.1 compatible with Spark 3.2.4 and MongoDB 6 onwards


In [1]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, BooleanType

# Read the tweets CSV from Hadoop into a DataFrame; a later cell saves it to MongoDB.
dataPath = "/CA2/data/ProjectTweets.csv"

# Column names and types are supplied explicitly because the file is read
# without a header row (header is left unset below).
# NOTE(review): the first column is named "_id", presumably so MongoDB uses it
# as the document key — confirm this is intended.
schema = StructType([
    StructField("_id", IntegerType(), True),
    StructField("tweet_id", StringType(), True),
    StructField("date", StringType(), True),
    StructField("flag", StringType(), True),
    StructField("user", StringType(), True),
    StructField("tweet", StringType(), True),
])

# Parsing options:
#   multiline - allow a quoted field to span several physical lines
#   quote     - fields are quoted with a double quote
#   escape    - a double quote inside a quoted field is escaped by another
#               double quote ("")
# The escape option used to be set twice (first to a backslash, then to '"').
# Spark keeps only the last value for a repeated option key, so the backslash
# setting never took effect; only the effective value is kept here.
dataset = (
    spark.read
    .option("multiline", "true")
    .option("quote", '"')
    .option("escape", '"')
    .csv(dataPath, header=None, schema=schema)
)

# Preview the first two rows to sanity-check the parse.
dataset.show(2)

+---+----------+--------------------+--------+---------------+--------------------+
|_id|  tweet_id|                date|    flag|           user|               tweet|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 2 rows



In [2]:
# Settings for writing the DataFrame to MongoDB through the Mongo Spark
# Connector (format "mongodb", connector 10.x).
write_config = {
    "uri": "mongodb://localhost:27017/CA2.raw_tweets?retryWrites=true&w=majority",
    "database": "CA2",
    "collection": "raw_tweets",
    "writeConcern.w": "majority"
}

# Connector 10.x reads the connection string from the "connection.uri" option.
# The legacy "uri" option name is not a documented 10.x setting, so passing the
# URI under "uri" would leave the connector on its default connection string
# instead of the one configured above.
# NOTE(review): verify against the connector version in use.
(
    dataset.write
    .format("mongodb")
    .mode("append")
    .option("connection.uri", write_config["uri"])
    .option("database", write_config["database"])
    .option("collection", write_config["collection"])
    .option("writeConcern.w", write_config["writeConcern.w"])
    .save()
)


2023-11-11 00:28:58,070 WARN util.CaseInsensitiveStringMap: Converting duplicated key writeconcern.w into CaseInsensitiveStringMap.
                                                                                

In [3]:
# Number of rows in the DataFrame read from the CSV
# (this counts the source data; it does not query MongoDB).
dataset.count()

                                                                                

1600000