In [19]:
import os
import warnings

from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one named "myApp".
session_builder = SparkSession.builder.appName("myApp")
spark = session_builder.getOrCreate()

In [20]:
# Read ProjectTweets.csv (no header row) from the '/user1' folder into a Spark DataFrame.
# The schema is declared up front instead of being inferred: inferSchema=True makes Spark
# scan the whole file once just to guess the types. The types below are the ones that
# inference produced for this file (int, long, then four strings), and the default
# _c0.._c5 names are kept so the following cells work unchanged.
tweet_schema = "_c0 INT, _c1 BIGINT, _c2 STRING, _c3 STRING, _c4 STRING, _c5 STRING"
df = spark.read.csv('/user1/ProjectTweets.csv', header=False, schema=tweet_schema)

                                                                                

In [21]:
# Print the column names and types of the freshly loaded DataFrame.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [22]:
# Preview the first five raw rows (long values are truncated by show()).
df.show(n=5)

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [23]:
# Remove the columns that are not used later on (only _c1 at the moment).
unused_columns = ['_c1']
df = df.drop(*unused_columns)

In [24]:
# Give the remaining positional columns descriptive names.
# withColumnRenamed is a no-op when the old name is absent, so the cell can be re-run safely.
column_names = {
    '_c0': 'id',
    '_c2': 'date',
    '_c3': 'flag',
    '_c4': 'user',
    '_c5': 'text',
}
for old_name, new_name in column_names.items():
    df = df.withColumnRenamed(old_name, new_name)

In [25]:
# Print the schema again to check the column names and types after the drop/rename steps.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



In [26]:
# Preview the first rows of the data under the new column names.
preview_rows = 5
df.show(preview_rows)

+---+--------------------+--------+---------------+--------------------+
| id|                date|    flag|           user|                text|
+---+--------------------+--------+---------------+--------------------+
|  0|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [27]:
# Persist the tweets to MongoDB: database "tweet_mongo", collection "tweet_collection".
# The "mongodb" data source (MongoDB Spark Connector 10.x) takes its connection string
# from the "connection.uri" option. A bare "uri" option is not a recognised key there,
# so the connector would silently fall back to its default mongodb://localhost:27017/.
# NOTE(review): confirm that mongod really listens on 127.0.0.1:27001.
# NOTE: mode("append") inserts the rows again every time this cell is re-run.
(
    df.write.format("mongodb")
    .option("connection.uri", "mongodb://127.0.0.1:27001/")
    .option("database", "tweet_mongo")
    .option("collection", "tweet_collection")
    .mode("append")
    .save()
)

                                                                                

In [28]:
# MySQL JDBC connection: local server, database "tweet_mysql".
jdbc_url = "jdbc:mysql://127.0.0.1:3306/tweet_mysql"

# Credentials are taken from the environment so that no password is stored in the notebook.
# Export MYSQL_PASSWORD (and optionally MYSQL_USER, default "root") before starting the kernel.
mysql_password = os.environ.get("MYSQL_PASSWORD")
if mysql_password is None:
    warnings.warn("MYSQL_PASSWORD is not set; connecting to MySQL with an empty password")
    mysql_password = ""

# Connection properties handed to Spark's JDBC writer.
properties = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": mysql_password,
    "driver": "com.mysql.cj.jdbc.Driver"
}

In [29]:
# Write the tweets to the MySQL table "tweets", replacing the table if it already exists.
mysql_writer = df.write.mode("overwrite")
mysql_writer.jdbc(url=jdbc_url, table="tweets", properties=properties)

                                                                                

In [30]:
# Report the dataset shape as (rows, columns); count() triggers a Spark job.
row_count = df.count()
column_count = len(df.columns)
print('Shape of dataset:', (row_count, column_count))

Shape of dataset: (1600000, 5)


[Stage 12:>                                                         (0 + 2) / 2]                                                                                

In [31]:
# Show count / mean / stddev / min / max for every column.
# For string columns min and max are lexicographic, and mean/stddev are only
# meaningful where the values happen to parse as numbers.
summary_stats = df.describe()
summary_stats.show()



+-------+------------------+--------------------+--------+-------------------+--------------------+
|summary|                id|                date|    flag|               user|                text|
+-------+------------------+--------------------+--------+-------------------+--------------------+
|  count|           1600000|             1600000| 1600000|            1600000|             1600000|
|   mean|          799999.5|                null|    null|4.325887521835714E9|                null|
| stddev|461880.35968924535|                null|    null|5.16273321845489E10|                null|
|    min|                 0|Fri Apr 17 20:30:...|NO_QUERY|       000catnap000|                 ...|
|    max|           1599999|Wed May 27 07:27:...|NO_QUERY|         zzzzeus111|ï¿½ï¿½ï¿½ï¿½ï¿½ß§...|
+-------+------------------+--------------------+--------+-------------------+--------------------+



                                                                                

In [39]:

!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run jdbc -P /home/hduser/ycsb-0.17.0/workloads/workloada -p db.url=jdbc:mysql://localhost:3306/tweet_mysql -p db.user=root -p db.passwd=kalem -p db.driver=com.mysql.cj.jdbc.Driver

/usr/bin/java  -classpath /home/hduser/ycsb-0.17.0/conf:/home/hduser/ycsb-0.17.0/lib/HdrHistogram-2.1.4.jar:/home/hduser/ycsb-0.17.0/lib/core-0.17.0.jar:/home/hduser/ycsb-0.17.0/lib/htrace-core4-4.1.0-incubating.jar:/home/hduser/ycsb-0.17.0/lib/jackson-core-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/lib/jackson-mapper-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/conf:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-collections-3.2.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-lang-2.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-pool-1.5.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/geronimo-jms_1.1_spec-1.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/geronimo-jta_1.1_spec-1.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/jdbc-binding-0.17.0.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/mysql-connector-j-8.0.33.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/openjpa-jdbc-2.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/openjpa-kernel-2.1.1.jar:/home/hdus

In [None]:
bin/ycsb.sh run mongodb -P workloads/workloada -p mongodb.url=mongodb://localhost:27017 -p mongodb.database=mydatabase -p mongodb.user=myuser -p mongodb.password=mypassword
