In [None]:
import urllib.parse

import pyspark

import config

In [None]:
# Obtain the notebook's SparkSession: reuse the active one if it exists,
# otherwise start a new session with the default builder settings.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Load the source table from MySQL over JDBC.
# Port, database, table name and credentials are read from `config.MySQL`;
# the host name `source-db` is fixed in the URL below.
mysql_url = f'jdbc:mysql://source-db:{config.MySQL.port}/{config.MySQL.database}'
mysql_properties = dict(
    user=config.MySQL.user,
    password=config.MySQL.password,
    driver='com.mysql.cj.jdbc.Driver',
)
df = spark.read.jdbc(
    url=mysql_url,
    table=config.MySQL.table,
    properties=mysql_properties,
)

# To experiment on a smaller sample, cap the rows first:
# df = df.limit(1_000_000)

# Preview the first 10 rows to confirm the read worked.
df.show(n=10)

In [None]:
# Inspect the schema Spark derived for the table just read (printed as a tree),
# to check column names and types before cleaning.
df.printSchema()

In [None]:
# Summary statistics for the dataset, computed by `describe()` and printed
# with `show()`'s default limits (20 rows, long values truncated).
(
    df
    .describe()
    .show(n=20, truncate=True)
)

In [None]:
# Remove rows with missing values, but only in the columns the rest of the
# notebook actually keeps. A bare `dropna()` also discards rows whose only
# nulls sit in columns that are thrown away by the following `select`, which
# silently loses otherwise usable tweets.
# NOTE(review): if dropping on *any* null column was intentional, revert to `df.dropna()`.
df = df.dropna(subset=['createdAt', 'userName', 'text'])

In [None]:
# Keep just the three columns used downstream: timestamp, author and tweet text.
df = df.select('createdAt', 'userName', 'text')

In [None]:
# Normalise the tweet text in two passes over the `text` column:
#   1. every user mention becomes the placeholder '@user'
#   2. every http/https URL becomes the placeholder 'http'
# The patterns are Java regular expressions (Spark evaluates them on the JVM).
df = df.withColumn(
    # Mention pattern:
    #   (?<!\w)  the '@' must not follow a word character, so the '@' inside
    #            e-mail-like tokens ("me@mail.com") is left alone
    #   @\S+     '@' followed by at least one non-space character, so a lone
    #            '@' is no longer rewritten to '@user'
    colName= 'text',
    col    = pyspark.sql.functions.regexp_replace(
        pyspark.sql.functions.col('text'),
        pattern    = r'(?<!\w)@\S+',
        replacement= '@user'
    )
).withColumn(
    # replacing http/https urls with 'http' (everything up to the next whitespace)
    colName = 'text',
    col     = pyspark.sql.functions.regexp_replace(
        pyspark.sql.functions.col('text'),
        pattern    = r'http\S*',
        replacement= 'http'
    )
)

In [None]:
# Preview the first 10 rows of the current DataFrame.
df.show(n=10)

In [None]:
# Write the DataFrame to MongoDB through the Spark `mongodb` data source.
#
# The user name and password are percent-encoded before being embedded in the
# connection URI: reserved characters such as '@', ':', '/', '+' or '%' in a
# raw credential would otherwise be parsed as URI syntax and break the
# connection string.
# NOTE(review): assumes config.MongoDb holds raw (not already percent-encoded)
# credentials — confirm, otherwise they would be encoded twice.
mongo_user     = urllib.parse.quote_plus(str(config.MongoDb.user))
mongo_password = urllib.parse.quote_plus(str(config.MongoDb.password))
mongo_uri      = f'mongodb://{mongo_user}:{mongo_password}@{config.MongoDb.host}:{config.MongoDb.port}'

# mode('overwrite') replaces the existing contents of the target collection.
( df.write
    .format('mongodb')
    .option('database'      , f'{config.MongoDb.database}')
    .option('collection'    , f'{config.MongoDb.collection}')
    .option('connection.uri', mongo_uri)
    .mode('overwrite')
    .save()
)