In [1]:
import pyspark
import json

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import lit # to add null column
from pyspark.sql.functions import monotonically_increasing_id

# pyspark init ===========================================
# Build (or reuse, via getOrCreate) a SparkSession on the 'local' master,
# named 'ML-learning', configured from a default SparkConf.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local')
    .appName('ML-learning')
    .config(conf=pyspark.SparkConf())
    .getOrCreate()
)

# read json file and print schema =======================
# Load the two tweet collections from their JSON files.
disasterDf = spark.read.json("data/disaster.json")
non_disasterDf = spark.read.json("data/non_disaster.json")

# Show the schema of each frame as loaded, before any column is changed.
print("\n============== before transform ==================")
for _frame in (disasterDf, non_disasterDf):
    _frame.printSchema()

# drop language username created_at columns ============
# DataFrame.drop accepts several column names, so one call per frame removes
# all three columns.
disasterDf = disasterDf.drop("language", "username", "created_at")
non_disasterDf = non_disasterDf.drop("language", "username", "created_at")

# merge hashtags with "%20" and rename hashtags to keyword
def _join_hashtags(tags):
    """Join one tweet's hashtags into a single "%20"-separated string.

    The printed schema marks both the ``hashtags`` array and its elements as
    nullable, and ``str.join`` raises TypeError on ``None``.

    Args:
        tags: list of hashtag strings for one row, or None for a null cell.

    Returns:
        The joined string ("" for an empty list), or None when ``tags`` is
        None so the null is carried into the ``keyword`` column.
    """
    if tags is None:
        return None
    # Skip null elements instead of failing the whole Spark task.
    return "%20".join(tag for tag in tags if tag is not None)


mergeHashtags = udf(_join_hashtags, StringType())

# Add the joined "keyword" column, then remove the source array column.
disasterDf = disasterDf.withColumn("keyword", mergeHashtags(disasterDf["hashtags"]))
disasterDf = disasterDf.drop('hashtags')

non_disasterDf = non_disasterDf.withColumn("keyword", mergeHashtags(non_disasterDf["hashtags"]))
non_disasterDf = non_disasterDf.drop('hashtags')

# Apply the same three column steps to each frame as one chained expression:
#   1. rename "tweet" to "text"
#   2. add an all-null string column named "location"
#   3. add an "id" column from monotonically_increasing_id()
disasterDf = (
    disasterDf
    .withColumnRenamed("tweet", "text")
    .withColumn("location", lit(None).cast(StringType()))
    .withColumn("id", monotonically_increasing_id())
)
non_disasterDf = (
    non_disasterDf
    .withColumnRenamed("tweet", "text")
    .withColumn("location", lit(None).cast(StringType()))
    .withColumn("id", monotonically_increasing_id())
)

# rearrange order of columns ==========================
disasterDf = disasterDf.select("id", "keyword", "location", "text")
non_disasterDf = non_disasterDf.select("id", "keyword", "location", "text")

# count() launches a Spark job on every call. The frames do not change between
# the two printouts below, so count each frame once and reuse the numbers.
disaster_count = disasterDf.count()
non_disaster_count = non_disasterDf.count()
print(disaster_count, non_disaster_count)

print("\n============== after transform ===================")
disasterDf.printSchema()
non_disasterDf.printSchema()
print(disaster_count, non_disaster_count)

21/11/09 13:16:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/09 13:16:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                


root
 |-- created_at: long (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- language: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- username: string (nullable = true)

root
 |-- created_at: long (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- language: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- username: string (nullable = true)

400 400

root
 |-- id: long (nullable = false)
 |-- keyword: string (nullable = true)
 |-- location: string (nullable = true)
 |-- text: string (nullable = true)

root
 |-- id: long (nullable = false)
 |-- keyword: string (nullable = true)
 |-- location: string (nullable = true)
 |-- text: string (nullable = true)

400 400


### URL 제거

In [2]:
import re

# Matches an http(s) URL up to the next whitespace character.
# The earlier character class [a-zA-Z0-9/.] stopped at the first '-', '_',
# '?', '=', '%' and similar, so the tail of such a URL stayed in the text.
# Note: despite the variable name, this pattern targets URLs, not HTML markup.
html_regexps = re.compile(r"https?://\S+")
# UDF that deletes every match of html_regexps from a text value.
# A null cell reaches the lambda as None, and re.sub raises TypeError on None,
# so nulls are returned unchanged.
removeHtml = udf(lambda x: None if x is None else html_regexps.sub("", x), StringType())

disasterDf = disasterDf.withColumn("text", removeHtml(disasterDf['text']))
non_disasterDf = non_disasterDf.withColumn("text", removeHtml(non_disasterDf['text']))


### 이모티콘 제거

In [3]:
# Character class of emoji and emoji-related code points to delete.
#
# The previous class contained the range U+24C2-U+1F251, which covers nearly
# the whole Basic Multilingual Plane above U+24C2 (Hangul, CJK ideographs,
# kana, full-width forms, ...), so ordinary non-Latin text was deleted too.
# That range is replaced by the individual BMP emoji it was meant to cover.
emoji_pattern = re.compile(
    "["
    # All supplementary planes. This already includes the emoticon
    # (U+1F600-1F64F), pictograph (U+1F300-1F5FF), transport/map
    # (U+1F680-1F6FF) and flag (U+1F1E0-1F1FF) ranges, and U+1F251.
    "\U00010000-\U0010FFFF"
    # Box Drawing through Miscellaneous Symbols and Arrows; this includes
    # Miscellaneous Symbols (U+2600-26FF) and Dingbats (U+2700-27BF).
    "\u2500-\u2BEF"
    "\u24C2"  # circled latin capital letter M
    "\u200d"  # zero width joiner
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u231a"  # watch
    "\ufe0f"  # variation selector-16
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "]+",
    re.UNICODE,
)

# UDF that deletes every run of characters matched by emoji_pattern.
# A null cell reaches the lambda as None, and re.sub raises TypeError on None,
# so nulls are returned unchanged.
remove_emoji = udf(lambda x: None if x is None else emoji_pattern.sub("", x), StringType())
disasterDf = disasterDf.withColumn("text", remove_emoji(disasterDf["text"]))
non_disasterDf = non_disasterDf.withColumn("text", remove_emoji(non_disasterDf["text"]))

### punctuations 제거

In [4]:
import string

# Quick check of the punctuation-stripping approach on a sample string.
myStr = 'asdfa ;ja;9j2r; ok;aoisjd f;j;aoIJ R;OA2J'
# Map every character of string.punctuation to None, which translate() deletes.
table = str.maketrans(dict.fromkeys(string.punctuation))
myStr.translate(table)

'asdfa ja9j2r okaoisjd fjaoIJ ROA2J'

In [5]:
# Translation table that deletes every character in string.punctuation.
table = str.maketrans('', '', string.punctuation)
# A null cell reaches the lambda as None, and None has no translate(), so
# nulls are returned unchanged instead of raising AttributeError.
remove_punctuation = udf(lambda x: None if x is None else x.translate(table), StringType())
disasterDf = disasterDf.withColumn("text", remove_punctuation(disasterDf["text"]))
non_disasterDf = non_disasterDf.withColumn("text", remove_punctuation(non_disasterDf["text"]))

## Strip leading and trailing whitespace

In [6]:
# UDF that trims leading and trailing whitespace with str.strip().
# A null cell reaches the lambda as None, and None has no strip(), so nulls
# are returned unchanged instead of raising AttributeError.
strip_udf = udf(lambda x: None if x is None else x.strip(), StringType())
disasterDf = disasterDf.withColumn("text", strip_udf(disasterDf["text"]))
non_disasterDf = non_disasterDf.withColumn("text", strip_udf(non_disasterDf["text"]))

## Save JSON and CSV files

In [8]:
# save DataFrame as json and csv files ================
# The writer's default save mode raises an error when the target path already
# exists, so re-running the notebook would fail here. "overwrite" replaces the
# output of an earlier run instead.
disasterDf.write.mode("overwrite").json('data/disaster-new.json')
non_disasterDf.write.mode("overwrite").json('data/non_disaster-new.json')

disasterDf.write.mode("overwrite").csv('data/disaster-new.csv')
non_disasterDf.write.mode("overwrite").csv('data/non_disaster-new.csv')

                                                                                

In [9]:
# Stop the SparkSession that was created in the first cell.
spark.stop()

In [None]:
import string

# Display the characters in string.punctuation, i.e. the ones the
# punctuation-removal translation table deletes from the tweet text.
string.punctuation