In [16]:
from os.path import abspath
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql.functions import col,isnan, when, count, udf
from pyspark.sql.types import StringType, ArrayType
import emoji
from preproc_functions import *
import numpy as np

In [17]:
# Build (or reuse) a Hive-enabled session against the standalone Spark master.
# NOTE(review): "spark.pyfiles" does not look like a documented Spark property
# (the documented key is "spark.submit.pyFiles") — confirm it has any effect.
# The helper module is also shipped explicitly through addPyFile() below.
spark = (
    SparkSession.builder
    .master('spark://10.10.28.172:7077')
    .appName('pre_processing')
    .enableHiveSupport()
    .config("spark.pyfiles", "preproc_functions.py")
    .getOrCreate()
)

spark_context = spark.sparkContext

# Switch Spark logging off completely; the original author did this to hide
# the messages caused by having too many settings in hive-site.xml.
spark_context.setLogLevel('OFF')

# Register the local helper module with the SparkContext so tasks can use it.
spark_context.addPyFile("preproc_functions.py")

# Select the project database, then list the available databases.
spark.sql('use twitter_data')
spark.sql('show databases').show()

+------------+
|   namespace|
+------------+
|     default|
|twitter_data|
+------------+



## Pre-processing
        1. Remove null rows (text and created_at and id) ✅
        2. Change emojis to words ✅
        3. Remove links ✅
        4. Remove unwanted characters ✅
        5. Spell correction ✅
        6. Write to table

### 1. Select all rows from the raw_data table where text, created_at and id are present

In [18]:
# Load every column of the raw_data table, keeping only the rows in which
# text, created_at and id are all non-null.
query = '''
    SELECT *
    FROM raw_data
    WHERE text IS NOT NULL
    AND created_at IS NOT NULL
    AND id IS NOT NULL
'''
# The result is a Spark DataFrame that the following cells transform step by step.
raw_data = spark.sql(query)
# raw_data.count() # To validate that rows were removed

### 2. Change Emojis to Words

In [19]:
# 2.1 Expose emoji_to_words (from preproc_functions) as a Spark UDF that
#     returns a string. Per the original note it rewrites emojis as
#     emoji_meaning -> emoji meaning.
udf_emoji_to_words = udf(emoji_to_words, StringType())

# Show the text of the fifth of the first five rows: a tweet containing emojis.
rows_before_emoji_step = raw_data.limit(5).collect()
rows_before_emoji_step[4]['text']

"@TerryVernonsmi3 Trump's 2020 Campaign is Damaged Goods after cover-ups w ðŸ‡ºðŸ‡¦ Ukraine, and ðŸ‡·ðŸ‡º Russia, failed deals w ðŸ‡¨ðŸ‡³ China, ðŸ‡°ðŸ‡· N Korea and ðŸ‡®ðŸ‡· Iran, leaving KURDS to die, unhappy farmers, pissed off consumers, failed healthcare, extremely low approval ratings, his LIES and being labeled RACIST!"

In [20]:
# 2.2 Overwrite the text column with the output of the emoji-to-words UDF.
text_with_emoji_words = udf_emoji_to_words(col('text'))
raw_data = raw_data.withColumn('text', text_with_emoji_words)

# Validation: display the fifth of the first five rows again to check that the
# emojis were replaced by words.
# NOTE(review): limit() is used without an explicit ordering, so this is
# presumably — not provably — the same tweet as in the previous cell.
rows_after_emoji_step = raw_data.limit(5).collect()
rows_after_emoji_step[4]['text']

                                                                                

"@TerryVernonsmi3 Trump's 2020 Campaign is Damaged Goods after cover-ups w  Ukraine  Ukraine, and  Russia  Russia, failed deals w  China  China,  south korea  N Korea and  Iran  Iran, leaving KURDS to die, unhappy farmers, pissed off consumers, failed healthcare, extremely low approval ratings, his LIES and being labeled RACIST!"

### 3. Remove URLs

In [21]:
# Expose remove_urls (from preproc_functions) as a Spark UDF returning a string.
udf_remove_urls = udf(remove_urls, StringType())
# TODO: Print text with URL(s)

In [22]:
# Overwrite the text column with the output of the URL-removal UDF.
text_without_urls = udf_remove_urls(col('text'))
raw_data = raw_data.withColumn('text', text_without_urls)
# TODO: See the same line after transform

### 4. Remove unwanted characters

In [23]:
# FIXME(review): this UDF is named for special-character removal, but its body
# calls remove_urls — the same helper wrapped by udf_remove_urls — so this step
# only repeats the URL pass and no special characters are removed.
# Replace remove_urls with the special-character helper from preproc_functions
# (its name is not visible from this notebook).
udf_remove_special_chars = udf(lambda text: remove_urls(text), StringType())
# TODO: Print text with special character(s)

In [24]:
# Overwrite the text column with the output of udf_remove_special_chars.
text_without_special_chars = udf_remove_special_chars(col('text'))
raw_data = raw_data.withColumn('text', text_without_special_chars)
# TODO: print same line after transformation

### 5. Spell Correction

In [25]:
# Expose spell_correction (from preproc_functions) as a Spark UDF returning a string.
udf_spell_correction = udf(spell_correction, StringType())
# TODO: Print text with spelling error(s)

In [26]:
# Overwrite the text column with the output of the spell-correction UDF.
text_spell_corrected = udf_spell_correction(col('text'))
raw_data = raw_data.withColumn('text', text_spell_corrected)
# TODO: print same line after transformation

### 6. Write to table

In [27]:
# Both writes below are currently disabled, so running this cell persists nothing.
# Each one would save only the id and text columns of raw_data:
#   - the first appends to twitter_data.proccesed_data using the hive format;
#   - the second overwrites twitter_data.proccesed_data_ov.
# NOTE(review): the table names are spelled "proccesed" — confirm this matches
# the existing tables before enabling either line.
#raw_data.select('id','text').write.format('hive').mode("append").saveAsTable("twitter_data.proccesed_data")
#raw_data.select('id','text').write.mode("overwrite").saveAsTable("twitter_data.proccesed_data_ov")

[Stage 8:>                                                        (0 + 12) / 16]

KeyboardInterrupt: 

[Stage 8:>                                                        (0 + 12) / 16]