In [8]:
from os.path import abspath
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql.functions import col,isnan, when, count, udf
from pyspark.sql.types import StringType
import emoji
from preproc_functions import *
import numpy as np

In [9]:
# Connect to the standalone Spark master and enable Hive support so that the
# tables registered in the Hive metastore can be queried through spark.sql().
spark = (
    SparkSession.builder
    .master('spark://10.10.28.172:7077')
    .appName('pre_processing')
    .enableHiveSupport()
    .getOrCreate()
)

# Silence Spark's log output in the notebook (original note: this prevents the
# errors reported when hive-site.xml contains too many settings).
spark.sparkContext.setLogLevel('OFF')

# Ship the helper module with the job so the Python UDFs can use it on the
# executors. The former .config("spark.pyfiles", ...) entry was dropped:
# "spark.pyfiles" is not a Spark property (the documented key is
# "spark.submit.pyFiles"), so it had no effect; addPyFile() is what actually
# distributes the file.
spark.sparkContext.addPyFile("preproc_functions.py")

# Switch to the project database and list the databases as a sanity check.
spark.sql('use twitter_data')
spark.sql('show databases').show()

+------------+
|   namespace|
+------------+
|     default|
|twitter_data|
+------------+



## Pre processing
        1. Remove null rows (text and created_at and id) ✅
        2. Change emojis to words ✅
        3. Remove links ✅
        4. Remove unwanted characters
        5. Spell correction
        6. Make list of words
        7. Remove stop words (remove neutral words)


### 1. Select all rows from the raw_data table where text, created_at and id are present

In [10]:
# Load every column of raw_data, keeping only the rows in which text,
# created_at and id are all non-null.
query = '''
    SELECT *
    FROM raw_data
    WHERE text IS NOT NULL
    AND created_at IS NOT NULL
    AND id IS NOT NULL
'''
raw_data = spark.sql(query)
# raw_data.count()  # Uncomment to validate that the incomplete rows were removed

### 2. Change Emojis to Words

In [11]:
# TODO: Ola, we need your opinion on this.
# 2.1 UDF that changes emojis to words: emoji_meaning -> emoji meaning
@udf(returnType=StringType())
def udf_emoji_to_words(text):
    """Run `text` through emoji_to_word, then through snake_case_to_words."""
    return snake_case_to_words(emoji_to_word(text))

raw_data.take(5)[4]['text']  # Text of a tweet with emojis, before the conversion

"@TerryVernonsmi3 Trump's 2020 Campaign is Damaged Goods after cover-ups w 🇺🇦 Ukraine, and 🇷🇺 Russia, failed deals w 🇨🇳 China, 🇰🇷 N Korea and 🇮🇷 Iran, leaving KURDS to die, unhappy farmers, pissed off consumers, failed healthcare, extremely low approval ratings, his LIES and being labeled RACIST!"

In [12]:
# 2.2 Overwrite the `text` column with the output of the emoji-to-words UDF
raw_data = raw_data.withColumn('text', udf_emoji_to_words(raw_data['text']))
raw_data.take(5)[4]['text']  # Validate that the emojis have been changed to text

                                                                                

"@TerryVernonsmi3 Trump's 2020 Campaign is Damaged Goods after cover-ups w  Ukraine  Ukraine, and  Russia  Russia, failed deals w  China  China,  south korea  N Korea and  Iran  Iran, leaving KURDS to die, unhappy farmers, pissed off consumers, failed healthcare, extremely low approval ratings, his LIES and being labeled RACIST!"

### 3. Remove URLs

In [13]:
@udf(returnType=StringType())
def udf_remove_urls(text):
    """Delegate to the remove_urls helper and return its result as a string column."""
    return remove_urls(text)
# TODO: Print a text that contains URL(s)

In [14]:
# Overwrite the `text` column with the output of the URL-removal UDF
raw_data = raw_data.withColumn('text', udf_remove_urls(raw_data['text']))
# TODO: Show the same line afterwards to confirm the URL(s) are gone

### Select the interesting columns and create a new column where the text is tokenized and lowercased (the column selection is currently commented out)

In [8]:
# Tokenizer reads `text` and writes the lowercased words to a new `tokens` column.
tokenizer = Tokenizer(outputCol='tokens', inputCol='text')

# All columns are kept for now; narrowing to .select('id', 'text') is disabled.
df = tokenizer.transform(raw_data)
df.show(n=5)

[Stage 4:>                                                          (0 + 1) / 1]

+--------------------+---+------------------+----------+-------------------+--------------------+
|                text|geo|                id| author_id|         created_at|              tokens|
+--------------------+---+------------------+----------+-------------------+--------------------+
|                text|geo|              null|      null|               null|              [text]|
|EU hopes for posi...|   |572488029434200064|2745054115|2015-03-02 20:05:57|[eu, hopes, for, ...|
|Ukraine pilot nea...|   |572488005732200448|2214245953|2015-03-02 20:05:51|[ukraine, pilot, ...|
|EU Mediates Gas T...|   |572487964321820672|2396493517|2015-03-02 20:05:41|[eu, mediates, ga...|
|Instead of sancti...|   |572487956570759168|  31025396|2015-03-02 20:05:39|[instead, of, san...|
+--------------------+---+------------------+----------+-------------------+--------------------+
only showing top 5 rows



                                                                                

### Remove stop words from the tokens using pyspark.ml.feature.StopWordsRemover

In [9]:
# Read the tokenizer's output column and write the tokens that are not stop
# words to `filtered_text`; matching ignores case.
swr = (
    StopWordsRemover(caseSensitive=False)
    .setInputCol(tokenizer.getOutputCol())
    .setOutputCol('filtered_text')
)
df = swr.transform(df)
df.show(n=5)

[Stage 5:>                                                          (0 + 1) / 1]

+--------------------+---+------------------+----------+-------------------+--------------------+--------------------+
|                text|geo|                id| author_id|         created_at|              tokens|       filtered_text|
+--------------------+---+------------------+----------+-------------------+--------------------+--------------------+
|                text|geo|              null|      null|               null|              [text]|              [text]|
|EU hopes for posi...|   |572488029434200064|2745054115|2015-03-02 20:05:57|[eu, hopes, for, ...|[eu, hopes, posit...|
|Ukraine pilot nea...|   |572488005732200448|2214245953|2015-03-02 20:05:51|[ukraine, pilot, ...|[ukraine, pilot, ...|
|EU Mediates Gas T...|   |572487964321820672|2396493517|2015-03-02 20:05:41|[eu, mediates, ga...|[eu, mediates, ga...|
|Instead of sancti...|   |572487956570759168|  31025396|2015-03-02 20:05:39|[instead, of, san...|[instead, sanctio...|
+--------------------+---+------------------+---

                                                                                

# TODO: MORE PREPROCESSING

# TODO: PREPARE FOR SENTIMENT ANALYSIS -> SAVE AS HIVE TABLE

* CONVERT THIS NOTEBOOK TO A PYTHON FILE SO THAT WE CAN RUN IT ON THE CLUSTER