# Labeling Data automatically

For this code along we will evaluate each word and calculate the text value! We'll use the various NLP tools we learned about.

In [1]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import string
import unicodedata
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
import re
from pyspark.conf import SparkConf
import pyspark.sql.functions as f

In [2]:
spark = SparkSession.builder\
    .master("local[*]")\
    .appName("nlp")\
    .config("spark.executor.memory", "32g")\
    .config("spark.driver.memory", "32g")\
    .config("spark.memory.offHeap.enabled",True) \
    .config("spark.memory.offHeap.size","16g")\
    .config("spark.debug.maxToStringFields","200")\
    .getOrCreate()

In [39]:
tweets=spark.read.csv("./data/7_july_1.csv", inferSchema=True, encoding = 'utf8', header=True).select('_c0','_c1')

In [40]:
tweets=tweets.selectExpr("_c0 as date", "_c1 as text")

In [41]:
tweets.show(5)

+-------------------+--------------------+
|               date|                text|
+-------------------+--------------------+
|2018-07-07 18:52:11|b'Intensity build...|
|2018-07-07 18:52:11|b'Never die attit...|
|2018-07-07 18:52:11|b'RT @FIFAWorldCu...|
|2018-07-07 18:52:11|b'RT @HNS_CFF: \x...|
|2018-07-07 18:52:10|b'RT @ThunderStur...|
+-------------------+--------------------+
only showing top 5 rows



In [42]:
# Remove https in the text
def remove_https(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)
    return text
udfhttps=udf(lambda text: remove_https(text), StringType())
tweets = tweets.withColumn("text", udfhttps(tweets["text"]))

In [43]:
def normalizeData(text):    
    text = unicodedata.normalize('NFKD', str(text))
    text = text.replace(r'\n', '')
    text = ' '.join(text.split())
    replace_punctuation = str.maketrans(string.punctuation,' '*len(string.punctuation))
    text = text.translate(replace_punctuation)
    text = text.encode('ASCII', 'ignore')
    text = text.decode('unicode_escape')
    text = ''.join([i for i in text if not i.isdigit()])
    return text

udfNormalizeData=udf(lambda text: normalizeData(text), StringType())

tweets = tweets.withColumn("text", udfNormalizeData(tweets["text"]))

In [44]:
LATIN_1_CHARS = (
    (' xe2 x80 x99', "'"),
    (' xc3 xa9', 'e'),
    (' xe2 x80 x90', '-'),
    (' xe2 x80 x91', '-'),
    (' xe2 x80 x92', '-'),
    (' xe2 x80 x93', '-'),
    (' xe2 x80 x94', '-'),
    (' xe2 x80 x94', '-'),
    (' xe2 x80 x98', "'"),
    (' xe2 x80 x9b', "'"),
    (' xe2 x80 x9c', '"'),
    (' xe2 x80 x9c', '"'),
    (' xe2 x80 x9d', '"'),
    (' xe2 x80 x9e', '"'),
    (' xe2 x80 x9f', '"'),
    #(' xe2 x80 xa6', '...'),
    (' xe2 x80 xa6', ''),
    (' xe2 x80 xb2', "'"),
    (' xe2 x80 xb3', "'"),
    (' xe2 x80 xb4', "'"),
    (' xe2 x80 xb5', "'"),
    (' xe2 x80 xb6', "'"),
    (' xe2 x80 xb7', "'"),
    (' xe2 x81 xba', "+"),
    (' xe2 x81 xbb', "-"),
    (' xe2 x81 xbc', "="),
    (' xe2 x81 xbd', "("),
    (' xe2 x81 xbe', ")"),
    (' xe2 x80 xa7', "."),
    ('.', " "),
)
def clean_latin1(data):
    for _hex, _char in LATIN_1_CHARS:
        data = data.replace(_hex, _char)
    return data

udfDecoding=udf(lambda text: clean_latin1(text), StringType())
tweets = tweets.withColumn("text", udfDecoding(tweets["text"]))

In [45]:
fields = tweets.schema.fields
stringFields = filter(lambda f: isinstance(f.dataType, StringType), fields)
nonStringFields = map(lambda f: col(f.name), filter(lambda f: not isinstance(f.dataType, StringType), fields))
stringFieldsTransformed = map(lambda f: lower(col(f.name)), stringFields) 
allFields = [*nonStringFields, *stringFieldsTransformed]

tweets = tweets.select(allFields)

In [46]:
tweets = tweets.select(col("date").alias("date"),col("lower(text)").alias("text"))

In [47]:
regexTokenizer = RegexTokenizer(minTokenLength= 3, inputCol="text", outputCol="words", pattern="\\W")
tweets = regexTokenizer.transform(tweets)

In [48]:
remover = StopWordsRemover(inputCol="words", outputCol="words_")
tweets_ = remover.transform(tweets)
tweets=tweets_.drop("words")

In [49]:
with open('./data/emoji.txt', 'r') as f:
    first_list=f.read().strip().splitlines()

second_list = []
for item in first_list:
    x = item.split('\\')
    second_list.append(x)

third_list = []
for item in second_list:
    new_list = []
    for e in item:
        if e not in (''):
            new_list.append(e)
    third_list.append(new_list)
    
fourth_list = [item for sublist in third_list for item in sublist]

emojies = []
for word in fourth_list:   #for each word in line.split()
    if word not in emojies:    #if a word isn't in line.split            
        emojies.append(word)
        
remover = StopWordsRemover(inputCol="words_", outputCol="words", stopWords=emojies)
tweets_ = remover.transform(tweets)

tweets=tweets_.drop("words_")
tweets=tweets.selectExpr("date as date", "text as text", "words as words")

In [55]:
tweets.select('words').show(5, False)

+-------------------------------------------------------------------------------------------------------------------+
|words                                                                                                              |
+-------------------------------------------------------------------------------------------------------------------+
|[intensity, building, sochi, ruscro]                                                                               |
|[never, die, attitude, like, places, wada, ama, pls, partwe, need, clean, honest, athletes, worldcup, ruscro, cuak]|
|[fifaworldcup, fantastic, goal, xbaa, strong, response, ruscro, worldcup]                                          |
|[hns, cff, xball, play, sochi, players, take, break, beproud, croatia, flamingpride, ruscro, family, worldcup]     |
|[thundersturm, caption, needed, ruscro, worldcup]                                                                  |
+-------------------------------------------------------

## Training the model to find the weight of each words
### Sumation of the words' weights and evaluate the text score 

** for doing this we use a concepts in the following article to calculate the value of each text.

## The End