-------------------------------------------------------------
Libs e Preparação do ambiente Spark
-------------------------------------------------------------



In [1]:
# Install the headless OpenJDK 8 package quietly (-qq); stdout is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [2]:
#Download do Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz

# Descompactando os arquivos
!tar xf spark-3.1.2-bin-hadoop2.7.tgz

In [3]:
# os gives access to the process environment.
import os

# Export both locations in one step:
#   JAVA_HOME  -> the OpenJDK 8 directory
#   SPARK_HOME -> the directory extracted from the Spark tarball
#                 (assumes the notebook's working directory is /content - confirm)
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop2.7",
    }
)

In [4]:
# Findspark
!pip install -q findspark

In [5]:
import findspark

# Initialise findspark (called with no arguments here; presumably it picks up
# the SPARK_HOME set earlier so that pyspark can be imported - confirm).
findspark.init()

In [6]:
# SparkSession is the entry point used for every DataFrame/SQL call below.
from pyspark.sql import SparkSession

# Configure the builder with the local master URL, then get the existing
# session or create a new one.
spark_builder = SparkSession.builder.master('local[*]')
spark = spark_builder.getOrCreate()

# Display the session object as the cell output.
spark

# **Utilizando JSON**
-------------------------------

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType
from datetime import date,datetime
from pyspark.sql import Row

In [10]:
# Read the sentiment JSON file with the multiLine reader option set to "true".
path = "/content/sentimento.json"
json_reader = spark.read.option("multiLine", "true")
sentimentoDF = json_reader.json(path)

# Preview the loaded rows (default show() settings).
sentimentoDF.show()

+--------------------+--------------------+
|             context|                 qas|
+--------------------+--------------------+
|spent the entire ...|[{[{55, my boss w...|
| oh! good idea ab...|[{[{5, good}], 25...|
|says good (or sho...|[{[{0, says good ...|
| i dont think you...|[{[{1, i dont thi...|
| haha better drun...|[{[{6, better}], ...|
|headache  wanna s...|[{[{0, headache}]...|
|had an awsome sal...|[{[{0, had an aws...|
| fine! going to d...|[{[{1, fine!}], a...|
| thank a yoou  ho...|[{[{1, thank}], a...|
|why don't adobe r...|[{[{0, why don't ...|
|prd take a long t...|[{[{0, prd take a...|
|_2008 well, havin...|[{[{1, 2008 well,...|
|    miss you my dear|[{[{1, miss you m...|
|have just bought ...|[{[{0, have just ...|
| ya mine too but ...|[{[{1, ya mine to...|
|today dan bought ...|[{[{107, , my tum...|
| oo noo thats not...|[{[{12, s not goo...|
|misses her phone....|[{[{20, having no...|
|so i have like no...|[{[{15, no more f...|
|i have perused th...|[{[{0, i h

In [15]:
# Print the schema of the DataFrame loaded from the JSON file.
sentimentoDF.printSchema()

root
 |-- context: string (nullable = true)
 |-- qas: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- answers: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- answer_start: long (nullable = true)
 |    |    |    |    |-- text: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- is_impossible: boolean (nullable = true)
 |    |    |-- question: string (nullable = true)



In [16]:
# Keep the context text plus three fields taken from the qas column.
sent2_columns = ["context", "qas.id", "qas.is_impossible", "qas.question"]
sent2DF = sentimentoDF.select(*sent2_columns)

# Print the schema of the narrowed DataFrame.
sent2DF.printSchema()

root
 |-- context: string (nullable = true)
 |-- id: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_impossible: array (nullable = true)
 |    |-- element: boolean (containsNull = true)
 |-- question: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [17]:
# Display the first 10 records without truncating long column values.
sent2DF.show(n=10, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+----------+
|context                                                                                                                             |id          |is_impossible|question  |
+------------------------------------------------------------------------------------------------------------------------------------+------------+-------------+----------+
|spent the entire morning in a meeting w/ a vendor, and my boss was not happy w/ them. lots of fun.  i had other plans for my morning|[a3d0a7d5ad]|[false]      |[neutral] |
| oh! good idea about putting them on ice cream                                                                                      |[251b6a6766]|[false]      |[positive]|
|says good (or should i say bad?) afternoon!  http://plurk.com/p/wxpdj                                                               |[

In [18]:
# Narrow down to the two columns that the SQL queries below use.
sent3DF = sentimentoDF.select(["context", "qas.question"])

# Preview the result.
sent3DF.show()

+--------------------+----------+
|             context|  question|
+--------------------+----------+
|spent the entire ...| [neutral]|
| oh! good idea ab...|[positive]|
|says good (or sho...| [neutral]|
| i dont think you...|[negative]|
| haha better drun...|[positive]|
|headache  wanna s...|[negative]|
|had an awsome sal...|[positive]|
| fine! going to d...|[positive]|
| thank a yoou  ho...|[positive]|
|why don't adobe r...| [neutral]|
|prd take a long t...| [neutral]|
|_2008 well, havin...| [neutral]|
|    miss you my dear|[negative]|
|have just bought ...|[positive]|
| ya mine too but ...| [neutral]|
|today dan bought ...|[negative]|
| oo noo thats not...|[negative]|
|misses her phone....|[negative]|
|so i have like no...|[negative]|
|i have perused th...| [neutral]|
+--------------------+----------+
only showing top 20 rows



In [21]:
# Register sent3DF as the temporary view "sentimentos" so it can be queried with SQL.
sent3DF.createOrReplaceTempView("sentimentos")

# Select the contexts that contain the substring 'hope'.
hope_query = "select context from sentimentos where context like '%hope%'"
hope_rows = spark.sql(hope_query)
hope_rows.show()

+--------------------+
|             context|
+--------------------+
|get into me not g...|
| sorry to hear ab...|
| aww im sorry im ...|
|  hope you have a...|
|  is there going ...|
| you can find mor...|
|is predicting a h...|
| soon i hope... r...|
| aww hope uve had...|
| good evening ter...|
|just woke up, hop...|
| well sure hope t...|
|  that's no bueno...|
| i hope you had f...|
| i hope it's some...|
| oh no! hope you ...|
|yeha i broke thei...|
|back from brunch....|
|not feeling too g...|
| i hope you feel ...|
+--------------------+
only showing top 20 rows



In [25]:
# Turn the question array into a comma-joined string column named tps.
tps_query = (
    "select context, question, concat_ws(',',question)"
    " as tps from sentimentos"
)
sent4DF = spark.sql(tps_query)

# Expose the result to SQL as the temporary view "tipo_sentimento".
sent4DF.createOrReplaceTempView("tipo_sentimento")

# Negative sentiment: rows whose joined label equals 'negative'.
negative_query = "select context, question from tipo_sentimento where tps = 'negative'"
spark.sql(negative_query).show()

+--------------------+----------+
|             context|  question|
+--------------------+----------+
| i dont think you...|[negative]|
|headache  wanna s...|[negative]|
|    miss you my dear|[negative]|
|today dan bought ...|[negative]|
| oo noo thats not...|[negative]|
|misses her phone....|[negative]|
|so i have like no...|[negative]|
|also i popped the...|[negative]|
|               uh oh|[negative]|
| what brody how d...|[negative]|
|i feel useless i ...|[negative]|
|kate is leaving m...|[negative]|
|i lost a follower...|[negative]|
| saying goodbye t...|[negative]|
|just got back in,...|[negative]|
|   my dog ran awayyy|[negative]|
|is missing someon...|[negative]|
| s'ok, trying to ...|[negative]|
|hot ****, i'm at ...|[negative]|
|just figured out ...|[negative]|
+--------------------+----------+
only showing top 20 rows



In [27]:
# Count the rows for each tps value.
count_query = "select tps, count(*) from tipo_sentimento group by tps"
spark.sql(count_query).show()

+--------+--------+
|     tps|count(1)|
+--------+--------+
|positive|    8582|
| neutral|   11117|
|negative|    7786|
+--------+--------+

