<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Data extraction solution for Mariana's research project

## Extract data with an Amazon EMR Apache Spark cluster

## Year 2023 data test

### Import the required libraries

In [1]:
from functools import reduce
from operator import or_

from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, col, array_contains

### Load data into an Amazon EMR Apache Spark DataFrame

#### Year 2021 sample

In [2]:
# Location of the 2021 tweet sample (JSON Lines)
data_source = '2021_29.json'

# Reuse the active SparkSession, or start one for this application
spark = (
    SparkSession.builder
    .appName('The Twitter Grab Corpus')
    .getOrCreate()
)

# Load the sample into a DataFrame. 'recursiveFileLookup' lets the same call
# also pick up files in nested folders when `data_source` is a directory.
tweets_spark_df = (
    spark.read
    .option('recursiveFileLookup', 'true')
    .json(data_source)
)

In [3]:
# Preview the DataFrame: `show()` prints the first 20 rows by default and
# truncates long cell values
tweets_spark_df.show()

+------------+-----------+--------------------+------+------------------+------------------------+--------------------+--------------+--------------+---------+------------+----+-------------------+-------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-----------+-------------+----------------+--------------------+-----------------------+-----------+-------------+---------+--------------------+--------------------+------------------------------------+-------------+---------+--------------------+
|contributors|coordinates|          created_at|delete|display_text_range|                entities|   extended_entities|extended_tweet|favorite_count|favorited|filter_level| geo|                 id|             id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str|in_reply_to_user_id|in_reply_to_user_id_str|is_quote_status|lang|place|possibly_sen

In [4]:
# Number of top-level columns in the schema Spark inferred from the JSON
len(tweets_spark_df.columns)

37

In [5]:
# Number of rows, i.e. JSON records read from the sample
# NOTE(review): the schema printed above includes a `delete` column, so some
# rows may be deletion notices rather than tweets — confirm before reporting
# this figure as a tweet count
tweets_spark_df.count()

3480

### Filter the dataframe

In [6]:
# Hashtags to look for in `entities.hashtags.text`.
# `array_contains` compares exact strings, so every casing variant that should
# match is listed explicitly (one hashtag per line).
hashtags = [
    'acolhimento', 'Acolhimento', 'ACOLHIMENTO',
    'aporofobia', 'Aporofobia', 'APOROFOBIA',
    'brasilvaivirarvenezuela', 'Brasilvaivirarvenezuela', 'BrasilVaiVirarVenezuela', 'BRASILVAIVIRARVENEZUELA',
    'crisehumanitária', 'Crisehumanitária', 'CriseHumanitária', 'CRISEHUMANITÁRIA',
    'crisevenezuelana', 'Crisevenezuelana', 'CriseVenezuelana', 'CRISEVENEZUELANA',
    'Discriminação', 'discriminação', 'DISCRIMINAÇÃO',
    'estereótipo', 'Estereótipo', 'ESTEREÓTIPO',
    'fronteira', 'Fronteira', 'FRONTEIRA',
    'migrantes', 'Migrantes', 'MIGRANTES',
    'preconceito', 'Preconceito', 'PRECONCEITO',
    'refugiados', 'Refugiados', 'REFUGIADOS',
    'roraizuela', 'Roraizuela', 'RORAIZUELA',
    'venebrasil', 'Venebrasil', 'VeneBrasil', 'VENEBRASIL',
    'venezuelanosnobrasil', 'Venezuelanosnobrasil', 'VenezuelanosNoBrasil', 'VENEZUELANOSNOBRASIL',
    'venezuraima', 'Venezuraima', 'VENEZURAIMA',
    'violência', 'Violência', 'VIOLÊNCIA',
    'xenofobia', 'Xenofobia', 'XENOFOBIA',
]

# Lower-case expressions searched for in the lower-cased tweet text.
# Matching is by substring, so e.g. 'venezuela' also matches every tweet that
# contains 'venezuelana' or 'venezuelanos'.
expressions = [
    'ameaça', 'aporofobia', 'carga', 'conflito', 'crise',
    'delinquência', 'desconfiança', 'desemprego', 'desigualdade', 'desordem',
    'direitos', 'discriminação', 'estigma', 'estrangeiro', 'exclusão',
    'fronteira', 'hostilidade', 'humanitário', 'identidade', 'inferior',
    'intolerância', 'invasão', 'marginal', 'perigo', 'preconceito',
    'problema', 'racismo', 'refugiada', 'refugiado', 'refugiados',
    'rejeição', 'roraizuela', 'venebrasil', 'venezuela', 'venezuelana',
    'venezuelanas', 'venezuelano', 'venezuelanos', 'venezuraima', 'violência',
    'xenofobia',
]

# One predicate per search term, generated from the lists above so that adding
# or removing a term automatically changes the filter (no hand-kept indices)
hashtag_matches = [
    array_contains('entities.hashtags.text', hashtag) for hashtag in hashtags
]
lowered_text = lower(col('text'))
expression_matches = [
    lowered_text.contains(expression) for expression in expressions
]

# A tweet is kept when ANY predicate holds: fold them with `|` from left to
# right, hashtags first, then expressions
filter_condition = reduce(or_, hashtag_matches + expression_matches)

# Create a filtered DataFrame
filtered_tweets_spark_df = tweets_spark_df.filter(filter_condition)

#### Year 2022 sample

In [7]:
# Location of the 2022 tweet sample (JSON Lines)
data_source = '2022_20220101000000.json'

# Reuse the active SparkSession, or start one for this application
spark = (
    SparkSession.builder
    .appName('The Twitter Grab Corpus')
    .getOrCreate()
)

# Load the sample into a DataFrame. 'recursiveFileLookup' lets the same call
# also pick up files in nested folders when `data_source` is a directory.
tweets_spark_df = (
    spark.read
    .option('recursiveFileLookup', 'true')
    .json(data_source)
)

In [8]:
# Preview the DataFrame: `show()` prints the first 20 rows by default and
# truncates long cell values
tweets_spark_df.show()

+------------+-----------+--------------------+------------------+------------------------+--------------------+--------------+--------------+---------+------------+----+-------------------+-------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-----------+-------------+----------------+--------------------+-----------------------+-----------+-------------+---------+--------------------+--------------------+-------------------------------------+-------------+---------+--------------------+
|contributors|coordinates|          created_at|display_text_range|                entities|   extended_entities|extended_tweet|favorite_count|favorited|filter_level| geo|                 id|             id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str|in_reply_to_user_id|in_reply_to_user_id_str|is_quote_status|lang|place|possibly_sensitive|quote_

In [9]:
# Number of top-level columns in the schema Spark inferred from the JSON
len(tweets_spark_df.columns)

36

In [10]:
# Number of rows, i.e. JSON records read from the sample
tweets_spark_df.count()

2958

### Filter the dataframe

In [11]:
# Hashtags to look for in `entities.hashtags.text`.
# `array_contains` compares exact strings, so every casing variant that should
# match is listed explicitly (one hashtag per line).
hashtags = [
    'acolhimento', 'Acolhimento', 'ACOLHIMENTO',
    'aporofobia', 'Aporofobia', 'APOROFOBIA',
    'brasilvaivirarvenezuela', 'Brasilvaivirarvenezuela', 'BrasilVaiVirarVenezuela', 'BRASILVAIVIRARVENEZUELA',
    'crisehumanitária', 'Crisehumanitária', 'CriseHumanitária', 'CRISEHUMANITÁRIA',
    'crisevenezuelana', 'Crisevenezuelana', 'CriseVenezuelana', 'CRISEVENEZUELANA',
    'Discriminação', 'discriminação', 'DISCRIMINAÇÃO',
    'estereótipo', 'Estereótipo', 'ESTEREÓTIPO',
    'fronteira', 'Fronteira', 'FRONTEIRA',
    'migrantes', 'Migrantes', 'MIGRANTES',
    'preconceito', 'Preconceito', 'PRECONCEITO',
    'refugiados', 'Refugiados', 'REFUGIADOS',
    'roraizuela', 'Roraizuela', 'RORAIZUELA',
    'venebrasil', 'Venebrasil', 'VeneBrasil', 'VENEBRASIL',
    'venezuelanosnobrasil', 'Venezuelanosnobrasil', 'VenezuelanosNoBrasil', 'VENEZUELANOSNOBRASIL',
    'venezuraima', 'Venezuraima', 'VENEZURAIMA',
    'violência', 'Violência', 'VIOLÊNCIA',
    'xenofobia', 'Xenofobia', 'XENOFOBIA',
]

# Lower-case expressions searched for in the lower-cased tweet text.
# Matching is by substring, so e.g. 'venezuela' also matches every tweet that
# contains 'venezuelana' or 'venezuelanos'.
expressions = [
    'ameaça', 'aporofobia', 'carga', 'conflito', 'crise',
    'delinquência', 'desconfiança', 'desemprego', 'desigualdade', 'desordem',
    'direitos', 'discriminação', 'estigma', 'estrangeiro', 'exclusão',
    'fronteira', 'hostilidade', 'humanitário', 'identidade', 'inferior',
    'intolerância', 'invasão', 'marginal', 'perigo', 'preconceito',
    'problema', 'racismo', 'refugiada', 'refugiado', 'refugiados',
    'rejeição', 'roraizuela', 'venebrasil', 'venezuela', 'venezuelana',
    'venezuelanas', 'venezuelano', 'venezuelanos', 'venezuraima', 'violência',
    'xenofobia',
]

# One predicate per search term, generated from the lists above so that adding
# or removing a term automatically changes the filter (no hand-kept indices)
hashtag_matches = [
    array_contains('entities.hashtags.text', hashtag) for hashtag in hashtags
]
lowered_text = lower(col('text'))
expression_matches = [
    lowered_text.contains(expression) for expression in expressions
]

# A tweet is kept when ANY predicate holds: fold them with `|` from left to
# right, hashtags first, then expressions
filter_condition = reduce(or_, hashtag_matches + expression_matches)

# Create a filtered DataFrame
filtered_tweets_spark_df = tweets_spark_df.filter(filter_condition)

#### Year 2023 sample

The data corresponding to year 2023 is malformed. While the data samples of years 2021 and 2022 have 37 and 36 columns respectively, the data of year 2023 has only 3 columns (`data`, `errors` and `includes`), which makes it unsuitable.

In [12]:
# Location of the 2023 tweet sample (JSON Lines)
data_source = '2023_0.json'

# Reuse the active SparkSession, or start one for this application
spark = (
    SparkSession.builder
    .appName('The Twitter Grab Corpus')
    .getOrCreate()
)

# Load the sample into a DataFrame. 'recursiveFileLookup' lets the same call
# also pick up files in nested folders when `data_source` is a directory.
tweets_spark_df = (
    spark.read
    .option('recursiveFileLookup', 'true')
    .json(data_source)
)

In [13]:
# Preview the DataFrame: `show()` prints the first 20 rows by default and
# truncates long cell values
tweets_spark_df.show()

+--------------------+------+--------------------+
|                data|errors|            includes|
+--------------------+------+--------------------+
|{{null, null}, 83...|  null|{null, null, null...|
|{{[7_160596835600...|  null|{[{null, 15000, 6...|
|{{null, null}, 15...|  null|{null, null, null...|
|{{[3_116075026331...|  null|{[{null, null, 23...|
|{{null, null}, 16...|  null|{null, null, null...|
|{{null, null}, 73...|  null|{null, null, null...|
|{{null, null}, 15...|  null|{null, null, null...|
|{{[16_16092951206...|  null|{[{null, null, 27...|
|{{null, null}, 61...|  null|{null, null, null...|
|{{null, null}, 15...|  null|{null, null, null...|
|{{null, null}, 88...|  null|{null, null, null...|
|{{[3_160918518494...|  null|{[{null, null, 11...|
|{{[3_111192783003...|  null|{[{null, null, 23...|
|{{null, null}, 15...|  null|{null, null, null...|
|{{[3_160933860191...|  null|{[{null, null, 28...|
|{{null, null}, 11...|  null|{null, null, null...|
|{{[16_16092254162...|  null|{[

In [14]:
# Number of top-level columns in the schema Spark inferred from the JSON
len(tweets_spark_df.columns)

3

In [15]:
# Number of rows, i.e. top-level JSON records read from the sample
tweets_spark_df.count()

2783

### Filter the dataframe

In [16]:
# Hashtags to look for in `entities.hashtags.text`.
# `array_contains` compares exact strings, so every casing variant that should
# match is listed explicitly (one hashtag per line).
hashtags = [
    'acolhimento', 'Acolhimento', 'ACOLHIMENTO',
    'aporofobia', 'Aporofobia', 'APOROFOBIA',
    'brasilvaivirarvenezuela', 'Brasilvaivirarvenezuela', 'BrasilVaiVirarVenezuela', 'BRASILVAIVIRARVENEZUELA',
    'crisehumanitária', 'Crisehumanitária', 'CriseHumanitária', 'CRISEHUMANITÁRIA',
    'crisevenezuelana', 'Crisevenezuelana', 'CriseVenezuelana', 'CRISEVENEZUELANA',
    'Discriminação', 'discriminação', 'DISCRIMINAÇÃO',
    'estereótipo', 'Estereótipo', 'ESTEREÓTIPO',
    'fronteira', 'Fronteira', 'FRONTEIRA',
    'migrantes', 'Migrantes', 'MIGRANTES',
    'preconceito', 'Preconceito', 'PRECONCEITO',
    'refugiados', 'Refugiados', 'REFUGIADOS',
    'roraizuela', 'Roraizuela', 'RORAIZUELA',
    'venebrasil', 'Venebrasil', 'VeneBrasil', 'VENEBRASIL',
    'venezuelanosnobrasil', 'Venezuelanosnobrasil', 'VenezuelanosNoBrasil', 'VENEZUELANOSNOBRASIL',
    'venezuraima', 'Venezuraima', 'VENEZURAIMA',
    'violência', 'Violência', 'VIOLÊNCIA',
    'xenofobia', 'Xenofobia', 'XENOFOBIA',
]

# Lower-case expressions searched for in the lower-cased tweet text.
# Matching is by substring, so e.g. 'venezuela' also matches every tweet that
# contains 'venezuelana' or 'venezuelanos'.
expressions = [
    'ameaça', 'aporofobia', 'carga', 'conflito', 'crise',
    'delinquência', 'desconfiança', 'desemprego', 'desigualdade', 'desordem',
    'direitos', 'discriminação', 'estigma', 'estrangeiro', 'exclusão',
    'fronteira', 'hostilidade', 'humanitário', 'identidade', 'inferior',
    'intolerância', 'invasão', 'marginal', 'perigo', 'preconceito',
    'problema', 'racismo', 'refugiada', 'refugiado', 'refugiados',
    'rejeição', 'roraizuela', 'venebrasil', 'venezuela', 'venezuelana',
    'venezuelanas', 'venezuelano', 'venezuelanos', 'venezuraima', 'violência',
    'xenofobia',
]

# One predicate per search term, generated from the lists above so that adding
# or removing a term automatically changes the filter (no hand-kept indices)
hashtag_matches = [
    array_contains('entities.hashtags.text', hashtag) for hashtag in hashtags
]
lowered_text = lower(col('text'))
expression_matches = [
    lowered_text.contains(expression) for expression in expressions
]

# A tweet is kept when ANY predicate holds: fold them with `|` from left to
# right, hashtags first, then expressions
filter_condition = reduce(or_, hashtag_matches + expression_matches)

# Create a filtered DataFrame
# NOTE: with the 2023 sample loaded in `tweets_spark_df`, this call raises an
# AnalysisException: that DataFrame only has the top-level columns `data`,
# `errors` and `includes`, so `entities.hashtags.text` cannot be resolved
# (and `text` is not a top-level column either). The failure is what shows
# that the filter used for the earlier samples does not apply to this schema.
filtered_tweets_spark_df = tweets_spark_df.filter(filter_condition)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `entities`.`hashtags`.`text` cannot be resolved. Did you mean one of the following? [`data`, `includes`, `errors`].;
'Filter ((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((array_contains('entities.hashtags.text, acolhimento) OR array_contains('entities.hashtags.text, Acolhimento)) OR array_contains('entities.hashtags.text, ACOLHIMENTO)) OR array_contains('entities.hashtags.text, aporofobia)) OR array_contains('entities.hashtags.text, Aporofobia)) OR array_contains('entities.hashtags.text, APOROFOBIA)) OR array_contains('entities.hashtags.text, brasilvaivirarvenezuela)) OR array_contains('entities.hashtags.text, Brasilvaivirarvenezuela)) OR array_contains('entities.hashtags.text, BrasilVaiVirarVenezuela)) OR array_contains('entities.hashtags.text, BRASILVAIVIRARVENEZUELA)) OR array_contains('entities.hashtags.text, crisehumanitária)) OR array_contains('entities.hashtags.text, Crisehumanitária)) OR array_contains('entities.hashtags.text, CriseHumanitária)) OR array_contains('entities.hashtags.text, CRISEHUMANITÁRIA)) OR array_contains('entities.hashtags.text, crisevenezuelana)) OR array_contains('entities.hashtags.text, Crisevenezuelana)) OR array_contains('entities.hashtags.text, CriseVenezuelana)) OR array_contains('entities.hashtags.text, CRISEVENEZUELANA)) OR array_contains('entities.hashtags.text, Discriminação)) OR array_contains('entities.hashtags.text, discriminação)) OR array_contains('entities.hashtags.text, DISCRIMINAÇÃO)) OR array_contains('entities.hashtags.text, estereótipo)) OR array_contains('entities.hashtags.text, Estereótipo)) OR array_contains('entities.hashtags.text, ESTEREÓTIPO)) OR array_contains('entities.hashtags.text, fronteira)) OR array_contains('entities.hashtags.text, Fronteira)) OR array_contains('entities.hashtags.text, FRONTEIRA)) OR array_contains('entities.hashtags.text, migrantes)) OR array_contains('entities.hashtags.text, Migrantes)) OR array_contains('entities.hashtags.text, MIGRANTES)) OR array_contains('entities.hashtags.text, preconceito)) OR array_contains('entities.hashtags.text, 
Preconceito)) OR array_contains('entities.hashtags.text, PRECONCEITO)) OR array_contains('entities.hashtags.text, refugiados)) OR array_contains('entities.hashtags.text, Refugiados)) OR array_contains('entities.hashtags.text, REFUGIADOS)) OR array_contains('entities.hashtags.text, roraizuela)) OR array_contains('entities.hashtags.text, Roraizuela)) OR array_contains('entities.hashtags.text, RORAIZUELA)) OR array_contains('entities.hashtags.text, venebrasil)) OR array_contains('entities.hashtags.text, Venebrasil)) OR array_contains('entities.hashtags.text, VeneBrasil)) OR array_contains('entities.hashtags.text, VENEBRASIL)) OR array_contains('entities.hashtags.text, venezuelanosnobrasil)) OR array_contains('entities.hashtags.text, Venezuelanosnobrasil)) OR array_contains('entities.hashtags.text, VenezuelanosNoBrasil)) OR array_contains('entities.hashtags.text, VENEZUELANOSNOBRASIL)) OR array_contains('entities.hashtags.text, venezuraima)) OR array_contains('entities.hashtags.text, Venezuraima)) OR array_contains('entities.hashtags.text, VENEZURAIMA)) OR array_contains('entities.hashtags.text, violência)) OR array_contains('entities.hashtags.text, Violência)) OR array_contains('entities.hashtags.text, VIOLÊNCIA)) OR array_contains('entities.hashtags.text, xenofobia)) OR array_contains('entities.hashtags.text, Xenofobia)) OR array_contains('entities.hashtags.text, XENOFOBIA)) OR Contains(lower('text), ameaça)) OR Contains(lower('text), aporofobia)) OR Contains(lower('text), carga)) OR Contains(lower('text), conflito)) OR Contains(lower('text), crise)) OR Contains(lower('text), delinquência)) OR Contains(lower('text), desconfiança)) OR Contains(lower('text), desemprego)) OR Contains(lower('text), desigualdade)) OR Contains(lower('text), desordem)) OR Contains(lower('text), direitos)) OR Contains(lower('text), discriminação)) OR Contains(lower('text), estigma)) OR Contains(lower('text), estrangeiro)) OR Contains(lower('text), exclusão)) OR Contains(lower('text), 
fronteira)) OR Contains(lower('text), hostilidade)) OR Contains(lower('text), humanitário)) OR Contains(lower('text), identidade)) OR Contains(lower('text), inferior)) OR Contains(lower('text), intolerância)) OR Contains(lower('text), invasão)) OR Contains(lower('text), marginal)) OR Contains(lower('text), perigo)) OR Contains(lower('text), preconceito)) OR Contains(lower('text), problema)) OR Contains(lower('text), racismo)) OR Contains(lower('text), refugiada)) OR Contains(lower('text), refugiado)) OR Contains(lower('text), refugiados)) OR Contains(lower('text), rejeição)) OR Contains(lower('text), roraizuela)) OR Contains(lower('text), venebrasil)) OR Contains(lower('text), venezuela)) OR Contains(lower('text), venezuelana)) OR Contains(lower('text), venezuelanas)) OR Contains(lower('text), venezuelano)) OR Contains(lower('text), venezuelanos)) OR Contains(lower('text), venezuraima)) OR Contains(lower('text), violência)) OR Contains(lower('text), xenofobia))
+- Relation [data#734,errors#735,includes#736] json
