# PySpark - Project Tweets

In [1]:
# Load the libraries
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer



# Initialize Spark Session

In [2]:

spark = SparkSession.builder.appName('project_tweets').getOrCreate()

24/04/22 23:39:18 WARN Utils: Your hostname, BDS-2023 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/04/22 23:39:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/22 23:39:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read the Dataset

In [3]:
# Load the ProjectTweets into Hadoop in the named folder 'user1'

data = spark.read.csv('hdfs://localhost:9000/user1/ProjectTweets.csv', header=True, inferSchema =True)

                                                                                

In [4]:
# Display the structure of schema
data.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- 1467810369: long (nullable = true)
 |-- Mon Apr 06 22:19:45 PDT 2009: string (nullable = true)
 |-- NO_QUERY: string (nullable = true)
 |-- _TheSpecialOne_: string (nullable = true)
 |-- @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D: string (nullable = true)



In [5]:
first_row = data.limit(1)

# Duplicar a primeira linha - unindo-a com o DataFrame original
df = data.union(first_row)


# Mostrar as primeiras linhas para verificar se a duplicação funcionou
df.show(2)




+---+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|  0|1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
+---+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|  1|1467810672|        Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|                                                                                               is upset that he ...|
|  2|1467810917|        Mon Apr 06 22:19:...|NO_QUERY|       mattycus|                                                                                               @Kenichan I dived...|
+---+----------+----------------------------+--------+-----------

                                                                                

In [None]:
num_rows = df.count()
print(f"Num of rows: {num_rows}")

num_columns = len(df.columns)
print(f"Num of columns: {num_columns}")

[Stage 7:=====>                                                    (1 + 0) / 11]

# Rename Target Column

In [None]:
# Supondo que o seu DataFrame seja 'df'
# Renomeie as colunas
new_column_names = ["index", "ids", "date", "flag", "user", "text"]
df_renamed = df.toDF(*new_column_names)

# Mostrar o DataFrame com as colunas renomeadas
df_renamed.show()

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('ProjectTweets.csv')
df.head(5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,5,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [3]:
df.shape

(1599999, 6)

In [4]:
df.dtypes

0                                                                                                                       int64
1467810369                                                                                                              int64
Mon Apr 06 22:19:45 PDT 2009                                                                                           object
NO_QUERY                                                                                                               object
_TheSpecialOne_                                                                                                        object
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D    object
dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column                                                                                                               Non-Null Count    Dtype 
---  ------                                                                                                               --------------    ----- 
 0   0                                                                                                                    1599999 non-null  int64 
 1   1467810369                                                                                                           1599999 non-null  int64 
 2   Mon Apr 06 22:19:45 PDT 2009                                                                                         1599999 non-null  object
 3   NO_QUERY                                                                                                             1599999 non-null  object
 4   _

In [6]:
df.isnull().sum()

0                                                                                                                      0
1467810369                                                                                                             0
Mon Apr 06 22:19:45 PDT 2009                                                                                           0
NO_QUERY                                                                                                               0
_TheSpecialOne_                                                                                                        0
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D    0
dtype: int64

In [7]:
df.loc[-1] = df.columns
df.index = df.index + 1
df.sort_index(inplace=True)

In [8]:
df.head(5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
new_columns = ["index", "ids", "date", "flag", "user", "text"]
df.columns = new_columns
df1 = df

In [10]:
df1.head(3)

Unnamed: 0,index,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [11]:
print(df1.dtypes)

index    object
ids      object
date     object
flag     object
user     object
text     object
dtype: object


In [16]:
# Converter colunas para tipos mais específicos
df1['index'] = pd.to_numeric(df1['index'], errors='coerce')
df1['ids'] = pd.to_numeric(df1['ids'], errors='coerce')
df1['date'] = pd.to_datetime(df1['date'], errors='coerce', format='%a %b %d %H:%M:%S PDT %Y')  # Especificando o formato para evitar confusões de parse

# Garantir que todas as outras colunas são tratadas como strings
df1['flag'] = df1['flag'].astype(str)
df1['user'] = df1['user'].astype(str)
df1['text'] = df1['text'].astype(str)

# Tentar salvar novamente em formato Parquet
try:
    df1.to_parquet("/home/hduser/Desktop/New-tweets.parquet")
    print("Arquivo Parquet salvo com sucesso!")
except Exception as e:
    print("Falha ao salvar o arquivo Parquet:", e)


Arquivo Parquet salvo com sucesso!


In [17]:
print(df1.dtypes)

index             int64
ids               int64
date     datetime64[ns]
flag             object
user             object
text             object
dtype: object


## Reading Data from Hadoop

In [20]:
import pandas as pd
import pyarrow.parquet as pq

# Read the Parquet file using PyArrow
table = pq.read_table("hdfs://localhost:9000/user/hduser/New-tweets.parquet")
df = table.to_pandas()

# Convert TIMESTAMP(NANOS) to TIMESTAMP(MILLIS)
df['timestamp_column'] = df['timestamp_column'].astype('datetime64[ms]')

# Now df can be used in Spark
spark_df = spark.createDataFrame(df)
spark_df.show()



OSError: Unable to load libhdfs: ./libhdfs.so: cannot open shared object file: No such file or directory

In [None]:
tweets_spark.printSchema()

In [None]:
# Obtém o número de linhas
num_rows = tweets_spark.count()
print(f"Número de linhas: {num_rows}")

# Obtém o número de colunas
num_columns = len(tweets_spark.columns)
print(f"Número de colunas: {num_columns}")

In [None]:
tweets_spark.summary().show()