In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import concat_ws

spark = SparkSession.builder.appName('ProjectTweets').enableHiveSupport().getOrCreate()

In [2]:
from pyspark.sql.functions import regexp_replace, col, lower
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import nltk
from nltk.stem import WordNetLemmatizer
from pyspark.sql.functions import udf
from pymongo import MongoClient
from pyspark.sql.types import StringType
import nltk
#nltk.download('omw-1.4')
#nltk.download('wordnet')

# DATA PREPARATION

In [3]:
df = spark.read.csv('/user1/ProjectTweets.csv', header=True, inferSchema=True)

                                                                                

In [4]:
df.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- 1467810369: long (nullable = true)
 |-- Mon Apr 06 22:19:45 PDT 2009: string (nullable = true)
 |-- NO_QUERY: string (nullable = true)
 |-- _TheSpecialOne_: string (nullable = true)
 |-- @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D: string (nullable = true)



In [5]:
df.show(5, truncate=False)

+---+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|0  |1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
+---+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|1  |1467810672|Mon Apr 06 22:19:49 PDT 2009|NO_QUERY|scotthamilton  |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|2  |1467810917|Mon Apr 06 22:19:53 PDT 2009|NO_QUERY|mattycus       |@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                          |
|3  |1467811184|Mon Apr 06 22:19:57 PDT 2009|NO_QUERY|ElleCTF    

In [6]:
from pyspark.sql.functions import col

In [7]:
new_cols = ['ids', 'date', 'flag', 'user', 'text']

for i, column_name in enumerate(new_cols):
    df = df.withColumnRenamed(df.columns[i + 1], column_name)

In [8]:
from pyspark.sql.functions import to_date, date_format

In [9]:
spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY')

In [10]:
date_column = df.select('date')

In [11]:
df = df.withColumn('date', to_date(df['date'], 'EEE MMM dd HH:mm:ss zzz yyyy'))

In [12]:
df = df.withColumn('date', to_date(col('date'), 'dd/MM/yyyy'))

In [13]:
df.show()

+---+----------+----------+--------+---------------+--------------------+
|  0|       ids|      date|    flag|           user|                text|
+---+----------+----------+--------+---------------+--------------------+
|  1|1467810672|2009-04-07|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|2009-04-07|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|2009-04-07|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|2009-04-07|NO_QUERY|         Karoli|@nationwideclass ...|
|  5|1467811372|2009-04-07|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|  6|1467811592|2009-04-07|NO_QUERY|        mybirch|         Need a hug |
|  7|1467811594|2009-04-07|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  8|1467811795|2009-04-07|NO_QUERY|2Hood4Hollywood|@Tatiana_K nope t...|
|  9|1467812025|2009-04-07|NO_QUERY|        mimismo|@twittera que me ...|
| 10|1467812416|2009-04-07|NO_QUERY| erinx3leannexo|spring break in p...|
| 11|1467812579|2009-04-07|NO_QUERY|  

In [14]:
df.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- ids: long (nullable = true)
 |-- date: date (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



In [15]:
# Convert text data to lowercase and clean unnecessary characters
df = df.withColumn("text", lower(regexp_replace(col("text"), "[^a-zA-Z0-9\\s]", " ")))

In [16]:
# Remove special symbols, and links from text data
df = df.withColumn("text", regexp_replace(col("text"), r'[@#]\w+|https?://\S+|\W', " "))

In [17]:
df.show(1, truncate=False)

+---+----------+----------+--------+-------------+---------------------------------------------------------------------------------------------------------------+
|0  |ids       |date      |flag    |user         |text                                                                                                           |
+---+----------+----------+--------+-------------+---------------------------------------------------------------------------------------------------------------+
|1  |1467810672|2009-04-07|NO_QUERY|scotthamilton|is upset that he can t update his facebook by texting it    and might cry as a result  school today also  blah |
+---+----------+----------+--------+-------------+---------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [18]:
# Lemmatization using NLTK
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

lemmatize_udf = udf(lemmatize_text, StringType())
df = df.withColumn("text", lemmatize_udf("text"))

In [19]:
# Tokenization
tokenizer = Tokenizer(inputCol="text", outputCol="filtered_words")
df = tokenizer.transform(df)

In [20]:
df.show(1, truncate=False)

[Stage 5:>                                                          (0 + 1) / 1]

+---+----------+----------+--------+-------------+--------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+
|0  |ids       |date      |flag    |user         |text                                                                                                    |filtered_words                                                                                                                 |
+---+----------+----------+--------+-------------+--------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+
|1  |1467810672|2009-04-07|NO_QUERY|scotthamilton|is upset that he can t update his facebook by texting it and might cry a a result school today als

                                                                                

In [21]:
# Use StopWordsRemover on the "filtered_words" column in your example DataFrame
remover = StopWordsRemover(inputCol="filtered_words", outputCol="filtered_words_without_stopwords")
df = remover.transform(df)

# You can update the column name as per your needs
df = df.withColumnRenamed("filtered_words_without_stopwords", "filtered_words_final")

In [22]:
# Just pick the necessary columns
df = df.select('0', 'ids', 'date', 'flag', 'user', 'filtered_words_final')

# Rename the "0" column to "index"
df = df.withColumnRenamed("0", "tweet_index")

# Show the result
df.show(truncate=False)

[Stage 6:>                                                          (0 + 1) / 1]

+-----------+----------+----------+--------+---------------+---------------------------------------------------------------------------------+
|tweet_index|ids       |date      |flag    |user           |filtered_words_final                                                             |
+-----------+----------+----------+--------+---------------+---------------------------------------------------------------------------------+
|1          |1467810672|2009-04-07|NO_QUERY|scotthamilton  |[upset, update, facebook, texting, might, cry, result, school, today, also, blah]|
|2          |1467810917|2009-04-07|NO_QUERY|mattycus       |[kenichan, dived, many, time, ball, managed, save, 50, rest, go, bound]          |
|3          |1467811184|2009-04-07|NO_QUERY|ElleCTF        |[whole, body, feel, itchy, like, fire]                                           |
|4          |1467811193|2009-04-07|NO_QUERY|Karoli         |[nationwideclass, behaving, m, mad, see]                                         |

                                                                                

In [23]:
df.dropna().show()

[Stage 7:>                                                          (0 + 1) / 1]

+-----------+----------+----------+--------+---------------+--------------------+
|tweet_index|       ids|      date|    flag|           user|filtered_words_final|
+-----------+----------+----------+--------+---------------+--------------------+
|          1|1467810672|2009-04-07|NO_QUERY|  scotthamilton|[upset, update, f...|
|          2|1467810917|2009-04-07|NO_QUERY|       mattycus|[kenichan, dived,...|
|          3|1467811184|2009-04-07|NO_QUERY|        ElleCTF|[whole, body, fee...|
|          4|1467811193|2009-04-07|NO_QUERY|         Karoli|[nationwideclass,...|
|          5|1467811372|2009-04-07|NO_QUERY|       joy_wolf|[kwesidei, whole,...|
|          6|1467811592|2009-04-07|NO_QUERY|        mybirch|         [need, hug]|
|          7|1467811594|2009-04-07|NO_QUERY|           coZZ|[loltrish, hey, l...|
|          8|1467811795|2009-04-07|NO_QUERY|2Hood4Hollywood|[tatiana, k, nope...|
|          9|1467812025|2009-04-07|NO_QUERY|        mimismo|[twittera, que, m...|
|         10|146

                                                                                

In [24]:
# Count the total number of values in the dataframe
total_count = df.count()

# Show the total count
print("Total count of values in the dataframe", total_count)



Total count of values in the dataframe 1599999


                                                                                

# MySQL

In [25]:
import pymysql

# Connect to the database
connection = pymysql.connect(
    host="localhost",
    user="root",
    password="password",
    database="ProjectTweets",
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)

In [26]:
# # Create a cursor
# cursor = connection.cursor()

# # Create a table
# create_table_sql = """
# CREATE TABLE Tweets (
#     tweet_index INT AUTO_INCREMENT PRIMARY KEY,
#     ids BIGINT,
#     date DATE,
#     flag VARCHAR(55),
#     user VARCHAR(255),
#     filtered_words_final TEXT
# );
# """

In [27]:
# Create a table
#cursor.execute(create_table_sql)

# Save changes
#connection.commit()

In [28]:
df.printSchema()

root
 |-- tweet_index: integer (nullable = true)
 |-- ids: long (nullable = true)
 |-- date: date (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- filtered_words_final: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [29]:
# 'filtered_words_final' adındaki sütunu virgülle ayrılmış bir metin sütunu olarak birleştirin.
df = df.withColumn('concatenated_words', concat_ws(",", df['filtered_words_final']))
df.printSchema()

root
 |-- tweet_index: integer (nullable = true)
 |-- ids: long (nullable = true)
 |-- date: date (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- filtered_words_final: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- concatenated_words: string (nullable = false)



In [30]:
df = df.select('tweet_index', 'ids', 'date', 'flag', 'user', 'concatenated_words')
df.show(20, truncate=False)

[Stage 10:>                                                         (0 + 1) / 1]

+-----------+----------+----------+--------+---------------+---------------------------------------------------------------------+
|tweet_index|ids       |date      |flag    |user           |concatenated_words                                                   |
+-----------+----------+----------+--------+---------------+---------------------------------------------------------------------+
|1          |1467810672|2009-04-07|NO_QUERY|scotthamilton  |upset,update,facebook,texting,might,cry,result,school,today,also,blah|
|2          |1467810917|2009-04-07|NO_QUERY|mattycus       |kenichan,dived,many,time,ball,managed,save,50,rest,go,bound          |
|3          |1467811184|2009-04-07|NO_QUERY|ElleCTF        |whole,body,feel,itchy,like,fire                                      |
|4          |1467811193|2009-04-07|NO_QUERY|Karoli         |nationwideclass,behaving,m,mad,see                                   |
|5          |1467811372|2009-04-07|NO_QUERY|joy_wolf       |kwesidei,whole,crew    

                                                                                

In [31]:
df.printSchema()

root
 |-- tweet_index: integer (nullable = true)
 |-- ids: long (nullable = true)
 |-- date: date (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- concatenated_words: string (nullable = false)



In [32]:
mysql_url = "jdbc:mysql://localhost:3306/ProjectTweets"
mysql_properties = {
    "user": "root",
    "password": "password",
}


In [33]:
# df.write.jdbc(url=mysql_url, table="Tweets", mode="overwrite", properties=mysql_properties)

In [34]:
df_from_mysql = spark.read.jdbc(url=mysql_url, table="Tweets", properties=mysql_properties)
df_from_mysql.show()

[Stage 11:>                                                         (0 + 1) / 1]

+-----------+----------+----------+--------+---------------+--------------------+
|tweet_index|       ids|      date|    flag|           user|  concatenated_words|
+-----------+----------+----------+--------+---------------+--------------------+
|          1|1467810672|2009-04-07|NO_QUERY|  scotthamilton|upset,update,face...|
|          2|1467810917|2009-04-07|NO_QUERY|       mattycus|kenichan,dived,ma...|
|          3|1467811184|2009-04-07|NO_QUERY|        ElleCTF|whole,body,feel,i...|
|          4|1467811193|2009-04-07|NO_QUERY|         Karoli|nationwideclass,b...|
|          5|1467811372|2009-04-07|NO_QUERY|       joy_wolf| kwesidei,whole,crew|
|          6|1467811592|2009-04-07|NO_QUERY|        mybirch|            need,hug|
|          7|1467811594|2009-04-07|NO_QUERY|           coZZ|loltrish,hey,long...|
|          8|1467811795|2009-04-07|NO_QUERY|2Hood4Hollywood| tatiana,k,nope,didn|
|          9|1467812025|2009-04-07|NO_QUERY|        mimismo|  twittera,que,muera|
|         10|146

                                                                                

# Hive

In [35]:
df.createOrReplaceTempView("temp_table")

In [36]:
create_table_sql = """
CREATE TABLE ProjectTweets (
    tweet_index INT,
    ids BIGINT,
    date DATE,
    flag STRING,
    user STRING,
    filtered_words_final STRING
)
STORED AS PARQUET
"""
spark.sql(create_table_sql)

2023-11-05 01:13:56,415 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
2023-11-05 01:13:56,420 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
2023-11-05 01:14:07,924 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
2023-11-05 01:14:07,925 WARN metastore.ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore hduser@127.0.1.1
2023-11-05 01:14:07,961 WARN metastore.ObjectStore: Failed to get database default, returning NoSuchObjectException
2023-11-05 01:14:09,838 WARN session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
2023-11-05 01:14:09,914 WARN conf.HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
2023-11-05 01:14:09

DataFrame[]

In [37]:
hive_insert_data_sql = """
INSERT INTO ProjectTweets SELECT * FROM temp_table
"""

In [38]:
spark.sql(hive_insert_data_sql)

2023-11-05 01:16:19,343 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


DataFrame[]

In [41]:
# Veriyi sorgulayın
result = spark.sql("SELECT * FROM ProjectTweets")

# Sonuçları gösterin
result.show()

+-----------+----------+----------+--------+---------------+--------------------+
|tweet_index|       ids|      date|    flag|           user|filtered_words_final|
+-----------+----------+----------+--------+---------------+--------------------+
|          1|1467810672|2009-04-07|NO_QUERY|  scotthamilton|upset,update,face...|
|          2|1467810917|2009-04-07|NO_QUERY|       mattycus|kenichan,dived,ma...|
|          3|1467811184|2009-04-07|NO_QUERY|        ElleCTF|whole,body,feel,i...|
|          4|1467811193|2009-04-07|NO_QUERY|         Karoli|nationwideclass,b...|
|          5|1467811372|2009-04-07|NO_QUERY|       joy_wolf| kwesidei,whole,crew|
|          6|1467811592|2009-04-07|NO_QUERY|        mybirch|            need,hug|
|          7|1467811594|2009-04-07|NO_QUERY|           coZZ|loltrish,hey,long...|
|          8|1467811795|2009-04-07|NO_QUERY|2Hood4Hollywood| tatiana,k,nope,didn|
|          9|1467812025|2009-04-07|NO_QUERY|        mimismo|  twittera,que,muera|
|         10|146

In [39]:
££fd.amglamha
mmm

SyntaxError: invalid character '£' (U+00A3) (2277693821.py, line 1)

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objs as go

In [None]:
# Let's check flag column distribution
def create_bar_chart(df):
    flag_counts = df.groupBy("flag").count().orderBy("flag")
    x = flag_counts.select("flag").rdd.flatMap(lambda x: x).collect()
    y = flag_counts.select("count").rdd.flatMap(lambda x: x).collect()

    data = [go.Bar(x=x, y=y)]

    layout = go.Layout(title="Flag Distribution", xaxis=dict(title="Flag"), yaxis=dict(title="Count"))
    fig = go.Figure(data=data, layout=layout)

    return fig

# Dash uygulamasını oluşturun
app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(id='flag-chart'),
])

@app.callback(
    Output('flag-chart', 'figure'),
    [Input('flag-chart', 'relayoutData')]
)
def update_chart(relayoutData):
    return create_bar_chart(df)

if __name__ == '__main__':
    app.run_server(debug=True)