In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("myApp") \
    .getOrCreate()

In [2]:
# Load the ProjectTweets.csv into hadoop in the named folder 'user1'
df = spark.read.csv('/user1/ProjectTweets.csv', header=False, inferSchema=True)

                                                                                

In [3]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [4]:
df.show(5)

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [5]:
# Drop the _c1 column unused
df = df.drop('_c1')

In [6]:
# Named the columns of df
df = df.withColumnRenamed('_c0', 'id') \
       .withColumnRenamed('_c2', 'date') \
       .withColumnRenamed('_c3', 'flag') \
       .withColumnRenamed('_c4', 'user') \
       .withColumnRenamed('_c5', 'text')

In [7]:
# Display the structure of schema
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



In [8]:
# Display the first 5 rows of the data
df.show(5)

+---+--------------------+--------+---------------+--------------------+
| id|                date|    flag|           user|                text|
+---+--------------------+--------+---------------+--------------------+
|  0|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [63]:
df.write.format("mongodb") \
    .option("uri", "mongodb://127.0.0.1:27001/") \
    .option("database", "tweet_mongo") \
    .option("collection", "tweet_collection") \
    .mode("append").save()

                                                                                

In [72]:
# MySQL JDBC connection
jdbc_url = "jdbc:mysql://127.0.0.1:3306/tweet_mysql"

# MySQL user and pass
properties = {
    "user": "root",
    "password": "kalem",
    "driver": "com.mysql.cj.jdbc.Driver"
}

In [29]:
df.write.jdbc(url=jdbc_url, table="tweets", mode="overwrite", properties=properties)

                                                                                

In [64]:
# print of data shape
print('Shape of dataset:',(df.count(),len(df.columns)))

Shape of dataset: (1600000, 5)




In [65]:
# Display the summary statistics of the data
df.describe().show()



+-------+------------------+--------------------+--------+-------------------+--------------------+
|summary|                id|                date|    flag|               user|                text|
+-------+------------------+--------------------+--------+-------------------+--------------------+
|  count|           1600000|             1600000| 1600000|            1600000|             1600000|
|   mean|          799999.5|                null|    null|4.325887521835714E9|                null|
| stddev|461880.35968924535|                null|    null|5.16273321845489E10|                null|
|    min|                 0|Fri Apr 17 20:30:...|NO_QUERY|       000catnap000|                 ...|
|    max|           1599999|Wed May 27 07:27:...|NO_QUERY|         zzzzeus111|ï¿½ï¿½ï¿½ï¿½ï¿½ß§...|
+-------+------------------+--------------------+--------+-------------------+--------------------+



                                                                                

In [39]:
# YCSB test for MYSQL
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run jdbc -P /home/hduser/ycsb-0.17.0/workloads/workloada -p db.url=jdbc:mysql://localhost:3306/tweet_mysql -p db.user=root -p db.passwd=kalem -p db.driver=com.mysql.cj.jdbc.Driver

/usr/bin/java  -classpath /home/hduser/ycsb-0.17.0/conf:/home/hduser/ycsb-0.17.0/lib/HdrHistogram-2.1.4.jar:/home/hduser/ycsb-0.17.0/lib/core-0.17.0.jar:/home/hduser/ycsb-0.17.0/lib/htrace-core4-4.1.0-incubating.jar:/home/hduser/ycsb-0.17.0/lib/jackson-core-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/lib/jackson-mapper-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/conf:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-collections-3.2.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-lang-2.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-pool-1.5.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/geronimo-jms_1.1_spec-1.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/geronimo-jta_1.1_spec-1.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/jdbc-binding-0.17.0.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/mysql-connector-j-8.0.33.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/openjpa-jdbc-2.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/openjpa-kernel-2.1.1.jar:/home/hdus

In [42]:
# YCSB test for MONGO DB
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run mongodb -P /home/hduser/ycsb-0.17.0/workloads/workloada -p mongodb.url=mongodb://localhost:27017 -p mongodb.database=tweet_mongo 


/usr/bin/java  -classpath /home/hduser/ycsb-0.17.0/conf:/home/hduser/ycsb-0.17.0/lib/HdrHistogram-2.1.4.jar:/home/hduser/ycsb-0.17.0/lib/core-0.17.0.jar:/home/hduser/ycsb-0.17.0/lib/htrace-core4-4.1.0-incubating.jar:/home/hduser/ycsb-0.17.0/lib/jackson-core-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/lib/jackson-mapper-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/mongodb-binding/lib/logback-classic-1.1.2.jar:/home/hduser/ycsb-0.17.0/mongodb-binding/lib/logback-core-1.1.2.jar:/home/hduser/ycsb-0.17.0/mongodb-binding/lib/mongo-java-driver-3.8.0.jar:/home/hduser/ycsb-0.17.0/mongodb-binding/lib/mongodb-async-driver-2.0.1.jar:/home/hduser/ycsb-0.17.0/mongodb-binding/lib/mongodb-binding-0.17.0.jar:/home/hduser/ycsb-0.17.0/mongodb-binding/lib/slf4j-api-1.7.25.jar:/home/hduser/ycsb-0.17.0/mongodb-binding/lib/snappy-java-1.1.7.1.jar site.ycsb.Client -t -db site.ycsb.db.MongoDbClient -P /home/hduser/ycsb-0.17.0/workloads/workloada -p mongodb.url=mongodb://localhost:27017 -p mongodb.database=tweet_mongo
Com

Nothing updated for key user6184078644860972864
Nothing updated for key user2996251037375572516
Nothing updated for key user7460881851310751334
Nothing updated for key user4640687271668624146
Nothing updated for key user4152828024211893584
Nothing updated for key user1573987489603120213
Nothing updated for key user4626287080316522538
Nothing updated for key user176616418245585776
Nothing updated for key user2538312093647326066
Nothing updated for key user7345341813716561613
Nothing updated for key user6290028003198785340
Nothing updated for key user8502137115770724564
Nothing updated for key user5339761945212382530
Nothing updated for key user6400663543555865497
Nothing updated for key user4987430714527649991
Nothing updated for key user4278642031651057680
Nothing updated for key user3107446538244793121
Nothing updated for key user7123510772490260851
Nothing updated for key user6762367138279133398
Nothing updated for key user195921106834796948
Nothing updated for key user51182725492803

Nothing updated for key user2413276361938144301
Nothing updated for key user4153387984724034032
Nothing updated for key user6877907175873323119
Nothing updated for key user5817347222824138717
Nothing updated for key user1573987489603120213
Nothing updated for key user7934122592197538405
Nothing updated for key user6772299462829809208
Nothing updated for key user3121846729596894729
Nothing updated for key user1574547450115260661
Nothing updated for key user759249448388715426
[OVERALL], RunTime(ms), 718
[OVERALL], Throughput(ops/sec), 1392.757660167131
[TOTAL_GCS_PS_Scavenge], Count, 2
[TOTAL_GC_TIME_PS_Scavenge], Time(ms), 8
[TOTAL_GC_TIME_%_PS_Scavenge], Time(%), 1.1142061281337048
[TOTAL_GCS_PS_MarkSweep], Count, 0
[TOTAL_GC_TIME_PS_MarkSweep], Time(ms), 0
[TOTAL_GC_TIME_%_PS_MarkSweep], Time(%), 0.0
[TOTAL_GCs], Count, 2
[TOTAL_GC_TIME], Time(ms), 8
[TOTAL_GC_TIME_%], Time(%), 1.1142061281337048
[UPDATE-FAILED], Operations, 484
[UPDATE-FAILED], AverageLatency(us

In [9]:
import re

def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.lower()
    return cleaned_text


In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

clean_text_udf = udf(lambda text: clean_text(text) if text else '', StringType())

df = df.withColumn('cleaned_text', clean_text_udf(df['text']))

df.show(5)


[Stage 4:>                                                          (0 + 1) / 1]

+---+--------------------+--------+---------------+--------------------+--------------------+
| id|                date|    flag|           user|                text|        cleaned_text|
+---+--------------------+--------+---------------+--------------------+--------------------+
|  0|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|switchfoot httptw...|
|  1|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|is upset that he ...|
|  2|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|kenichan i dived ...|
|  3|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|my whole body fee...|
|  4|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|nationwideclass n...|
+---+--------------------+--------+---------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [11]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)



In [12]:
from pyspark.ml.feature import Word2Vec

# Önce DataFrame'inizi uygun bir formata getirin
from pyspark.sql.functions import split

# 'text' sütununu boşluklara göre ayırarak bir liste oluşturun
df = df.withColumn('words', split(df['cleaned_text'], ' '))

# Word2Vec modelini eğit
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol='words', outputCol='word_vectors')
model = word2Vec.fit(df)
result = model.transform(df)


                                                                                

In [13]:
df.show(5)

+---+--------------------+--------+---------------+--------------------+--------------------+--------------------+
| id|                date|    flag|           user|                text|        cleaned_text|               words|
+---+--------------------+--------+---------------+--------------------+--------------------+--------------------+
|  0|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|switchfoot httptw...|[switchfoot, http...|
|  1|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|is upset that he ...|[is, upset, that,...|
|  2|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|kenichan i dived ...|[kenichan, i, div...|
|  3|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|my whole body fee...|[my, whole, body,...|
|  4|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|nationwideclass n...|[nationwideclass,...|
+---+--------------------+--------+---------------+--------------------+--------

In [14]:
# Gerekli kütüphaneleri içe aktarın
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Veri setini hazırlayın
# Örneğin, 'result' DataFrame'inde 'word_vectors' kolonu vektörlerini içeriyor

# Veriyi eğitim ve test setlerine ayırın
train_data = result.limit(1000000)  # Örneğin, ilk 1 milyon veriyi eğitim için kullanalım
test_data = result.limit(200000).withColumnRenamed('word_vectors', 'features') 

In [None]:
# Eğitim verisini uygun formata dönüştürün
X_train = train_data.select('word_vectors').rdd.flatMap(lambda x: x).collect()
X_train = [x.toArray() for x in X_train]
X_train = np.array(X_train)

y_train = train_data.select('label').rdd.flatMap(lambda x: x).collect()
y_train = np.array(y_train)

[Stage 12:>                                                         (0 + 1) / 1]

In [None]:
# Test verisini uygun formata dönüştürün
X_test = test_data.select('features').rdd.flatMap(lambda x: x).collect()
X_test = [x.toArray() for x in X_test]
X_test = np.array(X_test)

In [None]:
# Modeli oluşturun
model = Sequential()
model.add(LSTM(100, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))

In [None]:

# Modeli derleyin
model.compile(loss='mean_squared_error', optimizer='adam')

# Modeli eğitin
model.fit(X_train, y_train, epochs=10, batch_size=64)

# Tahminler yapın
predictions = model.predict(X_test)


2023-10-26 20:25:07,739 ERROR util.Utils: Uncaught exception in thread stdout writer for python3
java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at java.io.OutputStream.write(OutputStream.java:75)
	at net.razorvine.pickle.Pickler.put_float(Pickler.java:715)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:294)
	at net.razorvine.pickle.Pickler.save(Pickler.java:193)
	at net.razorvine.pickle.Pickler.put_collection(Pickler.java:415)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:371)
	at net.razorvine.pickle.Pickler.save(Pickler.java:193)
	at org.apache.spark.sql.execution.python.EvaluatePython$RowPickler.pickle(EvaluatePython.scala:271)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:305)
	at net.razorvine