# Kütüphanelerin İmport Edilmesi

In [2]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\celal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\celal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


# Spark Ortamının Hazırlanması

In [3]:
spark = SparkSession.builder.appName("simulator").master("local").config("spark.driver.memory", "40g") \
  .config("spark.executor.memory", "50g").getOrCreate()

In [7]:
import json

with open('C:/Users/celal/Documents/Github/reddit_big_data_analysis_project/secret/apikey.json') as f:
    data = json.load(f)
    api_key = data['key']
    client_id = data['client_id']
    user_agent = data['user_agent']

with open('C:/Users/celal/Documents/Github/reddit_big_data_analysis_project/secret/mongo.json') as m:
    data = json.load(m)
    conn_string = data["connection_string"]

In [6]:
df = spark.read.csv("C:/Users/celal/Documents/Github/reddit_big_data_analysis_project/data/turkish-subreddits.csv", header=True)
df.show()

+-----------------+
|       Subreddits|
+-----------------+
|        liseliler|
|        Hacettepe|
|          Yatirim|
|           kripto|
|           Finans|
|         Videoyun|
|  androidoyunclub|
|       burdurland|
|            Kafes|
|sariyerbelediyesi|
|      TurkeyJerky|
+-----------------+



In [8]:
df_with_random = df.withColumn("random", rand())
random_row_1 = df_with_random.orderBy("random").limit(1)
random_row_1.head()["Subreddits"]

'androidoyunclub'

# Praw kütüphanesi kullanılarak reddit veri çekilmesi

In [9]:
import praw

In [10]:
# PRAW ile Reddit'e bağlan
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=api_key,
    user_agent=user_agent
)


In [11]:
import requests as req
import time 
import random

import pymongo

In [12]:
mongo_client = pymongo.MongoClient(conn_string)

mydb = mongo_client["bigData"]

# Verinin geçeceği transform işlemi

In [13]:
def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = text.lower()
    
    text = ' '.join(text.split())
    
    return text

In [14]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('turkish'))
    
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    filtered_text = ' '.join(filtered_tokens)
    
    return filtered_text

In [15]:
def Transformation(submission_data):
    # Eğer selftext değişkeni boşsa
    if not submission_data["selftext"]:
        # subreddit_name değişkenini selftext olarak kullan
        submission_data["selftext"] = submission_data["subreddit_name"]
    submission_data["selftext"] = remove_stopwords(preprocess_text(submission_data["selftext"]))
    submission_data["title"] = remove_stopwords(preprocess_text(submission_data["title"]))
    return submission_data

# Sentiment tahmini için Apiye istek atılması

In [16]:
def request_submission(submission_data):
    url = "http://127.0.0.1:8080/sentiment"  

    if not submission_data.get("selftext"):
        submission_data["selftext"] = submission_data["title"]

    body = {
        "text" : submission_data["selftext"]
    }

    
    # JSON verisini oluştur
    payload = json.dumps(body)
    
    # POST isteği gönder
    response = req.post(url, data=payload, headers={'Content-Type': 'application/json'})
    
    # Yanıtı kontrol et
    if response.status_code == 200:
        #print("Submission successfully posted!")
        response_data = response.json()  # Yanıtı JSON formatında al
        submission_data["sentiment"] = response_data.get("sentiment", -1)
    else:
        print(f"Failed to post submission. Status code: {response.status_code}, Response: {response.text}")

    
    return submission_data

# Main

In [18]:
while True:
    df_with_random = df.withColumn("random", rand())
    random_row_1 = df_with_random.orderBy("random").limit(1)
    subreddit_name = random_row_1.head()["Subreddits"]
    subreddit = reddit.subreddit(subreddit_name)
    
    random_hot_limit = random.randint(1, 100)

    random_hot_limit = random.randint(1, 100)
    hot_submissions = reddit.subreddit(subreddit_name).hot(limit=random_hot_limit)

    # Choose a random submission from the fetched results
    random_submission = random.choice(list(hot_submissions))

    # Extract relevant information from the submission
    submission_data = {
        "subreddit_name": subreddit_name,
        "title": random_submission.title,
        "permalink": random_submission.permalink,
        "selftext": random_submission.selftext  # Add selftext if available
    }
    print("-----önce------")
    print(submission_data)
    submission_data = request_submission(submission_data)
    print("-------------sonra--------------------")
    print(submission_data)
    submission_data = Transformation(submission_data)
    # veritabanına ekleme
    collection = mydb[submission_data["subreddit_name"]]
    x = collection.insert_one(submission_data)
    time.sleep(3)

-----önce------
{'subreddit_name': 'Yatirim', 'title': 'Hahahaha bu ne ya?', 'permalink': '/r/Yatirim/comments/1d2mjdz/hahahaha_bu_ne_ya/', 'selftext': ''}
-------------sonra--------------------
{'subreddit_name': 'Yatirim', 'title': 'Hahahaha bu ne ya?', 'permalink': '/r/Yatirim/comments/1d2mjdz/hahahaha_bu_ne_ya/', 'selftext': 'Hahahaha bu ne ya?', 'sentiment': 1}
-----önce------
{'subreddit_name': 'kripto', 'title': 'Binance referans kodu en fazla %20 mi indirim sağlıyor?', 'permalink': '/r/kripto/comments/17y9tic/binance_referans_kodu_en_fazla_20_mi_indirim/', 'selftext': '[https://www.karekod.org/blog/binance-referans-kodu-nedir/](https://www.karekod.org/blog/binance-referans-kodu-nedir/) burada Binance referans kodu en fazla %20 komisyon indirimi sağladığı yazmakta. Bazı sitelerde  %45 komisyon indirimi yazıyor. Buna BNB indirimi mi dahil ediyorlar?\n\n[View Poll](https://www.reddit.com/poll/17y9tic)'}
-------------sonra--------------------
{'subreddit_name': 'kripto', 'title': '

KeyboardInterrupt: 